In [55]:
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from collections import defaultdict
from gensim import corpora, models
from nltk.corpus import stopwords
from pprint import pprint


In [56]:
data = pd.read_csv("data_cleaned/combined.csv")
data.head()

Unnamed: 0,id,title,format,affiliation 1,affiliation 2,affiliation 3,affiliation 4,affiliation 5,affiliation 6,affiliation 7,...,Abstract,lower_abstract,affiliation,mode of presentation,affiliation 8,affiliation 9,affiliation 10,tags (primary),tags (secondary),affiliation 11
0,2013-1,Increasing the Effectiveness of Your Scholarly...,preconference workshop,University of Utah,Tecker International,,,,,,...,This workshop is designed for those that are b...,this workshop is designed for those that are b...,,,,,,,,
1,2013-2,"Planning, Assessing, and Communicating Library...",preconference workshop,University of Illinois at Urbana-Champaign,Pierce College,,,,,,...,Libraries in higher education are increasingly...,libraries in higher education are increasingly...,,,,,,,,
2,2013-3,"Plugged into User Behavior: Low-Budget, High-I...",preconference workshop,University of Texas Arlington,,,,,,,...,How do students really use library subject gui...,how do students really use library subject gui...,,,,,,,,
3,2013-4,Rediscover the Joy of Working Together!: Posit...,preconference workshop,Kansas State University,Rockhurst University,University of Kansas,University of Iowa,,,,...,Join this collaborative session to enhance you...,join this collaborative session to enhance you...,,,,,,,,
4,2013-5,Speculation to Litigation—Copyright and Climat...,preconference workshop,University of Louisville,Columbia University,Brigham Young University,Purdue University,,,,...,Changes in digital technologies have produced ...,changes in digital technologies have produced ...,,,,,,,,


In [58]:
set(data["year"].tolist())

{2007, 2009, 2011, 2013, 2015, 2017, 2019, 2021}

In [62]:
documents = data["lower_abstract"].dropna().tolist()

In [77]:
stoplist = set(stopwords.words("english"))
stoplist.update(["library", "libraries"])
print(stoplist)

{'having', 'whom', 'hers', 'above', 'doesn', "don't", 'is', 'being', 'herself', 'no', "aren't", 'was', "it's", 't', 'by', 'needn', 'those', 'mustn', 'theirs', 'libraries', 'against', 'after', 'we', 'i', 'be', 'should', 'ain', 'few', 'because', 'more', 'here', 're', 'm', 's', "you've", 'wouldn', 'wasn', 'd', 'as', 'mightn', "should've", "couldn't", "haven't", 'haven', 'what', 'very', 'an', "that'll", 'own', 'he', "you're", "wouldn't", 'has', 'they', 'of', 'in', 'didn', 'himself', 'further', "doesn't", 'were', 'these', 'from', 'them', 'between', 'her', 'ours', 'with', 'over', 'such', 'not', 'don', 'a', 'same', 'o', 'hadn', "you'll", 'up', 'all', 'themselves', 'any', 'did', 'y', 'me', 'each', 'won', 'its', 'other', 'during', 'this', 'only', 'does', "you'd", 'most', 'll', "weren't", 'which', 'aren', 'off', "needn't", 'had', 'will', 'out', 'you', 'weren', 'who', 'how', 'why', 'at', 'some', 'their', 'she', 'have', 'when', "wasn't", 'yourself', "shan't", 'do', "shouldn't", 'isn', "mightn't", 

In [78]:
texts = [[word for word in document.lower().split() if word not in stoplist]
            for document in documents]

In [79]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [80]:
texts = [[token for token in text if frequency[token] > 1]
            for text in texts]

In [81]:
pprint(texts[:1])

[['workshop',
  'designed',
  'building',
  'programs',
  'institutions',
  'need',
  'strengthen',
  'revitalize',
  'local',
  'efforts.',
  'emphasis',
  'placed',
  'think',
  'strategically',
  'local',
  'programs',
  'develop',
  'skills',
  'influential',
  'without',
  'necessarily',
  'positional',
  'authority.',
  'workshop',
  'encourage',
  'participant',
  'interaction',
  'incorporating',
  'hands-on',
  'activities.']]


In [82]:
dictionary = corpora.Dictionary(texts)
dictionary.save('acrl.dict')

In [83]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('acrl.mm', corpus)

In [84]:
hdp = models.HdpModel(corpus, id2word=dictionary)

In [85]:
hdp.print_topics()

[(0,
  '0.008*librarians + 0.008*academic + 0.008*information + 0.007*students + 0.007*research + 0.005*learning + 0.005*new + 0.005*learn + 0.004*student + 0.004*literacy'),
 (1,
  '0.003*research + 0.003*librarians + 0.003*students + 0.002*information + 0.002*services + 0.002*faculty + 0.002*academic + 0.002*student + 0.001*learn + 0.001*online'),
 (2,
  '0.003*information + 0.003*learning + 0.002*literacy + 0.002*students + 0.002*learn + 0.002*librarians + 0.002*university + 0.002*student + 0.002*research + 0.002*academic'),
 (3,
  '0.002*librarians + 0.002*learning + 0.002*academic + 0.002*research + 0.002*information + 0.002*learn + 0.002*data + 0.002*student + 0.002*session + 0.001*open'),
 (4,
  '0.003*information + 0.002*research + 0.002*librarians + 0.002*literacy + 0.002*new + 0.002*students + 0.002*academic + 0.002*learn + 0.001*faculty + 0.001*use'),
 (5,
  '0.002*students + 0.002*librarians + 0.002*academic + 0.002*research + 0.001*information + 0.001*college + 0.001*liter

In [86]:
pyLDAvis.enable_notebook()

In [87]:
gensimvis.prepare(hdp, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [88]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=30)

In [89]:
lda.print_topics()

[(25,
  '0.009*"session" + 0.009*"beginners" + 0.006*"academic" + 0.006*"rights" + 0.006*"passing" + 0.006*"including" + 0.006*"learn" + 0.006*"work" + 0.005*"librarians" + 0.005*"collections"'),
 (18,
  '0.020*"learning" + 0.013*"librarians" + 0.012*"data" + 0.010*"research" + 0.009*"information" + 0.009*"learn" + 0.007*"literacy" + 0.007*"academic" + 0.007*"students" + 0.006*"campus"'),
 (8,
  '0.016*"copyright" + 0.013*"digital" + 0.009*"escape" + 0.007*"projects" + 0.007*"librarians" + 0.006*"information" + 0.006*"fair" + 0.006*"diversity," + 0.006*"work" + 0.006*"learning"'),
 (28,
  '0.057*"abstract" + 0.012*"research" + 0.010*"new" + 0.009*"academic" + 0.007*"university" + 0.007*"information" + 0.007*"students" + 0.005*"digital" + 0.005*"program" + 0.005*"services"'),
 (7,
  '0.020*"open" + 0.013*"new" + 0.009*"access" + 0.008*"librarians" + 0.007*"scholarly" + 0.007*"research" + 0.006*"work" + 0.006*"academic" + 0.006*"share" + 0.005*"learn"'),
 (13,
  '0.116*"abstract" + 0.008

In [91]:
gensimvis.prepare(lda, corpus, dictionary)

  default_term_info = default_term_info.sort_values(
