In [27]:
import pandas as pd
from gensim import corpora, models

from collections import defaultdict
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
data = pd.read_csv("data_cleaned/combined.csv")
data.head()

Unnamed: 0,id,title,format,affiliation 1,affiliation 2,affiliation 3,affiliation 4,affiliation 5,affiliation 6,affiliation 7,affiliation 8,affiliation 9,affiliation 10,tags,year,ID,Abstract,lower_abstract
0,2017-1,Assessing and Communicating Library Contributi...,preconference,University of Illinois at Urbana-Champaign,Loyola Marymount University,Anne Arundel Community College,Dominican University,,,,,,,Assessment,2017,2017-1,Higher education institutions of all types are...,higher education institutions of all types are...
1,2017-2,COUNTER Bootcamp: A Workshop about COUNTER Rep...,preconference,Virtual Library of Virginia,University of Richmond,EBSCO Information Services,Mongomery College,,,,,,,Assessment,2017,2017-2,Take a deep dive into the process and workflow...,take a deep dive into the process and workflow...
2,2017-3,Information Literacy Instruction Transformed: ...,preconference,Towson University,University of North Carolina Greensboro,,,,,,,,,Teaching and Learning,2017,2017-3,Universal Design for Learning (UDL) offers ins...,universal design for learning (udl) offers ins...
3,2017-4,Law School for Librarians: A Tangled Web of Co...,preconference,Purdue University,University of Louisville,,,,,,,,,Scholarly Communication,2017,2017-4,Librarians encounter legal and policy issues n...,librarians encounter legal and policy issues n...
4,2017-5,"Make It, Map It, Take It: Create Your Own Digi...",preconference,California State University Dominguez Hills,University of California Irvine,,,,,,,,,Teaching and Learning,2017,2017-5,"Go beyond Camtasia, bring your laptop, and lea...","go beyond camtasia, bring your laptop, and lea..."


In [3]:
documents = data["lower_abstract"].tolist()

In [5]:
stoplist = set('for a of the and to in'.split())

In [6]:
texts = [[word for word in document.lower().split() if word not in stoplist]
            for document in documents]

In [9]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [10]:
texts = [[token for token in text if frequency[token] > 1]
            for text in texts]

In [13]:
pprint(texts[:1])

[['higher',
  'education',
  'institutions',
  'all',
  'types',
  'are',
  'facing',
  'intensified',
  'attention',
  'assessment',
  'accountability',
  'issues.',
  'academic',
  'libraries',
  'are',
  'increasingly',
  'connecting',
  'with',
  'colleagues',
  'campus',
  'stakeholders',
  'design',
  'implement',
  'assessment',
  'that',
  'documents',
  'their',
  'contributions',
  'institutional',
  'priorities.',
  'this',
  'day-long',
  'workshop',
  'on',
  'strategic',
  'sustainable',
  'assessment,',
  'participants',
  'will',
  'identify',
  'institutional',
  'priorities',
  'campus',
  'partners,',
  'design',
  'an',
  'assessment',
  'project',
  'grounded',
  'action',
  'research,',
  'prepare',
  'plan',
  'communicating',
  'project',
  'results.',
  'this',
  'event',
  'is',
  'based',
  'on',
  'highly',
  'successful',
  'acrl',
  'assessment',
  'action',
  'program',
  'curriculum.']]


In [15]:
dictionary = corpora.Dictionary(texts)
dictionary.save('acrl.dict')

In [17]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('acrl.mm', corpus)

In [20]:
hdp = models.HdpModel(corpus, id2word=dictionary)

In [24]:
hdp.print_topics()

[(0,
  '0.004*necessary + 0.003*transition + 0.003*on + 0.003*students. + 0.003*states + 0.003*does + 0.003*topics, + 0.003*brief + 0.003*this + 0.003*themes'),
 (1,
  '0.004*this + 0.004*long-term + 0.004*insight + 0.003*inventory + 0.003*decision + 0.003*pedagogy. + 0.003*intensified + 0.003*assessment + 0.003*uniquely + 0.003*composting'),
 (2,
  '0.004*partners + 0.004*taken + 0.003*there + 0.003*problem-based + 0.003*archives + 0.003*supporting + 0.003*writing, + 0.003*innovations + 0.003*pilot + 0.003*liaisons'),
 (3,
  '0.004*valuable + 0.004*changing + 0.004*threshold + 0.003*site + 0.003*recently + 0.003*women + 0.003*uncover + 0.003*emphasizes + 0.003*surprising + 0.003*methods,'),
 (4,
  '0.006*kingdom. + 0.004*paid + 0.004*reduce + 0.004*administrative + 0.003*structural + 0.003*minutes + 0.003*built-in + 0.003*technologies + 0.003*environmental + 0.003*as'),
 (5,
  '0.004*engage + 0.004*score + 0.004*publisher, + 0.003*shift + 0.003*power + 0.003*percent + 0.003*oer + 0.00

In [28]:
pyLDAvis.enable_notebook()

In [29]:
gensimvis.prepare(hdp, corpus, dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [30]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=30)

In [31]:
lda.print_topics()

[(27,
  '0.018*"will" + 0.018*"we" + 0.016*"from" + 0.014*"sabotage" + 0.014*"behaviors" + 0.014*"organizational" + 0.014*"field" + 0.012*"this" + 0.012*"new" + 0.011*"by"'),
 (16,
  '0.015*"on" + 0.014*"is" + 0.014*"will" + 0.013*"library" + 0.012*"this" + 0.011*"with" + 0.011*"libraries" + 0.011*"open" + 0.009*"learning" + 0.009*"that"'),
 (21,
  '0.040*"no" + 0.040*"abstract" + 0.018*"how" + 0.015*"faculty" + 0.014*"are" + 0.012*"literacy" + 0.012*"learn" + 0.011*"program" + 0.011*"information" + 0.010*"this"'),
 (1,
  '0.022*"as" + 0.017*"will" + 0.017*"libraries" + 0.016*"diversity" + 0.016*"are" + 0.015*"this" + 0.014*"with" + 0.014*"how" + 0.010*"an" + 0.010*"about"'),
 (7,
  '0.023*"their" + 0.022*"will" + 0.015*"participants" + 0.012*"from" + 0.012*"how" + 0.012*"information" + 0.011*"environments." + 0.011*"learning" + 0.010*"design" + 0.010*"on"'),
 (26,
  '0.069*"abstract" + 0.069*"no" + 0.015*"information" + 0.014*"as" + 0.012*"literacy" + 0.011*"an" + 0.010*"libraries" + 

In [32]:
gensimvis.prepare(lda, corpus, dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
