# LIS 875 (Week 6): Topic Modeling


## 1 Set Up Environment in Google Colab

Run the following cells to install/upgrade the required packages and check if the installed versions meet the requirements.

In [None]:
# make sure the required python packages are installed

# install nltk (we'll use 3.6.7 in Fall 2022)
!pip install nltk==3.6.7 --upgrade

# install spacy (we'll use 3.2.1 in Fall 2022)
!pip install spacy==3.2.1 --upgrade

# install gensim (we'll use 4.1.2 in Fall 2022)
!pip install gensim==4.1.2 --upgrade

# download the spacy en_core_web_sm model (3.2.0 version)
!python -m spacy download en_core_web_sm-3.2.0 --direct

In [None]:
# Set up the work folder in Google Drive.
# Follow the prompt to authenticate your Google credentials.
import os
from google.colab import drive

# Mount Google Drive at /content/drive/ so the files stored there can be read and written.
drive.mount('/content/drive/')

# Change the workdir according to your work folder in your Google Drive.
workdir = '/content/drive/MyDrive/LIS875 Fall22/week05'

Mounted at /content/drive/


## 2 Prepare Data

In [None]:
# Load a spaCy NLP pipeline with the dependency parser and the named entity recognizer disabled --
# we won't use them, and they would slow down the processing of the texts.
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
#check working dir
workdir

'/content/drive/MyDrive/LIS875 Fall22/week05'

In [None]:
# Let's load the dataset 'CHI.tsv' located in the work folder.
# This dataset includes research articles published at the CHI conference (a top conference for human-computer interaction).
import pandas

# The file is tab-separated and its first row holds the column names.
data = pandas.read_csv(os.path.join(workdir, 'CHI.tsv'), sep='\t', header=0)

In [None]:
data.head()

Unnamed: 0,YEAR,TITLE,ABSTRACT
0,2000,Intelligent gaze-added interfaces,"We discuss a novel type of interface, the inte..."
1,2000,Evaluation of eye gaze interaction,Eye gaze interaction can provide a convenient ...
2,2000,Enriching buyers' experiences: the SmartClient...,"In electronic commerce, a satisfying buyer exp..."
3,2000,Quality is in the eye of the beholder: meeting...,Growing usage and diversity of applications on...
4,2000,What makes Internet users visit cyber stores a...,Retaining customer loyalty is crucial in elect...


In [None]:
# Let's concatenate the title and abstract together as the text information of an article.
# We will use the title and abstract information to train LDA topic models.
# Missing titles/abstracts are replaced with empty strings first: string + NaN
# yields NaN, and a NaN TEXT value is not a string, so it could not be passed
# to the spaCy pipeline in the preprocessing step.
data['TEXT'] = data['TITLE'].fillna('') + '. ' + data['ABSTRACT'].fillna('')
data

Unnamed: 0,YEAR,TITLE,ABSTRACT,TEXT
0,2000,Intelligent gaze-added interfaces,"We discuss a novel type of interface, the inte...",Intelligent gaze-added interfaces. We discuss ...
1,2000,Evaluation of eye gaze interaction,Eye gaze interaction can provide a convenient ...,Evaluation of eye gaze interaction. Eye gaze i...
2,2000,Enriching buyers' experiences: the SmartClient...,"In electronic commerce, a satisfying buyer exp...",Enriching buyers' experiences: the SmartClient...
3,2000,Quality is in the eye of the beholder: meeting...,Growing usage and diversity of applications on...,Quality is in the eye of the beholder: meeting...
4,2000,What makes Internet users visit cyber stores a...,Retaining customer loyalty is crucial in elect...,What makes Internet users visit cyber stores a...
...,...,...,...,...
4061,1999,Mutual disambiguation of recognition errors in...,As a new generation of multimodal/media system...,Mutual disambiguation of recognition errors in...
4062,1999,Model-based and empirical evaluation of multim...,Our research addresses the problem of error co...,Model-based and empirical evaluation of multim...
4063,1999,Cooperative inquiry: developing new technologi...,"In todays homes and schools, children are emer...",Cooperative inquiry: developing new technologi...
4064,1999,Projected realities: conceptual design for cul...,As a part of a European Union sponsored projec...,Projected realities: conceptual design for cul...


In [None]:
import spacy

# Text preprocessing helper, similar to what we did before.
# Turns a raw text into a list of processed word tokens
# (drops stopwords and punctuation, lemmatizes, and lowercases).
def text2words(rawtext, nlp):
  """Return the lowercased lemmas of the content tokens in ``rawtext``.

  Args:
    rawtext: The raw text to process.
    nlp: A callable (e.g. a loaded spaCy pipeline) that takes the raw text and
      returns an iterable of tokens exposing ``lemma_``, ``is_stop`` and
      ``is_punct``.

  Returns:
    A list of strings, one per kept token, in their original order.
  """
  words = []
  for token in nlp(rawtext):
    # Skip tokens that carry no topical content.
    if token.is_stop or token.is_punct:
      continue
    words.append(token.lemma_.lower())
  return words

In [None]:
# Process the whole corpus (over 4,000 articles). It may take a few minutes.
# Each entry of `corpus` is the token list that text2words returns for one row of data['TEXT'].
corpus = [ text2words(text, nlp) for text in data['TEXT']]

In [None]:
# Let's take a look at a processed article, which includes a list of word tokens.
corpus[0]

## 3 Train LDA Topic Models

In [None]:
import gensim
from gensim import corpora

# Build the vocabulary: a gensim Dictionary over the tokenized articles
# (all the unique words in the articles).
voc = corpora.Dictionary(corpus)

# Convert every article to its bag-of-words form, which refers to words
# by their ids in the vocabulary instead of by their text.
corpusbow = list(map(voc.doc2bow, corpus))

In [None]:
corpusbow[0]

In [None]:
# Let's take a look at the bag-of-words result for the second article.
# Each tuple includes a word id and its frequency in the article.
corpusbow[1]

In [None]:
import logging

# Turn on INFO-level logging so that you can see the model is still being
# trained (it takes a while to train a model).
logging.basicConfig( format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO )

# Some important parameters for training your LDA models.
# Make sure NUM_PASSES and NUM_ITERATIONS are large enough (very important).
NUM_TOPICS = 20 # the number of topics
NUM_PASSES = 10 # the number of passes to scan through the data; use a large number for a small corpus (such as our example)
NUM_ITERATIONS = 100 # the number of times to iterate each document in a single pass (the default number of iteration is 50)
RANDOM_SEED = 875 # fixed seed for the model's random number generator, so that re-running this cell is reproducible

# Training LDA

lda = gensim.models.ldamodel.LdaModel(
    corpusbow,
    id2word = voc,
    num_topics = NUM_TOPICS,
    passes = NUM_PASSES,
    iterations = NUM_ITERATIONS,
    random_state = RANDOM_SEED
)

In [None]:
# You can save the trained LDA topic model to disk,
# so that next time you only need to load it instead of training it again.
# This is how we provide you with the pre-trained LDA topic models in your HW2.

from gensim.test.utils import datapath

# Save the model directly into the work folder. gensim's datapath() helper is
# meant for locating gensim's own bundled test data, so the path is built with
# os.path.join instead (this also works when workdir is a relative path).
path = os.path.join(workdir, 'HCI_topics')
lda.save(path)

## 4 Access LDA Topic Models
*   Load pre-trained models from files
*   Access word probability for a topic: P(w|topic)
*   Access topic distribution for an article in the corpus: P(topic|$\theta_d$)
*   Infer topic distribution for a new article (one that is not in the corpus)



In [None]:
# Let's load an LDA topic model we have trained before.

# Note that you may need to change the file paths in the following two lines to your local file paths.
# Load the dictionary (note that it is stored in a file ending with .id2word).
voc = corpora.Dictionary.load(os.path.join(workdir, 'HCI_topics.id2word'))
# Load the model with the same class that is used for training in this
# notebook (LdaModel), so that training and loading stay consistent.
lda = gensim.models.ldamodel.LdaModel.load(os.path.join(workdir, 'HCI_topics'))

In [None]:
# Show all the topics and the 20 most important words in each topic.
# The number of topics is read from the loaded model rather than from the
# NUM_TOPICS constant, which only exists if the training cell was run in this
# session (it is undefined when you only load a pre-trained model).
lda.show_topics(num_topics=lda.num_topics, num_words=20)

[(0,
  '0.041*"user" + 0.031*"interface" + 0.025*"system" + 0.023*"design" + 0.022*"application" + 0.018*"tool" + 0.014*"datum" + 0.012*"information" + 0.011*"base" + 0.009*"context" + 0.008*"computer" + 0.008*"task" + 0.008*"approach" + 0.008*"support" + 0.007*"interactive" + 0.007*"develop" + 0.006*"use" + 0.006*"center" + 0.006*"build" + 0.006*"work"'),
 (1,
  '0.045*"design" + 0.025*"game" + 0.011*"experience" + 0.010*"study" + 0.009*"user" + 0.009*"hci" + 0.007*"paper" + 0.007*"value" + 0.007*"material" + 0.007*"present" + 0.006*"player" + 0.006*"practice" + 0.006*"finding" + 0.006*"provide" + 0.005*"explore" + 0.005*"field" + 0.005*"construction" + 0.005*"concept" + 0.005*"work" + 0.005*"context"'),
 (2,
  '0.062*"student" + 0.046*"use" + 0.043*"strategy" + 0.031*"strategic" + 0.027*"course" + 0.026*"efficient" + 0.025*"training" + 0.023*"teach" + 0.022*"learn" + 0.020*"cooperative" + 0.018*"recognize" + 0.018*"result" + 0.018*"bone" + 0.017*"study" + 0.016*"experiment" + 0.016*"

In [None]:
# Check the first topic (topic id 0) and show its 100 most important words.
# Each entry of the result is a (word, probability) pair.
lda.show_topic( 0, topn=100 )

[('user', 0.041085735),
 ('interface', 0.030522605),
 ('system', 0.024684656),
 ('design', 0.022877011),
 ('application', 0.022382043),
 ('tool', 0.01840892),
 ('datum', 0.013578498),
 ('information', 0.011545972),
 ('base', 0.011243712),
 ('context', 0.009158233),
 ('computer', 0.007977735),
 ('task', 0.007876852),
 ('approach', 0.0077653895),
 ('support', 0.007756525),
 ('interactive', 0.0074110245),
 ('develop', 0.0071438057),
 ('use', 0.0064468463),
 ('center', 0.006417939),
 ('build', 0.006319943),
 ('work', 0.00625102),
 ('describe', 0.0061360737),
 ('software', 0.006053359),
 ('program', 0.0060454896),
 ('provide', 0.0060333945),
 ('object', 0.005975107),
 ('create', 0.0056849057),
 ('widget', 0.005659753),
 ('graphical', 0.005615658),
 ('organization', 0.0055324063),
 ('new', 0.005347468),
 ('interaction', 0.0052519827),
 ('building', 0.005149192),
 ('goal', 0.005091789),
 ('allow', 0.005062639),
 ('include', 0.004897512),
 ('enable', 0.0046067275),
 ('demonstrate', 0.004515798

In [None]:
# Let's get the topic distribution for the first article in the corpus.
# Note that the input needs to be bow counts (and the words need to use ids in the vocabulary).
# The result is a list of (topic id, probability) pairs.
# NOTE(review): only a few of the topics appear in the output -- presumably
# topics with a very small probability are left out; confirm in the gensim docs.
lda.get_document_topics(corpusbow[0])

[(0, 0.31118113),
 (4, 0.03645061),
 (6, 0.013331685),
 (7, 0.3256694),
 (12, 0.3054687)]

In [None]:
# You can rank the topics by their strength of association with the article by sorting
# the (topic id, probability) pairs on the probability, largest first.
# NOTE(review): the probabilities printed here differ slightly from those of the same
# call in the previous cell -- presumably the inference involves some randomness; confirm.
sorted(lda.get_document_topics(corpusbow[0]), key=lambda t:t[1], reverse=True)

[(7, 0.32565862),
 (0, 0.31124032),
 (12, 0.30542347),
 (4, 0.03644967),
 (6, 0.013329453)]