<a href="https://colab.research.google.com/github/dpstyner/Technical-Tutorial/blob/main/ldaexample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Run if needed: installs the packages this notebook imports.
# Only gensim is passed --upgrade; the other packages are installed without it.
!pip install --upgrade gensim
!pip install numpy
!pip install pandas
!pip install spacy
!pip install nltk

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)


In [7]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy



In [10]:
# NLTK stop words
import nltk as nltk
from nltk.corpus import stopwords

# Download the stop-word corpus, then take the English list and append a few
# extra tokens that should also be filtered out of the documents.
nltk.download('stopwords')
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [27]:
# Import Dataset
# Uses the google.colab upload helper, so this cell is meant to be run in Colab.
from google.colab import files
# The result is indexed by file name in a later cell to get the uploaded bytes.
uploaded = files.upload()



Saving fordemo.csv to fordemo.csv


In [29]:
import io

# Prefer the tutorial's file name, but fall back to the first uploaded file so
# the cell also works when the CSV was uploaded under a different name
# (a hard-coded key would raise KeyError in that case).
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell first.")
csv_name = 'fordemo.csv' if 'fordemo.csv' in uploaded else next(iter(uploaded))

# Parse the uploaded bytes as CSV; the dataset is now stored in a pandas DataFrame.
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
df.head()

Unnamed: 0,Reviewed,Reviewer,Date,Content
0,138488.0,41385.0,Dec 19 2020,\nHelp breast cancer researchers by answering ...
1,138489.0,104269.0,Dec 19 2020,\nAnswer a survey about your personality\n
2,138490.0,90240.0,Dec 19 2020,\nYour perspectives on romantic relationships\n
3,138491.0,96716.0,Dec 19 2020,\nAnswer a 5 minute survey about social situat...
4,138492.0,138265.0,Dec 19 2020,\nSimple transcription from image (~1 minute)\n


In [44]:
# Convert the Content column to a list, dropping missing values (NaN)
data = df.Content.dropna().tolist()

# Collapse every run of whitespace (including new lines) into a single space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove single quotes
data = [sent.replace("'", "") for sent in data]
pprint(data[:1])

[' Help breast cancer researchers by answering questions about pictures of '
 'mannequin ($0.5, ~10 min) ']


In [46]:
# Use Gensim to tokenize
def sent_to_words(sentences):
    """Lazily yield one token list per input sentence.

    Every item is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

# Materialize the token stream so the tokenized documents can be reused
data_words = [tokens for tokens in sent_to_words(data)]

# Show the first tokenized document
print(data_words[:1])

[['help', 'breast', 'cancer', 'researchers', 'by', 'answering', 'questions', 'about', 'pictures', 'of', 'mannequin', 'min']]


In [47]:
# Build the bigram and trigram models
# The bigram model is trained directly on the tokenized documents.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the documents after the bigram model has been
# applied to them.
# NOTE(review): min_count is not passed here, so the library default applies --
# confirm that is intended.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: apply the bigram model, then the trigram model, to the
# first document. In the recorded output no tokens were merged for it.
print(trigram_mod[bigram_mod[data_words[0]]])

['help', 'breast', 'cancer', 'researchers', 'by', 'answering', 'questions', 'about', 'pictures', 'of', 'mannequin', 'min']


In [49]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop stop words.

    Args:
        texts: iterable of documents. Each one is converted with ``str()`` and
            tokenized with ``simple_preprocess`` before filtering.

    Returns:
        A list with one token list per input document, containing only the
        tokens that are not in the module-level ``stop_words``.
    """
    # Build the lookup set once per call; testing membership against the
    # stop_words list would scan the whole list for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
            for doc in texts]

def make_bigrams(texts):
    """Return each document after applying the module-level ``bigram_mod``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Return each document after applying ``bigram_mod`` and then ``trigram_mod``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined with spaces and run through the module-level
    spaCy pipeline ``nlp``, which must be loaded before this function is called.
    POS tag reference given in the original notebook:
    https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: collection of ``token.pos_`` values to keep. The
            default is an immutable tuple rather than a shared mutable list.

    Returns:
        A list with one lemma list per input document, in input order.
    """
    # Stream all documents through the pipeline instead of calling nlp() once
    # per document; the results come back in the same order as the inputs.
    joined_texts = (" ".join(sent) for sent in texts)
    return [[token.lemma_ for token in doc if token.pos_ in allowed_postags]
            for doc in nlp.pipe(joined_texts)]

In [50]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy English model with the parser and NER components
# disabled (for efficiency); they are not needed for POS tags and lemmas.
# The model is loaded by its full package name because the 'en' shortcut is
# rejected by spaCy v3 and later.
# Install it first if needed: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['help', 'breast', 'cancer', 'researcher', 'answer', 'question', 'picture']]


In [51]:
# Build the dictionary that maps integer ids to tokens from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts of the corpus
texts = data_lemmatized

# Bag-of-words: every document becomes a list of (token id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]


In [52]:
# Look up the token that the dictionary stores under id 0
id2word[0]

'answer'

In [53]:
# Human-readable view of the first document: (token, count) instead of (id, count)
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('answer', 1),
  ('breast', 1),
  ('cancer', 1),
  ('help', 1),
  ('picture', 1),
  ('question', 1),
  ('researcher', 1)]]

In [54]:
# Train an LDA topic model on the bag-of-words corpus, with id2word supplying
# the id -> token mapping. random_state is fixed, presumably so that repeated
# runs give the same topics.
# NOTE(review): in the recorded output of the next cell, topics 0 and 4 are
# identical and have uniform 0.013 weights, which suggests num_topics=20 may be
# more than this corpus supports -- consider tuning it (CoherenceModel is
# already imported but unused in the visible cells).
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [55]:
# Show the learned topics as (topic id, weighted top-words string) pairs
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus; doc_lda is not used in the
# cells visible here.
doc_lda = lda_model[corpus]

[(0,
  '0.013*"making" + 0.013*"short" + 0.013*"doxe" + 0.013*"story" + '
  '0.013*"speaker" + 0.013*"read" + 0.013*"native" + 0.013*"test" + '
  '0.013*"opinion" + 0.013*"decision"'),
 (1,
  '0.082*"survey" + 0.082*"uniform" + 0.082*"badge" + 0.082*"answer" + '
  '0.082*"staff" + 0.082*"worker" + 0.082*"hit" + 0.082*"designate" + '
  '0.082*"compensation" + 0.004*"ended"'),
 (2,
  '0.157*"minute" + 0.080*"diversity" + 0.080*"inclusion" + 0.080*"lesson" + '
  '0.080*"reaction" + 0.079*"experience" + 0.079*"game" + 0.079*"answer" + '
  '0.079*"question" + 0.079*"play"'),
 (3,
  '0.123*"answer" + 0.083*"researcher" + 0.083*"help" + 0.083*"breast" + '
  '0.083*"question" + 0.083*"cancer" + 0.083*"picture" + 0.043*"ended" + '
  '0.043*"opinion" + 0.043*"social"'),
 (4,
  '0.013*"making" + 0.013*"short" + 0.013*"doxe" + 0.013*"story" + '
  '0.013*"speaker" + 0.013*"read" + 0.013*"native" + 0.013*"test" + '
  '0.013*"opinion" + 0.013*"decision"'),
 (5,
  '0.156*"supplier" + 0.156*"disruption