In [7]:
import nltk
import string
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pprint import pprint

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt

# Perform LDA using Gensim package

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
punc1 = '!"#$%\'()*.:;<=>?@[\\]^`{|}~’“”‘–-' # adapted string.punctuation
punc2 = ['=', '/', '&', '_', '+', '…', '...']

In [None]:
# stopwords list 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
additional_list = ['doin','nn','rd','st','wheres','hows','clean','theyve', 'weve', 'youve', 'de','u', 'yer', 'stuff','cause','mhm', 'mmhm','itit', 'youyou', 'ah', 'ifif', 'there', 'kinda', 'le','ill', 'hell', 'shell', 'whats', 'isnt', 'thats', 'theyve', 'arent', 'couldnt', 'didnt', 'hadnt', 'hasnt', 'werent', 'havent','dont', 'wont', 'cant', 'wouldnt', 'id', 'ive', 'gonna', 'hed', 'shouldnt', 'ii','dr','cuz','im','youre', 'hes', 'shes', 'were', 'theyre', 'thethe','theyll', 'youll', 'andand', 'th', 'thatthat', 'sthat', 'wewe','ti','u', 'heh', 'le', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'ya','nd', 'uhh', 's','d','t','by', 'don', 're', 'oh', 'ah', 'ahh', 'm', 'ok', 'okay', 'md', 'like','uh','uhum', 'go', 'got', 'yeah', 'okay', 'yep','uhm', 'umm', 'hum', 'na', 'um', 'legend', 'hmm', 'ah', 'na', 'mm', 'mmm', 'da', 'mmhmm', 'mmmhmm', 'yup', 'hm', 'know', 'would', 'get', 'other', 'huh']
stop_words.extend(additional_list)

In [None]:
# Functions to help with processing text
def remove_parentheses(txt):
    txt = re.sub('\([^)]*\)\)','', txt) # remove double parentheses 
    txt = re.sub(r'\([^)]*\)', '', txt) # remove single parentheses 
    return txt

def remove_numerical(txt):
    txt = re.sub('[0-9]+', '', txt)
    return txt

def remove_punc(txt):
    for a in punc2:
        txt = txt.replace(a," ")
    for b in punc1:
        txt = txt.replace(b,"")
    return txt

def lowercase(txt):
    txt = txt.lower()
    return txt

In [None]:
remove_numerical('I lov3 food. I 8 a pi55a for lunch.')

In [None]:
# Functions to help with tokenizing, cleaning up text
def sent_to_words(document):
    return gensim.utils.simple_preprocess(str(document), deacc=True) # deacc=True removes punctuations

def remove_stopwords(document):
    return [word for word in document if word not in stop_words]

def lemma_tokens(tokens, lemmatizer = WordNetLemmatizer()):
    lemmed = []
    for item in tokens:
        lemmed.append(lemmatizer.lemmatize(item))
    return lemmed

# Process Texts

In [None]:
df = pd.read_csv('all_transcripts.csv')
df.head()

In [None]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
df_copy = df # in case we need to revisit later on, we have a copy

In [None]:
# we are cleaning up each of the conversations in Convo_1 column and then tokenizing the texts
conversations_tokenized = [] 
for index, row in df.iterrows():
    if pd.isnull(row['Convo_1']) == False:
        # remove parentheses, numbers, punctuation, and convert everything to lowercase
        row['Convo_1'] = remove_parentheses(row['Convo_1'])
        row['Convo_1'] = remove_numerical(row['Convo_1'])
        row['Convo_1'] = remove_punc(row['Convo_1'])
        row['Convo_1'] = lowercase(row['Convo_1'])
        df.set_value(index,'Convo_1', row['Convo_1'])
        conversations_tokenized.append(sent_to_words(row['Convo_1'])) # create new column of processed, tokenized doucments

In [None]:
# remove stopwords
conversations_nostops = remove_stopwords(conversations_tokenized)

In [None]:
# lemmatize the words
conversations_lemmatized = []
for doc in conversations_nostops:
    conversations_lemmatized.append(lemma_tokens(doc))

# Create Bag of Words Model

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(conversations_lemmatized)

In [None]:
print(id2word)

In [None]:
# Creating Term Document Frequencies
corpus = [id2word.doc2bow(text) for text in conversations_lemmatized]

In [None]:
len(id2word)

In [None]:
# Filter Dictionary based on the following criteria
id2word.filter_extremes(no_below=3, no_above = 0.75, keep_n=7000, keep_tokens=None) 

In [None]:
# Create BOW Model
corpus = [id2word.doc2bow(text) for text in conversations_lemmatized]

In [None]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Create LDA Model

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           update_every=2,
                                           chunksize=25,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

# Topic Coherence

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=conversations_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Look at how the various parameters impact coherence score (see how each parameter impacts in isolation versus more of a grid-search approach)

Brief non-technical explanation of topic coherence: https://www.quora.com/What-is-topic-coherence

### No_below
- Attempt 1: Coherence Score = 0.3878
- Attempt 2: no_below = 2, Coherence Score = 0.4577
- Attempt 3: no_below = 3, Coherence Score = 0.4577
- Attempt 4: no_below = 5, Coherence Score = 0.3878
- no_above = 0.8, keep_n = 5000, chunk_size = 50, update_every = 1, passes = 10

### No_above
- Attempt 1: Coherence Score = 0.45206915803222947
- Attempt 2: no_above = 0.9, Coherence Score = 0.4372
- Attempt 3: no_above = 0.8, Coherence Score = 0.4577
- Attempt 4: no_above = 0.75, Coherence Score = 0.4611
- Attempt 5: no_above = 0.70, Coherence Score = 0.4206
- no_below = 3, keep_n = 5000, chunk_size = 50, update_every = 1, passes = 10

### Keep_n
- Attempt 1: keep_n = 3000, Coherence Score:  0.4555
- Attempt 2: keep_n = 5000, Coherence Score:  0.4611
- Attempt 3: keep_n = 7000, Coherence Score:  0.4747
- no_below = 3, no_above = 0.75, chunk_size = 50, update_every = 1, passes = 10

### Num_passes
- Attempt 1: Passes = 5, Coherence Score = 0.4347
- Attempt 2: Passes = 10, Coherence Score = 0.4819
- Attempt 3: Passes = 20, Coherence Score = 0.4965
- no_below = 3, no_above = 0.75, keep_n = 7000, chunk_size = 50, update_every = 1

- Attempt 1: Passes = 5, Coherence Score = 0.4627
- Attempt 2: Passes = 10, Coherence Score = 0.4747
- Attempt 3: Passes = 20, Coherence Score = 0.5073
- no_below = 3, no_above = 0.75, keep_n = 7000, chunk_size = 25, update_every = 2

### Num_topics
- Attempt 1: num_topics = 5, Coherence Score = 0.3472
- Attempt 2: num_topics = 10, Coherence Score = 0.4747
- Attempt 3: num_topics = 15, Coherence Score = 0.3778
- no_below = 3, no_above = 0.75, keep_n = 7000, chunk_size = 25, update_every = 2, passes = 10

# Visualizing Topics

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In this plot, we would want non-intersecting topics (bubbles) in different coordinates. The size of the bubble represents the prevalence of the topic, and we would expect some to be larger than others (although we want them to be generally large because that means that each topic is important, versus having many small bubbles). However, we mostly see overlapping topics of the same size in our topic models, which means that the topic model did not identify very distinct topics.

# Troubleshooting Word Processing

In [None]:
for index, row in df.iterrows():
    for word in ['yer']: # de
        if pd.isnull(row['Convo_1']) == False and word in gensim.utils.simple_preprocess(str(row['Convo_1']), deacc=True):
            print(row)