In [None]:
import nltk
import string
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pprint import pprint

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Perform LDA using Gensim package

In [None]:
# Single characters that remove_punc() deletes outright (it iterates this string one character at a time).
punc1 = '!"#$%\'()*.:;<=>?@[\\]^`{|}~’“”'
# Substrings that remove_punc() replaces with a space. They are applied before punc1,
# so '...' is handled here before the individual '.' characters are stripped.
punc2 = ['-', '=', '/', '&', '_', '+', '…', '...']

In [None]:
# Preview NLTK's English stopword list (shown as the cell output when run in a notebook).
stopwords.words('english')

In [None]:
# Stopwords list: NLTK's English stopwords extended with transcript-specific entries
# (filler sounds, apostrophe-less contractions such as 'dont', number words, and
# terms such as 'cancer' / 'prostate').
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Repeated entries (e.g. 'ah', 'okay', 'md') are harmless: the list is only used for
# membership tests.
additional_list = [
    'stuff', 'cause', 'mhm', 'mmhm', 'itit', 'youyou', 'ah',
    # 'ifif' and 'there' used to be fused into the single token 'ififthere' by a
    # missing comma (adjacent string literals are concatenated by Python).
    'ifif', 'there', 'kinda', 'le', 'ill', 'hell', 'shell', 'whats', 'isnt', 'thats',
    'theyve', 'arent', 'couldnt', 'didnt', 'hadnt', 'hasnt', 'werent', 'havent',
    'dont', 'wont', 'cant', 'wouldnt', 'id', 'ive', 'gonna', 'hed', 'shouldnt',
    'ii', 'dr', 'cuz', 'im', 'youre', 'hes', 'shes', 'were', 'theyre', 'thethe',
    'theyll', 'youll', 'andand', 'th', 'thatthat', 'sthat', 'wewe', 'ti', 'u', 'heh',
    'le', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'ya', 'nd', 'uhh', 's', 'd', 't', 'by', 'cancer', 'don', 're', 'prostate',
    'oh', 'ah', 'ahh', 'm', 'ok', 'okay', 'md', 'like', 'uh', 'uhum', 'go', 'got',
    'yeah', 'okay', 'yep', 'uhm', 'umm', 'hum', 'na', 'md', 'so', 'pt', 'oth', 'um',
    'legend', 'hmm', 'ah', 'na', 'mm', 'mmm', 'da', 'mmhmm', 'mmmhmm', 'yup', 'hm',
    'know', 'would', 'get', 'other', 'huh',
]
stop_words.extend(additional_list)

In [None]:
# Functions to help with processing text
def remove_parentheses(txt):
    """Strip parenthesised annotations from a transcript string.

    Double-parenthesised spans such as ``((laughs))`` are removed first, then any
    remaining single-parenthesised spans such as ``(pause)``. Whitespace around a
    removed span is left untouched, so two adjacent spaces may remain.

    Args:
        txt: The text to clean.

    Returns:
        The text with the parenthesised spans removed.
    """
    # Raw strings keep `\(` / `\)` as regex escapes instead of invalid Python string
    # escapes, which trigger a SyntaxWarning on recent interpreters.
    txt = re.sub(r'\([^)]*\)\)', '', txt)  # remove double parentheses
    txt = re.sub(r'\([^)]*\)', '', txt)  # remove single parentheses
    return txt

def remove_numerical(txt):
    """Return ``txt`` with every run of ASCII digits (0-9) deleted."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', txt)

def remove_punc(txt):
    """Remove punctuation from ``txt`` using the module-level punc1 / punc2 tables.

    Every substring listed in ``punc2`` is first replaced by a single space; after
    that, every character that occurs in ``punc1`` is dropped.
    """
    spaced = txt
    for separator in punc2:
        spaced = spaced.replace(separator, " ")
    # punc1 is a string of single characters, so filtering character by character
    # gives the same result as deleting each of them in turn.
    return "".join(ch for ch in spaced if ch not in punc1)

def lowercase(txt):
    """Return a lower-cased copy of ``txt`` (delegates to ``str.lower``)."""
    return txt.lower()

In [None]:
# Functions to help with tokenizing, cleaning up text
def sent_to_words(document):
    """Tokenize one document with gensim's ``simple_preprocess``.

    ``document`` is converted with ``str()`` first, so non-string input is accepted.
    Returns whatever ``simple_preprocess`` returns for that text (a list of tokens).
    """
    text = str(document)
    # deacc=True asks gensim to strip accent marks from the tokens (per gensim's docs).
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return tokens

def remove_stopwords(texts, stopword_list=None):
    """Drop stopwords from every tokenized document.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        stopword_list: Optional iterable of words to remove. When omitted, the
            module-level ``stop_words`` list is used (the original behaviour).

    Returns:
        A list with one list of surviving tokens per input document, in the
        original order.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Build the lookup set once: membership tests against the raw list are linear
    # in its length and were repeated for every token of every document.
    blocked = set(stopword_list)
    return [[word for word in doc if word not in blocked] for doc in texts]

def lemma_tokens(tokens, lemmatizer = WordNetLemmatizer()):
    """Lemmatize each token and return the results as a new list.

    Args:
        tokens: Iterable of string tokens.
        lemmatizer: Object exposing ``lemmatize(token)``. The default instance is
            created once, when the function is defined, and reused on every call.

    Returns:
        List of lemmatized tokens, in input order.
    """
    return [lemmatizer.lemmatize(token) for token in tokens]

# Process Text

In [None]:
# Load the transcripts table; 'all_transcripts.csv' is resolved relative to the working directory.
df = pd.read_csv('all_transcripts.csv')
df.head()  # preview the first rows

In [None]:
# Drop the 'Unnamed: 0' column.
# NOTE(review): presumably the index column written by an earlier to_csv - confirm.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to
# re-run: the in-place version raised KeyError once the column was already gone.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [None]:
# Keep an independent snapshot in case we need to revisit the raw text later on.
# A plain `df_copy = df` only creates a second name for the same frame, so the
# in-place cleaning of 'Convo_1' further down would have altered the "copy" too.
df_copy = df.copy()

In [None]:
# Clean up each conversation in the 'Convo_1' column and tokenize it.
conversations_tokenized = []  # one list of tokens per non-null conversation
for index, raw_text in df['Convo_1'].items():
    if pd.isnull(raw_text):
        continue  # rows without a transcript are skipped entirely
    # Same cleaning order as before: parentheses -> digits -> punctuation -> lowercase.
    cleaned = lowercase(remove_punc(remove_numerical(remove_parentheses(raw_text))))
    # DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
    # .at is the supported label-based scalar setter.
    df.at[index, 'Convo_1'] = cleaned
    conversations_tokenized.append(sent_to_words(cleaned))

In [None]:
# Remove stopwords: drop every token found in stop_words from each tokenized conversation
conversations_nostops = remove_stopwords(conversations_tokenized)

In [None]:
# Lemmatize the words, one conversation at a time
conversations_lemmatized = [lemma_tokens(tokens) for tokens in conversations_nostops]

# Create Bag of Words Model

In [None]:
# Create Dictionary: gensim vocabulary built from the lemmatized token lists
# (used below to look up a token by its integer id).
id2word = corpora.Dictionary(conversations_lemmatized)

In [None]:
# Show the dictionary's printed summary
print(id2word)

In [None]:
# Creating Term Document Frequencies: one doc2bow() result per conversation.
# NOTE(review): this is computed before filter_extremes() and is overwritten by the
# rebuild after filtering, so this first version is never used by the model.
corpus = [id2word.doc2bow(text) for text in conversations_lemmatized]

In [None]:
# Vocabulary size before the dictionary is filtered
len(id2word)

In [None]:
# Filter Dictionary based on the following criteria (per gensim's documented parameter semantics):
#   no_below=3    -> drop tokens that appear in fewer than 3 documents
#   no_above=0.80 -> drop tokens that appear in more than 80% of the documents
#   keep_n=5000   -> after the above, keep only the 5000 most frequent tokens
# The return value is not used: the dictionary object itself is updated, which is why
# the bag-of-words corpus is rebuilt afterwards.
id2word.filter_extremes(no_below=3, no_above=0.80, keep_n=5000, keep_tokens=None)

In [None]:
# Create BOW Model: rebuild the corpus against the filtered dictionary so it only
# refers to tokens that survived filter_extremes()
corpus = [id2word.doc2bow(text) for text in conversations_lemmatized]

In [None]:
# Human-readable view of the first document: (token, frequency) instead of (id, frequency)
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

# Create LDA Model

In [None]:
# Build LDA model on the filtered bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,         # number of topics to fit
    passes=10,
    chunksize=50,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,      # fixed seed so reruns are comparable
)

In [None]:
# Pretty-print the output of the model's print_topics()
pprint(lda_model.print_topics())
# Apply the trained model to the corpus.
# NOTE(review): doc_lda is not used anywhere in the visible cells.
doc_lda = lda_model[corpus]

# TODO: Visualize Topics, Compute Coherence Score, Compare Various Parameters

# Troubleshooting Word Processing

In [None]:
# for index, row in df.iterrows():
#     for word in ['twelve']: # stuff, twelve
#         if pd.isnull(row['Convo_1']) == False and word in row['Convo_1']:
#             print(row)