# Finding the Best Topics

It's now time for us to stop playing with LDA and agree on (the parameters that make) the best topics. To do so, we will vary the following parameters until we find the clearest topics (to our eyes):
- number of topics (2-20)
- filtering out extreme values
- only using transcripts for patients that we have txgot_binary for

Let's get started.

## Set Up
### Import Statements and Load Datasets

In [127]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import re
import numpy as np
import pandas as pd
from pprint import pprint

#gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
from gensim.models import NormModel

#spacy for lemmatization
import spacy

#plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samanthagarland/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
#read in our transcripts
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
transcript_df = pd.read_csv("/Users/samanthagarland/Downloads/shared_data_merged.csv")

#from Preethi
# Keep only rows that have both a first-conversation transcript (Convo_1) and
# a txgot_binary value.
has_data = transcript_df["Convo_1"].notna() & transcript_df["txgot_binary"].notna()

# One frame per study. The explicit .copy() makes each subset an independent
# DataFrame, so the columns added to dvd/va in later cells are written to a
# real frame instead of a slice of transcript_df (avoids SettingWithCopyWarning).
dvd = transcript_df[(transcript_df["Study"] == "DVD") & has_data].copy()
va = transcript_df[(transcript_df["Study"] == "VA") & has_data].copy()

convo_dvd = dvd["Convo_1"]
convo_va = va["Convo_1"]

dvd.head()

Unnamed: 0.1,Unnamed: 0,Tx3,Advice1,Anx11,Anx111,Anx112,Anx113,Anx12,Anx13,Anx51,...,pacific,psa1,raceother,txgot,txgot_binary,white,Convo_1,Convo_2,Doctor_1,Doctor_2
263,263,1.0,SR,,,0.0,0.0,0.0,1.0,,...,0.0,5.9,0,1.0,1.0,1.0,l394r1 interview length 014034 legend pt patie...,l394u2 interview length 003455 legend pt patie...,,
264,264,2.0,A,0.0,1.0,,0.0,,0.0,1.0,...,0.0,3.6,0,2.0,0.0,0.0,l371u1 interview length 004701 legend pt patie...,l371r2 interview length 005058 legend pt patie...,,
265,265,2.0,A,2.0,3.0,2.0,2.0,3.0,2.0,2.0,...,0.0,5.9,0,2.0,0.0,1.0,t062u1 interview length 02507 legend pt patien...,,,
266,266,2.0,A,2.0,1.0,,3.0,,2.0,1.0,...,0.0,2.8,0,2.0,0.0,1.0,t099u1 interview length 005257 legend pt patie...,,,
271,271,,R,2.0,0.0,1.0,2.0,2.0,2.0,0.0,...,0.0,6.3,0,1.0,1.0,1.0,l039u1 interview length 03839 legend pt patien...,,,


### Methods

Here we define the methods we'll use later in the process.

In [129]:
def sent_to_words(sentences):
    """Lazily tokenize every item of *sentences*.

    Each item is cast to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

def remove_stopwords(texts):
    """Return one stop-word-free token list per document in *texts*.

    Each document is stringified, re-tokenized with ``simple_preprocess`` and
    stripped of every token present in the module-level ``stopWords``.
    """
    cleaned = []
    for doc in texts:
        kept = [tok for tok in simple_preprocess(str(doc)) if tok not in stopWords]
        cleaned.append(kept)
    return cleaned

def make_bigrams_dvd(texts):
    """Apply the DVD bigram phraser (``bigram_mod_dvd``) to each document in *texts*."""
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod_dvd[tokens])
    return phrased

def make_trigrams_dvd(texts):
    """Apply the DVD bigram phraser, then the DVD trigram phraser, to each document."""
    phrased = []
    for tokens in texts:
        with_bigrams = bigram_mod_dvd[tokens]
        phrased.append(trigram_mod_dvd[with_bigrams])
    return phrased

def make_bigrams_va(texts):
    """Apply the VA bigram phraser (``bigram_mod_va``) to each document in *texts*."""
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod_va[tokens])
    return phrased

def make_trigrams_va(texts):
    """Apply the VA bigram phraser, then the VA trigram phraser, to each document.

    This is the VA counterpart of ``make_trigrams_dvd``. It must carry its own
    name: defining it as ``make_trigrams_dvd`` would replace the DVD helper
    and make DVD calls run through the VA phrase models.
    """
    return [trigram_mod_va[bigram_mod_va[doc]] for doc in texts]

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# NOTE(review): the 'en' shortcut name presumably depends on the installed
# spaCy version; confirm it still resolves before upgrading spaCy.
nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag names follow https://spacy.io/api/annotation

    Parameters:
    ----------
    texts : Iterable of token lists; each list is joined with spaces and run
        through the module-level spaCy pipeline ``nlp``
    allowed_postags : Collection of ``token.pos_`` values to keep (an
        immutable tuple by default; any container supporting ``in`` works)

    Returns:
    -------
    texts_out : One list of lemmas per input document, in input order
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # The id->word mapping must be the dictionary the caller passed in;
        # this notebook defines no global named `id2word`.
        # NOTE(review): `mallet_path` is read from module scope and is not
        # defined in the visible part of this notebook; set it before calling.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

def format_topics_sentences(ldamodel, corpus, texts):
    """Return one row per document describing its dominant topic.

    Parameters:
    ----------
    ldamodel : Topic model; ``ldamodel[corpus]`` must yield, per document, a
        list of ``(topic_id, proportion)`` pairs, and
        ``ldamodel.show_topic(topic_id)`` must return ``(word, weight)`` pairs
    corpus : Corpus to run through ``ldamodel``
    texts : Per-document contents, appended as the last column

    Returns:
    -------
    sent_topics_df : DataFrame with columns ``Dominant_Topic`` (int),
        ``Perc_Contribution`` (rounded to 4 places), ``Topic_Keywords``
        (comma-joined words of the topic) and an unnamed column (label ``0``)
        holding ``texts``
    """
    # Collect plain tuples and build the frame once: DataFrame.append copies
    # the whole frame on every call and no longer exists in pandas 2.
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # No topic pairs for this document, so there is no dominant topic.
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


### Stop Words

We take the standard NLTK English stop words and add our own transcript-specific stop words. See Stop Word Rules for our process for finding stop words.

In [130]:
# Start from NLTK's English stop-word list.
stopWords = stopwords.words('english')

# Drop apostrophes from every entry and de-duplicate
# (presumably so entries match the punctuation-free tokens; confirm).
stopWords = set([word.replace("'", "") for word in stopWords])

# Add the project-specific stop words to the NLTK set.
stopWords = stopWords.union(set(["taiwan", "taiwanese", "communist", "mmmhmm", "'", "'cause", "'em", 'a', 'aa', 'aaah', 'aah', 'ab', 'about', 'above', 'african', 'after', 'again', 'against', 'ah', 'ahh', 'ahhh', 'ahhhh', 'ahhm', 'ain', 'aint', 'alabama', 'alaska', 'all', 'alot', 'alright', 'alrighty', 'also', 'am', 'an', 'anand', 'and', 'andand', 'any', 'anyone', 'are', 'aren', 'arent', 'as', 'at', 'ay', 'b', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'bye', 'c', 'california', 'came', 'can', 'cant', 'clean', 'costa_rica', 'could', 'couldn', 'couldnt', 'cuz', 'd', 'de', 'did', 'didn', 'didnt', 'do', 'doc', 'does', 'doesn', 'doesnt', 'doin', 'doing', 'dokey', 'don', 'dont', 'down', 'during', 'e', 'each', 'eek', 'eh', 'em', 'er', 'et', 'etc', 'europe', 'f', 'few', 'florida','for', 'from', 'further', 'g', 'ga', 'gal', 'gee', 'geez', 'germany', 'get', 'go', 'goin', 'going', 'gonna', 'gosh', 'got', 'gotta', 'greek', 'gu', 'h', 'ha', 'had', 'hadn', 'hadnt', 'has', 'hasn', 'hasnt', 'have', 'haven', 'havent', 'having', 'he', 'hed', 'heh', 'hell', 'hello', 'henry', 'her', 'here', 'hers', 'herself', 'hes', 'hey', 'hi', 'him', 'himself', 'his', 'hm', 'hmm', 'hmmm', 'hodgkins', 'how', 'hows', 'huh', 'hum', 'i', 'id', 'if', 'ifif', 'ii', 'iii', 'ill', 'im', 'imrt', 'in', 'inaudible', 'indecipherable', 'indianapolis', 'into', 'is', 'isis', 'isn', 'isnt', 'it', 'itd', 'itit', 'itll', 'its', 'itself', 'ive', 'j', 'jeez', 'just', 'k', 'kay', 'kinda', 'l', 'laughs', 'le', 'leastno', 'legend', 'let', 'lets', 'like', 'll', 'look', 'lot', 'm', 'ma', 'maam', 'md', 'mdmd', 'me', 'mhm', 'mhmm', 'mhmmm', 'michigan', 'mightn', 'mightnt', 'mightve', 'mkay', 'mm', 'mmhm', 'mmhmm', 'mmkay', 'mmm', 'mmmhmm','mmmhmmm', 'mmmm', 'mmmmm', 'more', 'most', 'mustn', 'mustnt', 'mustve', 'my', 'myself', 'n', 'na', 'nah', 'nahuh', 'nd', 'ne', 'needn', 'neednt', 'nn', 'no', 'nooh', 'noooo', 'nope', 'nor', 'not', 'now', 'o', 'of', 'off', 'oh', 'ohh', 'ohhh', 
'ohhhohohohoh', 'ohio', 'ok', 'okay', 'okey', 'on', 'once', 'only', 'oooh', 'or', 'oth', 'other', 'othumhmm', 'oughta', 'our', 'ours', 'ourselves', 'out', 'over', 'ow', 'own', 'p', 'patient', 'phi', 'physician', 'potter', 'pt', 'pt/so', 'q', 'r', 'rd', 're', 'right', 'ro', 's', 'said', 'same', 'say', 'see', 'shan', 'shant', 'she', 'shell', 'shes', 'should', 'shouldn', 'shouldnt', 'shouldve', 'so', 'some', 'sorta', 'sounds', 'st', 'stuff', 'such', 'swedish', 't', 'th', 'than', 'that', 'thatd', 'thatll', 'thats', 'thatsthat', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'thered', 'thereof', 'theres', 'thereve', 'these', 'thethe', 'thew', 'they', 'theyll', 'theyre', 'theyve', 'thing', 'things', 'this', 'those', 'through', 'ti', 'to', 'too', 'tthe', 'u', 'uh', 'uhh', 'uhhhhh', 'uhhm', 'uhhmm', 'uhhuh', 'uhm', 'uhmhmm', 'uhmhmmm', 'uhmmm', 'uhoh', 'uhum', 'um', 'umhmm', 'umhmmm', 'umm', 'ummm', 'ummmm', 'un', 'under', 'unhunh', 'until', 'up', 'us', 'uuh', 'v', 've', 'very', 'vietnam', 'virginia', 'w', 'walsh', 'wanna', 'was', 'washington', 'wasn', 'wasnt', 'we', 'wed', 'well', 'went', 'were', 'weren', 'werent', 'weve', 'wewe', 'what', 'whatd', 'whatev', 'whatever', 'whatnot', 'whats', 'when', 'where', 'wheres', 'whew', 'which', 'while', 'who', 'whoa', 'whom', 'whos', 'why', 'will', 'with', 'won', 'wont', 'would', 'wouldn', 'wouldnt', 'x', 'y', 'ya', 'yada', 'yah', 'yall', 'yea', 'yeah', 'yep', 'yepvery', 'yer', 'yeyeah', 'you', 'youd', 'youl', 'youll', 'your', 'youre', 'yours', 'yourself', 'yourselves', 'youve', 'youyou', 'yup', 'z']))
# Report the size of the combined set.
print("We have", len(stopWords), "stop words.")

We have 432 stop words.


## Pre Processing
### Build Bigrams and Trigrams

In [131]:
# Tokenize every first-conversation transcript: one token list per transcript.
data_words_dvd = [*sent_to_words(convo_dvd)]
data_words_va = [*sent_to_words(convo_va)]

In [132]:
# Phrase models are trained separately per study: bigrams on the raw token
# lists, trigrams on the bigram-merged token lists.
bigram_dvd = gensim.models.Phrases(data_words_dvd, threshold=100, min_count=2)
bigram_va = gensim.models.Phrases(data_words_va, threshold=100, min_count=2)

trigram_dvd = gensim.models.Phrases(bigram_dvd[data_words_dvd], threshold=100)
trigram_va = gensim.models.Phrases(bigram_va[data_words_va], threshold=100)

# Wrap each trained model in a Phraser, the faster way to get a sentence
# clubbed as a trigram/bigram.
bigram_mod_dvd, trigram_mod_dvd = (
    gensim.models.phrases.Phraser(model) for model in (bigram_dvd, trigram_dvd)
)
bigram_mod_va, trigram_mod_va = (
    gensim.models.phrases.Phraser(model) for model in (bigram_va, trigram_va)
)




### Remove Stop Words and Lemmatize the Texts

In [133]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Parts of speech kept by the lemmatizer: noun, adj, vb, adv
keep_pos = ['NOUN', 'ADJ', 'VERB', 'ADV']

# DVD: remove stop words -> form bigrams -> lemmatize
data_words_nostops_dvd = remove_stopwords(data_words_dvd)
data_words_bigrams_dvd = make_bigrams_dvd(data_words_nostops_dvd)
data_lemmatized_dvd = lemmatization(data_words_bigrams_dvd, allowed_postags=keep_pos)

# VA: same three steps with the VA phraser
data_words_nostops_va = remove_stopwords(data_words_va)
data_words_bigrams_va = make_bigrams_va(data_words_nostops_va)
data_lemmatized_va = lemmatization(data_words_bigrams_va, allowed_postags=keep_pos)

# Second stop-word pass, this time over the lemmas.
data_lemmatized_dvd = [
    [lemma for lemma in convo if lemma not in stopWords]
    for convo in data_lemmatized_dvd
]
data_lemmatized_va = [
    [lemma for lemma in convo if lemma not in stopWords]
    for convo in data_lemmatized_va
]

In [134]:
# Store each DVD row's lemmatized token list next to its raw transcript.
dvd["Convo_1_lemmatized"] = data_lemmatized_dvd

# Same for the VA rows.
va["Convo_1_lemmatized"] = data_lemmatized_va

### Create Corpus

In [203]:
# The lemmatized token lists are the texts the models are built from.
texts_dvd = data_lemmatized_dvd
texts_va = data_lemmatized_va

# Token <-> id dictionaries, one per study.
id2word_dvd = corpora.Dictionary(texts_dvd)
id2word_va = corpora.Dictionary(texts_va)

#Filter dictionary
# NOTE(review): gensim documents `no_below` as an absolute number of documents
# (only `no_above` is a fraction). The fractional no_below values here look
# like they were meant as proportions; as written they would not remove rare
# tokens. Confirm intent before changing: the topic ids chosen further down
# depend on the current vocabulary.
id2word_dvd.filter_extremes(no_below=0.30, no_above=0.60, keep_n=5000, keep_tokens=None)
id2word_va.filter_extremes(no_below=0.2, no_above=0.60, keep_n=5000, keep_tokens=None)

print(len(texts_va))

# Bag-of-words corpus: each document becomes a list of (token id, count) pairs.
corp_dvd = [id2word_dvd.doc2bow(tokens) for tokens in texts_dvd]
corp_va = [id2word_va.doc2bow(tokens) for tokens in texts_va]

216


In [204]:
# Store each DVD row's bag-of-words next to its transcript.
dvd["Convo_1_corp"] = corp_dvd

# Same for the VA rows.
va["Convo_1_corp"] = corp_va

For each patient, we now have their full transcript data, this data lemmatized and with stop words removed, and the corpus from the transcript. Let's run LDA now.

## LDA Model

### DVD Topics

In [161]:
#dvd
# Settings for the DVD model, collected in one place; the seed is fixed.
lda_params_dvd = {
    'num_topics': 12,
    'random_state': 100,
    'update_every': 3,
    'chunksize': 20,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_model_dvd = gensim.models.ldamodel.LdaModel(
    corpus=corp_dvd,
    id2word=id2word_dvd,
    **lda_params_dvd
)


In [154]:
#view these topics
print("DVD Topics:")
pprint(lda_model_dvd.print_topics())
# Model applied to the DVD corpus (a later cell builds the same object as `distributions`).
doc_lda_dvd = lda_model_dvd[corp_dvd]

DVD Topics:
[(0,
  '0.009*"walk" + 0.007*"medicine" + 0.007*"belly" + 0.006*"lump" + '
  '0.006*"bump" + 0.006*"smoke" + 0.006*"dad" + 0.005*"heart_attack" + '
  '0.005*"cure" + 0.005*"seed"'),
 (1,
  '0.007*"cell" + 0.005*"rectum" + 0.005*"situation" + '
  '0.005*"active_surveillance" + 0.004*"main" + 0.004*"medication" + '
  '0.004*"exam" + 0.004*"generally" + 0.004*"book" + 0.004*"everybody"'),
 (2,
  '0.014*"erectile_disfunction" + 0.008*"soo" + 0.008*"rigid" + '
  '0.008*"involves_remov" + 0.007*"impair" + 0.004*"chemo" + 0.003*"toxic" + '
  '0.003*"mindset" + 0.003*"aww" + 0.002*"engage"'),
 (3,
  '0.010*"cd" + 0.010*"dominant" + 0.005*"indian" + 0.003*"mail" + 0.002*"rea" '
  '+ 0.002*"scandinavia" + 0.002*"scene" + 0.001*"nutrition" + '
  '0.001*"impressive" + 0.001*"hike"'),
 (4,
  '0.000*"active_surveillance" + 0.000*"cell" + 0.000*"dose" + '
  '0.000*"difference" + 0.000*"intermediate" + 0.000*"seed" + 0.000*"rectum" + '
  '0.000*"sample" + 0.000*"gland" + 0.000*"implant"'),

In [190]:
print("DVD:")

# Log-perplexity of the DVD model on its own training corpus.
perplexity_dvd = lda_model_dvd.log_perplexity(corp_dvd)
print('\nPerplexity: ', perplexity_dvd) #a measure of how good the model is

# c_v coherence of the DVD topics against the lemmatized DVD texts.
coherence_model_lda_dvd = CoherenceModel(
    model=lda_model_dvd,
    texts=data_lemmatized_dvd,
    dictionary=id2word_dvd,
    coherence='c_v',
)
coherence_lda_dvd = coherence_model_lda_dvd.get_coherence()
print('\nCoherence Score: ', coherence_lda_dvd)

print()


DVD:

Perplexity:  -7.817201225103486

Coherence Score:  0.3482593101506253



### Visualizations

In [140]:
print("DVD:")
# Enable pyLDAvis notebook output, then prepare the view from the DVD model,
# corpus and dictionary.
pyLDAvis.enable_notebook()
vis_dvd = pyLDAvis.gensim.prepare(lda_model_dvd, corp_dvd, id2word_dvd)
# Bare expression: the notebook renders the prepared visualization.
vis_dvd

DVD:


#### Our Topics:
- Visualization 1 == surgery == Topic 8
- Visualization 2 == radiation == Topic 6
- Visualization 4 == surveillance == Topic 10
- Visualization 5 == appointment == Topic 0

What combination produces the best topics?
- 15 topics, filters as 0.3-0.6
    - get radiation (2), active surveillance (3), surgery (albeit weak) (1)
- 2 topics, filters at 0.3-0.6
    - topics aren't clear at all
- 4 topics, filters at 0.3-0.6
    - don't get a strong surgery one
- 5 topics, filters at 0.3-0.6
    - active surveillance, two weak surgery topics, radiation
- 8 topics, filters at 0.3-0.6
    - kinda surgery, radiation, surveillance, appointment stuff
- 10 topics, filters at 0.3-0.6
    - good radiation topic, good active surveillance topic, good appointment topic, surgery topic still weak
- 12 topics, filters at 0.3-0.6
    - good radiation, okay surgery, good surveillance, good appointment
- 14 topics, filters at 0.3-0.6
    - about the same
- 12 topics, filters at 0.25-0.75
    - topics are messed up, not very clear
- 12 topics, filters at 0.3-0.7
    - have topics, but they're not as good as filters at 0.3-0.6
- 12 topics, filters at 0.35-0.65
    - decent surgery topic, good radiation topic, even better surgery topic, good appt topic, OK surveillance topic
==> best topics seem to come from 12 topics, filters at 0.3-0.6

What combination produces the best topics?
- 12 topics, filters at 0.3-0.6 (best for DVD)
    - good surgery topic!, good DA topic, outcomes/side effect topic?, tiny little surveillance topic, but didn't really see a radiation topic

In [144]:
# Topic ids picked by eye from the DVD visualization (appointment, radiation,
# surgery, surveillance). Only needed by the optional filter noted below.
weLikeThese = [0,6,8,10]
def format_topics_sentences(ldamodel, corpus, texts):
    """Return one row per document describing its dominant topic.

    Parameters:
    ----------
    ldamodel : Topic model; ``ldamodel[corpus]`` must yield, per document, a
        sequence whose first element is a list of ``(topic_id, proportion)``
        pairs, and ``ldamodel.show_topic(topic_id)`` must return
        ``(word, weight)`` pairs
    corpus : Corpus to run through ``ldamodel``
    texts : Per-document contents, appended as the last column

    Returns:
    -------
    sent_topics_df : DataFrame with columns ``Dominant_Topic`` (int),
        ``Perc_Contribution`` (rounded to 4 places), ``Topic_Keywords``
        (comma-joined words of the topic) and an unnamed column (label ``0``)
        holding ``texts``
    """
    # Collect plain tuples and build the frame once: DataFrame.append copies
    # the whole frame on every call and no longer exists in pandas 2.
    records = []
    for row in ldamodel[corpus]:
        doc_topics = row[0]
        # To restrict the choice to the curated topics, filter here:
        #   doc_topics = [t for t in doc_topics if t[0] in weLikeThese]
        if not doc_topics:
            # No topic pairs for this document, so there is no dominant topic.
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant-topic table for the DVD model over the lemmatized DVD transcripts.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_dvd, corpus=corp_dvd, texts=data_lemmatized_dvd)

# Format
# reset_index() turns the row index into a leading column, giving the five
# columns renamed on the next line.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(100)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,8.0,0.5008,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, significant, saw, surgery, ..."
1,1,8.0,0.7188,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, significant, person, solid,..."
2,2,8.0,0.3900,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, significant, significant, l..."
3,3,8.0,0.9743,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, person, little, red_light, ..."
4,4,1.0,0.2857,"cell, rectum, situation, active_surveillance, ...","[interview_length, dr, senior, resident, work,..."
5,5,8.0,0.5405,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, significant, good, complain..."
6,6,8.0,0.8853,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, significant, person, fabulo..."
7,7,1.0,0.8659,"cell, rectum, situation, active_surveillance, ...","[interview_length, significant, mean, think, h..."
8,8,8.0,0.6892,"erectile_dysfunction, erectile, cell, robot, t...","[interview, time, significant, research_assist..."
9,9,8.0,0.8320,"erectile_dysfunction, erectile, cell, robot, t...","[interview_length, significant, awhile, nice, ..."


In [145]:
# Which topic ids ever come out as a document's dominant topic.
df_dominant_topic.Dominant_Topic.unique()

array([ 8.,  1., 10.,  9.,  0.,  7.,  6.])

In [146]:
#from Preethi
# Per-document model output for the DVD corpus; the next cell reads the first
# element of each entry as (topic id, weight) pairs.
distributions = lda_model_dvd[corp_dvd]

In [152]:
# corp_dvd is the DVD corpus that remains after filtering out null
# txgot_binary and Convo_1 values; one weight slot per document and topic.
dvd_length = len(corp_dvd)
topic0, topic6, topic8, topic10 = ([0] * dvd_length for _ in range(4))

# Topic id -> list receiving that topic's weight for every document.
# Documents in which a topic does not appear keep the initial 0.
tracked_dvd = {0: topic0, 6: topic6, 8: topic8, 10: topic10}

# store the topic percentage values for the relevant topics
for doc_idx, doc_result in enumerate(distributions):
    for topic_id, weight in doc_result[0]:
        target = tracked_dvd.get(topic_id)
        if target is not None:
            target[doc_idx] = weight

dvd['radiation_topic'] = topic6
dvd['active_surveillance_topic'] = topic10
dvd['appt_topic'] = topic0
dvd['surgery_topic'] = topic8

dvd.head(50)

Unnamed: 0.1,Unnamed: 0,Tx3,Advice1,Anx11,Anx111,Anx112,Anx113,Anx12,Anx13,Anx51,...,Convo_1,Convo_2,Doctor_1,Doctor_2,Convo_1_lemmatized,Convo_1_corp,radiation_topic,active_surveillance_topic,appt_topic,surgery_topic
263,263,1.0,SR,,,0.0,0.0,0.0,1.0,,...,l394r1 interview length 014034 legend pt patie...,l394u2 interview length 003455 legend pt patie...,,,"[interview_length, significant, saw, surgery, ...","[(0, 1), (1, 2), (2, 1), (3, 4), (4, 31), (5, ...",0.380425,0.013018,0.0,0.50084
264,264,2.0,A,0.0,1.0,,0.0,,0.0,1.0,...,l371u1 interview length 004701 legend pt patie...,l371r2 interview length 005058 legend pt patie...,,,"[interview_length, significant, person, solid,...","[(2, 1), (4, 2), (7, 1), (8, 3), (12, 2), (16,...",0.107739,0.0,0.172553,0.718799
265,265,2.0,A,2.0,3.0,2.0,2.0,3.0,2.0,2.0,...,t062u1 interview length 02507 legend pt patien...,,,,"[interview_length, significant, significant, l...","[(4, 1), (9, 1), (12, 1), (17, 1), (19, 2), (2...",0.173313,0.119528,0.189936,0.390042
266,266,2.0,A,2.0,1.0,,3.0,,2.0,1.0,...,t099u1 interview length 005257 legend pt patie...,,,,"[interview_length, person, little, red_light, ...","[(2, 1), (3, 1), (8, 2), (9, 1), (14, 2), (17,...",0.024237,0.0,0.0,0.974346
271,271,,R,2.0,0.0,1.0,2.0,2.0,2.0,0.0,...,l039u1 interview length 03839 legend pt patien...,,,,"[interview_length, dr, senior, resident, work,...","[(2, 1), (4, 9), (5, 1), (12, 1), (16, 1), (21...",0.034959,0.241873,0.0,0.258541
272,272,,SR,2.0,0.0,0.0,0.0,2.0,0.0,1.0,...,t042u1 interview length 001853 legend pt patie...,,,,"[interview_length, significant, good, complain...","[(2, 1), (15, 1), (27, 7), (40, 2), (43, 1), (...",0.0,0.243448,0.0,0.540551
274,274,,R,2.0,2.0,1.0,2.0,1.0,2.0,1.0,...,l044u1 interview length 03806 legend pt patien...,l044r2 interview length 011305 legend pt patie...,,,"[interview_length, significant, person, fabulo...","[(10, 1), (14, 1), (15, 1), (20, 1), (22, 2), ...",0.0,0.0,0.113521,0.88533
275,275,,ASR,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,t048u1 interview length 003149 legend pt patie...,,,,"[interview_length, significant, mean, think, h...","[(1, 1), (3, 2), (4, 7), (6, 2), (16, 1), (19,...",0.03216,0.098015,0.0,0.0
277,277,,A,3.0,2.0,2.0,2.0,3.0,3.0,2.0,...,l026u1 interview time 03941 legend pt patient ...,l026r2 interview length 011318 legend pt patie...,,,"[interview, time, significant, research_assist...","[(6, 2), (8, 1), (12, 1), (16, 2), (20, 1), (2...",0.129314,0.0,0.180747,0.689246
278,278,,A,3.0,2.0,3.0,3.0,3.0,3.0,3.0,...,l290u1 interview length 002719 legend pt patie...,,,,"[interview_length, significant, awhile, nice, ...","[(2, 1), (7, 1), (12, 1), (15, 1), (31, 1), (4...",0.151463,0.0,0.0,0.831997


In [153]:
# Write the DVD frame, including the columns added in the cells above, to a
# CSV in the current working directory.
dvd.to_csv("dvd_topic_modeling_with_distributions.csv", header=True)

## VA

Now we do the exact same thing for the VA topics.

In [240]:
#va
# Settings for the VA model, collected in one place; the seed is fixed.
lda_params_va = {
    'num_topics': 20,
    'random_state': 100,
    'update_every': 3,
    'chunksize': 20,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_model_va = gensim.models.ldamodel.LdaModel(
    corpus=corp_va,
    id2word=id2word_va,
    **lda_params_va
)

In [241]:
# View the VA topics.
print("VA Topics:")
pprint(lda_model_va.print_topics())
# Model applied to the VA corpus (a later cell builds the same object as `distributions_va`).
doc_lda_va = lda_model_va[corp_va]

VA Topics:
[(0,
  '0.014*"recommendation" + 0.011*"conjunction" + 0.011*"safer" + '
  '0.009*"hunch" + 0.009*"sixtyseven" + 0.008*"gi" + 0.008*"urinalysis" + '
  '0.007*"radial" + 0.007*"mortality" + 0.007*"trickle"'),
 (1,
  '0.007*"procedure" + 0.007*"seed" + 0.007*"basically" + 0.005*"tissue" + '
  '0.005*"grade" + 0.005*"robotic" + 0.005*"area" + 0.005*"robot" + '
  '0.005*"depend" + 0.005*"incision"'),
 (2,
  '0.000*"area" + 0.000*"basically" + 0.000*"certainly" + 0.000*"incontinence" '
  '+ 0.000*"appointment" + 0.000*"sample" + 0.000*"oncologist" + '
  '0.000*"tissue" + 0.000*"benefit" + 0.000*"already"'),
 (3,
  '0.000*"incontinence" + 0.000*"erectile_dysfunction" + 0.000*"symptom" + '
  '0.000*"grade" + 0.000*"benefit" + 0.000*"term" + 0.000*"life" + '
  '0.000*"afterwards" + 0.000*"affect" + 0.000*"die"'),
 (4,
  '0.009*"proton" + 0.005*"appearance" + 0.002*"upbut" + 0.002*"ccategorize" + '
  '0.002*"gambit" + 0.002*"dris" + 0.002*"medof" + 0.002*"fourandahalf" + '
  '0.002*"

In [242]:
print("VA:")

# Log-perplexity of the VA model on its own training corpus.
perplexity_va = lda_model_va.log_perplexity(corp_va)
print('\nPerplexity: ', perplexity_va) #a measure of how good the model is

# c_v coherence of the VA topics against the lemmatized VA texts.
coherence_model_lda_va = CoherenceModel(
    model=lda_model_va,
    texts=data_lemmatized_va,
    dictionary=id2word_va,
    coherence='c_v',
)
coherence_lda_va = coherence_model_lda_va.get_coherence()
print('\nCoherence Score: ', coherence_lda_va)

VA:

Perplexity:  -7.54501660585149

Coherence Score:  0.36778197856036077


In [243]:
print("VA:")
# Enable pyLDAvis notebook output, then prepare the view from the VA model,
# corpus and dictionary.
pyLDAvis.enable_notebook()
vis_va = pyLDAvis.gensim.prepare(lda_model_va, corp_va, id2word_va)
# Bare expression: the notebook renders the prepared visualization.
vis_va

VA:


### Valid Topics
- at 0.25-.55, get surgery and active surveillance topics
    - viz 1 == surgery
    - viz 4 == vaguely radiation
    - viz 2 == vaguely surveillance
- at 0.2-0.6
    - viz 1 == treatment
    - viz 2 == kinda surveillance
    - viz 5 == kinda surveillance
    - viz 6 == kinda surveillance
- same filters, at 20 topics
    - viz 1 == treatment == topic 1
    - viz 2 == active surveillance == 18
    - viz 3 == surgery
    - viz 4 == active surveillance
    - viz 5 == active surveillance
    - --> use the top two

Given that we're not getting very strong topics (especially no strong radiation topic), we might just have to do treatment/no treatment topics.

In [251]:
# Topic ids picked by eye from the VA visualization (treatment, active
# surveillance). Only needed by the optional filter noted below.
weLikeThese = [1,18]
def format_topics_sentences(ldamodel, corpus, texts):
    """Return one row per document describing its dominant topic.

    Parameters:
    ----------
    ldamodel : Topic model; ``ldamodel[corpus]`` must yield, per document, a
        sequence whose first element is a list of ``(topic_id, proportion)``
        pairs, and ``ldamodel.show_topic(topic_id)`` must return
        ``(word, weight)`` pairs
    corpus : Corpus to run through ``ldamodel``
    texts : Per-document contents, appended as the last column

    Returns:
    -------
    sent_topics_df : DataFrame with columns ``Dominant_Topic`` (int),
        ``Perc_Contribution`` (rounded to 4 places), ``Topic_Keywords``
        (comma-joined words of the topic) and an unnamed column (label ``0``)
        holding ``texts``
    """
    # Collect plain tuples and build the frame once: DataFrame.append copies
    # the whole frame on every call and no longer exists in pandas 2.
    records = []
    for row in ldamodel[corpus]:
        doc_topics = row[0]
        # To restrict the choice to the curated topics, filter here:
        #   doc_topics = [t for t in doc_topics if t[0] in weLikeThese]
        if not doc_topics:
            # No topic pairs for this document, so there is no dominant topic.
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant-topic table for the VA model over the lemmatized VA transcripts.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_va, corpus=corp_va, texts=data_lemmatized_va)

# Format
# reset_index() turns the row index into a leading column, giving the five
# columns renamed on the next line.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(100)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.5267,"procedure, seed, basically, tissue, grade, rob...","[thank, come, know, move, appointment, upbut, ..."
1,1,8.0,0.5508,"ankle, prostatectomy, scheduler, term, symptom...","[study, staffsetting, recorder, record, turn, ..."
2,2,6.0,0.3953,"cure, open, sit, heart, robotic, cut, basicall...","[thank, kind, talk, last, time, main, issue, u..."
3,3,6.0,0.8695,"cure, open, sit, heart, robotic, cut, basicall...","[loud, pull, pathology_report, pathology_repor..."
4,4,18.0,0.7182,"term, surveillance, grade, die, medication, sy...","[interviewee, biopsy, christma, holiday, resul..."
5,5,1.0,0.4503,"procedure, seed, basically, tissue, grade, rob...","[multitalent, pull, computer, want, review, re..."
6,6,6.0,0.8071,"cure, open, sit, heart, robotic, cut, basicall...","[pull, pathology, pathology, prostate, biopsy,..."
7,7,6.0,0.7971,"cure, open, sit, heart, robotic, cut, basicall...","[shake, thank, participate, study, recover, bi..."
8,8,1.0,0.2878,"procedure, seed, basically, tissue, grade, rob...","[participant, start, question, start, happen, ..."
9,9,6.0,0.4046,"cure, open, sit, heart, robotic, cut, basicall...","[loud, participant, thank, come, back, biopsy,..."


In [252]:
# Which topic ids ever come out as a VA document's dominant topic.
df_dominant_topic.Dominant_Topic.unique()

array([ 1.,  8.,  6., 18.,  5., 17., 10., 14.])

In [253]:
# Per-document model output for the VA corpus; the next cell reads the first
# element of each entry as (topic id, weight) pairs.
distributions_va = lda_model_va[corp_va]

In [255]:
# corp_va is the VA corpus that remains after filtering out null
# txgot_binary and Convo_1 values; one weight slot per document and topic.
va_length = len(corp_va)
topic1, topic18 = ([0] * va_length for _ in range(2))

# Topic id -> list receiving that topic's weight for every document.
# Documents in which a topic does not appear keep the initial 0.
tracked_va = {1: topic1, 18: topic18}

# store the topic percentage values for the relevant topics
for doc_idx, doc_result in enumerate(distributions_va):
    for topic_id, weight in doc_result[0]:
        target = tracked_va.get(topic_id)
        if target is not None:
            target[doc_idx] = weight

va['treatment'] = topic1
va['active_surveillance_topic'] = topic18

va.head(10)

Unnamed: 0.1,Unnamed: 0,Tx3,Advice1,Anx11,Anx111,Anx112,Anx113,Anx12,Anx13,Anx51,...,txgot_binary,white,Convo_1,Convo_2,Doctor_1,Doctor_2,Convo_1_lemmatized,Convo_1_corp,treatment,active_surveillance_topic
0,0,2.0,SR,1.0,1.0,0.0,2.0,3.0,2.0,1.0,...,0.0,1.0,a003 legend md2 physician pt patient md2 so th...,,U,,"[thank, come, know, move, appointment, upbut, ...","[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2...",0.526678,0.281453
1,1,2.0,SR,0.0,0.0,2.0,2.0,2.0,1.0,1.0,...,0.0,1.0,a004 legend md2 physician oth study staffsetti...,,U,,"[study, staffsetting, recorder, record, turn, ...","[(1, 1), (2, 1), (3, 2), (8, 1), (10, 4), (14,...",0.07337,0.253829
2,2,,A,2.0,3.0,2.0,3.0,2.0,2.0,1.0,...,1.0,1.0,a014 legend md2 physician pt patient md2 thank...,,U,,"[thank, kind, talk, last, time, main, issue, u...","[(2, 1), (4, 1), (6, 1), (8, 2), (10, 4), (11,...",0.301053,0.190737
4,4,,SR,2.0,2.0,2.0,2.0,3.0,2.0,2.0,...,1.0,1.0,a016 clean loud legend md2 physician pt patien...,,U,,"[loud, pull, pathology_report, pathology_repor...","[(2, 2), (7, 3), (8, 1), (11, 4), (19, 1), (20...",0.014671,0.074687
5,5,,SR,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,a023 legend md2 physician pt patient md2 alrig...,,U,,"[interviewee, biopsy, christma, holiday, resul...","[(10, 1), (12, 1), (18, 3), (22, 2), (31, 1), ...",0.104722,0.718203
6,6,1.0,ASR,1.0,0.0,0.0,0.0,2.0,0.0,0.0,...,1.0,1.0,a024 legend md2 physician pt patient pt multit...,,U,,"[multitalent, pull, computer, want, review, re...","[(8, 1), (10, 1), (11, 4), (20, 1), (21, 1), (...",0.450327,0.225142
7,7,1.0,ASR,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,a025 legend md2 physician pt patient md2 okay ...,,U,,"[pull, pathology, pathology, prostate, biopsy,...","[(1, 1), (11, 5), (24, 1), (28, 3), (32, 1), (...",0.053698,0.100417
8,8,,S,0.0,2.0,3.0,1.0,2.0,1.0,0.0,...,0.0,1.0,a032 legend md2 physician pt patient md2 youre...,,U,,"[shake, thank, participate, study, recover, bi...","[(3, 1), (7, 3), (10, 1), (11, 3), (16, 3), (1...",0.021289,0.026883
9,9,,ASR,0.0,3.0,1.0,2.0,2.0,3.0,0.0,...,1.0,1.0,a041 legend md2 physician pt patient oth parti...,,U,,"[participant, start, question, start, happen, ...","[(1, 1), (3, 1), (4, 2), (7, 3), (8, 2), (10, ...",0.287755,0.18683
10,10,,S,3.0,2.0,2.0,1.0,3.0,3.0,3.0,...,0.0,1.0,a047 clean loud legend md2 physician pt patien...,,U,,"[loud, participant, thank, come, back, biopsy,...","[(0, 3), (2, 2), (3, 1), (7, 1), (8, 5), (9, 2...",0.145021,0.050031


In [256]:
# Write the VA frame, including the columns added in the cells above, to a
# CSV in the current working directory.
va.to_csv("va_topic_modeling_with_distributions.csv", header=True)