In [191]:
import docx
import os

Loading the data: getting full text from all the .docx interview files

In [192]:
#function to get all text from a file 
def getText(filename):
    doc = docx.Document(filename)
    fullText = ""
    for para in doc.paragraphs: 
        fullText = fullText + para.text + " "
    return fullText

In [193]:
#change working directory
interview_folder = "C:\\Users\\605665\\Documents\\Student2Student\\Interview Notes"
os.chdir(interview_folder)
os.getcwd()

'C:\\Users\\605665\\Documents\\Student2Student\\Interview Notes'

In [202]:
allText = [] #list for all the interview text 
# function to loop through all the interview notes documents 
for filename in os.listdir(interview_folder):
    if '.docx' in filename:
        allText.append(getText(filename))

In [203]:
len(allText)

38

Pre-processing:
- Tokenization: split text, lowercase, remove puncutation 
- removing words with fewer than 3 characters
- removing stopwords 
- lemmatized: grouping different inflections of words together 
- stemmed: words reduced to root form 

In [115]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\605665\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [204]:
#lemmatize example
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [205]:
#create a new instance of an english stemmer
stemmer = SnowballStemmer('english')

In [206]:
# lemmatization and stemming
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [207]:
#testing preprocessing on a doc 
doc_sample = allText[0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['As', 'a', 'family,', 'we', 'moved', 'from', 'Omaha', 'to', 'Ft.', 'Meade', '(2004),', 'Ft.', 'Meade', 'to', 'Ft.', 'Bragg', '(2007),', 'and', 'Ft.', 'Bragg', 'to', 'Joint', 'Base', 'San', 'Antonio', 'Randolph', '(2012)', '', '', 'Our', 'first', 'family', 'move,', 'my', 'kids', 'were', '2', 'yrs', 'old', 'and', '2', 'months', 'old;', 'respectively.', '', 'My', 'wife', 'and', 'I', 'were', '28.', '', '', '', 'For', 'the', 'first', 'move,', 'we', 'really', 'relied', 'on', 'sponsor', 'packages', 'and', 'any', 'material', 'we', 'could', 'find;', 'internet,', 'family', 'support', 'center,', 'etc.', '', 'We', 'purposely', 'stayed', 'on', 'base', 'because', 'we', 'had', 'no', 'idea', 'what', 'to', 'expect.', '', 'For', 'subsequent', 'moves,', 'the', 'internet', 'was', 'helpful,', 'but', 'we', 'also', 'relied', 'very', 'heavily', 'on', 'our', 'friend', 'networks', 'to', 'get', 'ground', 'truth', 'on', 'school', 'districts,', 'doctors,', 'neighborhoods,', 'commute', 'times,'

Bag of Words: create a dictionary for the number of times a word appears

In [208]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [209]:
count = 0 
for k, v in dictionary.iteritems():
    print(k,v)
    count += 1
    if count > 10: 
        break 

0 academ
1 actual
2 anxieti
3 area
4 autist
5 awar
6 brother
7 build
8 care
9 child
10 china


In [210]:
#gensim filter_extremes
#filter parameters:
no_below = 5  #absolute number
no_above = 0.4 # fraction of total corpus size 
dictionary.filter_extremes(no_below, no_above, keep_n=100000)

In [211]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[23]

[(4, 1),
 (37, 1),
 (40, 1),
 (43, 1),
 (51, 1),
 (52, 3),
 (58, 1),
 (65, 1),
 (66, 2),
 (68, 1),
 (73, 1),
 (76, 1),
 (79, 2),
 (80, 1),
 (83, 1),
 (85, 1),
 (86, 2),
 (93, 2),
 (104, 2),
 (107, 2),
 (110, 1),
 (127, 2),
 (139, 1),
 (140, 1),
 (146, 1),
 (150, 1),
 (156, 1),
 (158, 1),
 (167, 1),
 (182, 2),
 (187, 1),
 (195, 1),
 (197, 1),
 (205, 3),
 (234, 1),
 (235, 1),
 (236, 1),
 (255, 1),
 (264, 1),
 (267, 1),
 (276, 1),
 (283, 3),
 (284, 2),
 (287, 3),
 (294, 3),
 (307, 1),
 (311, 1),
 (316, 2),
 (323, 1),
 (333, 1),
 (350, 1),
 (365, 1),
 (386, 1)]

In [213]:
len(bow_corpus)

64

In [125]:
bow_doc_23 = bow_corpus[23]

for i in range(len(bow_doc_23)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_23[i][0], 
                                                     dictionary[bow_doc_23[i][0]], 
                                                     bow_doc_23[i][1]))

Word 4 ("build") appears 1 time.
Word 37 ("children") appears 1 time.
Word 40 ("close") appears 1 time.
Word 43 ("depend") appears 1 time.
Word 51 ("idea") appears 1 time.
Word 52 ("incom") appears 3 time.
Word 58 ("offer") appears 1 time.
Word 65 ("provid") appears 1 time.
Word 66 ("question") appears 2 time.
Word 68 ("relat") appears 1 time.
Word 73 ("travel") appears 1 time.
Word 76 ("wife") appears 1 time.
Word 79 ("call") appears 2 time.
Word 80 ("counselor") appears 1 time.
Word 83 ("find") appears 1 time.
Word 85 ("guidanc") appears 1 time.
Word 86 ("issu") appears 2 time.
Word 93 ("open") appears 2 time.
Word 104 ("allen") appears 2 time.
Word 107 ("booz") appears 2 time.
Word 110 ("choos") appears 1 time.
Word 127 ("right") appears 2 time.
Word 139 ("convers") appears 1 time.
Word 140 ("day") appears 1 time.
Word 146 ("involv") appears 1 time.
Word 150 ("month") appears 1 time.
Word 156 ("similar") appears 1 time.
Word 158 ("tell") appears 1 time.
Word 167 ("address") appears 

Tf-idf model

In [126]:
from gensim import corpora, models 

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint 

for doc in corpus_tfidf: 
    pprint(doc)
    break

[(0, 0.19404372710131615),
 (1, 0.19404372710131615),
 (2, 0.156207625816503),
 (3, 0.17322101797112616),
 (4, 0.19929175371604044),
 (5, 0.22088886818360837),
 (6, 0.09152638344939759),
 (7, 0.14182300771143277),
 (8, 0.22088886818360837),
 (9, 0.10853977560402078),
 (10, 0.14182300771143277),
 (11, 0.17322101797112616),
 (12, 0.14182300771143277),
 (13, 0.14182300771143277),
 (14, 0.19404372710131615),
 (15, 0.18305276689879518),
 (16, 0.312415251633006),
 (17, 0.09152638344939759),
 (18, 0.11837152453168984),
 (19, 0.09964587685802022),
 (20, 0.19404372710131615),
 (21, 0.12936248473421075),
 (22, 0.21707955120804157),
 (23, 0.19404372710131615),
 (24, 0.10853977560402078),
 (25, 0.10853977560402078),
 (26, 0.156207625816503),
 (27, 0.14182300771143277),
 (28, 0.17322101797112616),
 (29, 0.11837152453168984),
 (30, 0.17322101797112616),
 (31, 0.3464420359422523)]


Running LDA with Bag of Words

In [127]:
n = 5
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = n, 
                                       id2word=dictionary, passes=3, workers=2,
                                      per_word_topics=True)

Below are the identified topics from the data. I set the algorithm to choose the five most prevalent topics from all the interview text. You'll see that each  topic lists the word identified with each topic. The first topic, for instance, contains words like feel, understand, home, brother, and life. We can assume that this topic relates to Family and Emotion.  

In [128]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.012*"feel" + 0.012*"look" + 0.011*"understand" + 0.011*"home" + 0.011*"summer" + 0.011*"brother" + 0.010*"civilian" + 0.009*"daughter" + 0.009*"post" + 0.009*"life"
Topic: 1 
Words: 0.021*"armi" + 0.019*"train" + 0.016*"provid" + 0.014*"liaison" + 0.013*"transfer" + 0.013*"command" + 0.013*"state" + 0.012*"youth" + 0.011*"issu" + 0.011*"counti"
Topic: 2 
Words: 0.021*"right" + 0.021*"organ" + 0.018*"instal" + 0.018*"typic" + 0.018*"level" + 0.016*"guidanc" + 0.015*"command" + 0.015*"type" + 0.015*"look" + 0.013*"district"
Topic: 3 
Words: 0.019*"sponsor" + 0.014*"youth" + 0.014*"specif" + 0.014*"life" + 0.012*"packet" + 0.012*"anchor" + 0.012*"send" + 0.012*"reach" + 0.011*"train" + 0.011*"involv"
Topic: 4 
Words: 0.016*"navi" + 0.014*"older" + 0.013*"relationship" + 0.012*"network" + 0.012*"hard" + 0.012*"club" + 0.012*"involv" + 0.011*"diego" + 0.011*"middl" + 0.010*"build"


Topic 1: Family and emotion 

Topic 2: Military and command involvement in transitions

Topic 3: School and organizational support 

Topic 4: Youth sponsorship and support 

Topic 5: Relationships and social networks 

LDAvis

In [157]:
lda_model.save('iview_topics.model')

In [158]:
lda = gensim.models.ldamodel.LdaModel.load('iview_topics.model')

In [132]:
import pyLDAvis
import pyLDAvis.gensim

In [163]:
pyLDAvis.enable_notebook()

In [161]:
vis_data = pyLDAvis.gensim.prepare(lda, bow_corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


Below you'll see the first of the visualizations for the topic modeling. Each bubble represents a different topic. The size of each bubble represents marginal topic distribution - that is, how much of the text falls into these topics. Overlap between topics means they share common words. 2 and 3 are similar topics because there's an overlap of terms.

You should be able to mouse over each topic and see how different terms fall within them. If you go to the right and click on a word, it will show you how particular it is to a certain topic. This takes into account how often the term is grouped with that topic relative to the entire dataset of text. 



In [162]:
pyLDAvis.display(vis_data)

In [214]:
# everything below is still in the works...

In [170]:
#compute coherence score
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.2996651353484938


LDA Mallet Model

In [179]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_docs]

In [183]:
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    import gensim

    if not gensim.matutils.ismatrix(corpus):
        corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = gensim.matutils.Sparse2Corpus(corpus_csc)

    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary),\
        'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus),\
        'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[1] == num_topics,\
        'Document topics and number of topics do not match {} != {}'.format(
        doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]

    return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}


In [187]:
topic_model_data = _extract_data(lda_model, bow_corpus, dictionary, doc_topic_dists=None)

In [190]:
len(topic_model_data['doc_topic_dists'])
#sum ting wong

64