Tom Halloin <br> Springboard Data Science Career Track <br>

<h1 align="center">Capstone Project 2: Analysis of Berkshire Hathaway Shareholder Letters Using Natural Language Processing (NLP) Techniques</h1>

<h3 align='center'> Part 5: Topic Modelling</h3> <br>

In [1]:
# Adapted from https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [2]:
# You may need to install the following:
# !pip install gensim
# !pip install pyLDAvis

In [3]:
# import pandas as pd
# file = 'C:/Users/Tom/Documents/Berkshire/topic_modelling/sample_topics.csv'
# text = pd.read_csv(file)

# Prior versions stored every letter in one giant string of text.
# Each year's letter is now stored separately in a dictionary keyed by the
# year as a string (e.g. '1977') and holding that letter's full text.

letter_dictionary = dict()
for year in range(1977, 2019):
    file = f'C:/Users/Tom/Documents/Berkshire/pdf_files/clean_letters/{year}_letter.txt'
    with open(file=file) as f:
        # read() returns the raw text of the letter. The previous
        # str(f.readlines()) stored the *repr* of a list of lines, which put
        # brackets, quote characters and literal "\n" escape sequences into
        # the text that is later tokenised.
        letter_dictionary[str(year)] = f.read()

In [4]:
import spacy
# NOTE(review): the pipeline returned by this call is not assigned to any
# name, so it is not what tokenize() uses; confirm whether it is still needed.
spacy.load('en_core_web_sm')
from spacy.lang.en import English
# Module-level English() pipeline; tokenize() calls it on each text.
parser = English()
def tokenize(text):
    """Turn ``text`` into a list of normalised token strings.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens are replaced by ``'URL'``, tokens
    starting with ``'@'`` are replaced by ``'SCREEN_NAME'``, and every other
    token is returned in lower case.
    """
    def _normalise(tok):
        # Same precedence as the checks have always had: URL, then @-handle.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [5]:
# NLTK's WordNet corpus can be used to look up word meanings, synonyms,
# antonyms and more. In this notebook it is used to reduce a word to its base
# form: get_lemma() calls wn.morphy(), and get_lemma2() uses WordNetLemmatizer.

import nltk
# Fetch the WordNet corpus so that the `wn` lookups below have data available.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form ``wn.morphy`` gives for ``word``.

    When ``wn.morphy`` returns ``None``, ``word`` is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Build the set of English stopwords used to filter tokens.

# Fetch NLTK's stopword lists so the lookup on the next line has data.
nltk.download('stopwords')
# Stored as a set; prepare_text_for_lda() tests each token against it.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Preparing text for LDA topic modelling

def prepare_text_for_lda(text):
    """Tokenise ``text`` and return the cleaned tokens used for LDA.

    Tokens from ``tokenize`` are kept only when they are longer than four
    characters and are not in ``en_stop``; each surviving token is then
    passed through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [8]:
# Build the LDA training documents: exactly one token list per letter.
#
# The previous version appended the *whole* token list once for every token
# that passed a 10% random check, so each letter was duplicated a random
# number of times roughly proportional to its length. That made the corpus
# different on every run and over-weighted long letters.
import random  # no longer used in this cell; kept in case other cells rely on it
text_data = []

for year in letter_dictionary:
    tokens = prepare_text_for_lda(letter_dictionary[year])
    text_data.append(tokens)
        

In [9]:
# First, create a dictionary from the data, then convert the data to a bag-of-words corpus and save both the dictionary
# and the corpus for future use.

from gensim import corpora
# Build the gensim Dictionary from the tokenised documents in text_data.
dictionary = corpora.Dictionary(text_data)
# Convert every document to its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
# Use a context manager so the file handle is closed (and the data flushed)
# as soon as the dump finishes, instead of leaking an open handle.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [10]:
import gensim
NUM_TOPICS = 5
# LDA training is stochastic; a fixed seed makes the fitted topics (and the
# visualisation built from them) reproducible across kernel restarts.
RANDOM_SEED = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)
# The file name is kept as-is because the next cell loads 'model10.gensim'.
ldamodel.save('model10.gensim')
# TF-IDF model fitted on the same corpus and dictionary.
tfidfmodel = gensim.models.tfidfmodel.TfidfModel(corpus=corpus, id2word=dictionary)


In [11]:
# Reload the artefacts saved by the earlier cells.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Open the pickle in a context manager so the file handle is closed promptly.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model10.gensim')
import pyLDAvis.gensim
# sort_topics=False is passed so pyLDAvis does not re-order the topics.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Must stay the last expression in the cell so the notebook renders it inline.
pyLDAvis.display(lda_display)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [12]:
# Serves the visualisation from a local web server and opens it in a browser.
# This call blocks: the kernel must be interrupted to end it, and pyLDAvis
# itself recommends pyLDAvis.display() or enable_notebook() inside a notebook.
pyLDAvis.show(lda_display, ip='127.0.0.1', port=8888, n_retries=50, local=True, open_browser=True, http_server=None)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8891/    [Ctrl-C to exit]


127.0.0.1 - - [19/Jan/2020 15:59:17] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [19/Jan/2020 15:59:17] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [19/Jan/2020 15:59:17] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [19/Jan/2020 15:59:17] "GET /LDAvis.js HTTP/1.1" 200 -
127.0.0.1 - - [19/Jan/2020 15:59:17] code 404, message Not Found
127.0.0.1 - - [19/Jan/2020 15:59:17] "GET /favicon.ico HTTP/1.1" 404 -



stopping Server...
