In [1]:
# Notebook setup: imports, NLTK resource downloads and logging configuration.
import nltk
# Fetch the NLTK resources at start-up; the captured output shows each call
# reports "already up-to-date" when the package is present locally.
nltk.download('wordnet')
nltk.download('words')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
nltk.download('maxent_ne_chunker')
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
from sklearn.metrics import pairwise_distances

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models, similarities, matutils

import logging
# Emit timestamped INFO-level log records.
# NOTE(review): presumably enabled so gensim's LDA training progress is visible — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/darienpmt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/darienpmt/nltk_data...
[nltk_data]   Package words is already up-to-date!

Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/metis/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/darienpmt/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/darienpmt/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Pickled matrix fed to the count-based LSA and NMF models below; its columns
# are used as the term labels and its index as the document labels.
# NOTE(review): file name suggests lemmatized n-gram counts — confirm against the notebook that wrote it.
dtm2_lem = pd.read_pickle('./pickles/dtm2_lemmatized.pkl')

# Topic modeling with Count Vectorizer

## Latent Semantic Analysis (LSA)

### Function to display topics

In [3]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Print the highest-weighted terms of every topic of a fitted model.

    model          -- object exposing `components_`, one row of term weights per topic
    feature_names  -- term labels, indexable by column position
    no_top_words   -- how many terms to list per topic
    topic_names    -- optional per-topic labels; a missing or empty label
                      falls back to the 1-based topic number
    """
    for position, weights in enumerate(model.components_):
        label = topic_names[position] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", position + 1)
        # Column positions ordered from heaviest to lightest weight.
        ranked = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[idx] for idx in ranked))

In [4]:
# Five-component truncated SVD on the count matrix; doc_topic holds the
# per-document component scores returned by the fit.
lsa = TruncatedSVD(n_components=5)
doc_topic = lsa.fit_transform(X=dtm2_lem)


In [None]:
# Component-by-term loadings of the LSA model, rounded to 3 decimals and
# labelled with the matrix's term columns.
topic_word = pd.DataFrame(
    data=lsa.components_.round(3),
    index=[f"component_{k}" for k in range(1, 6)],
    columns=dtm2_lem.columns,
)


In [6]:
# Top five terms for each LSA component.
display_topics(model=lsa, feature_names=dtm2_lem.columns, no_top_words=5)


Topic  1
beloved wife, feeling fine, happy birthday, looking forward, philippine island

Topic  2
philippine island, looking forward, mother aunt, war end, aunt uncle

Topic  3
happy birthday, mother aunt, mother aunt uncle, aunt uncle, gentle breeze

Topic  4
mr taylor, great deal, gentle breeze, play golf, went visit

Topic  5
looking forward, far away, early morning, far away danger, away danger


In [None]:
# Wrap the document scores in a frame: one row per document of dtm2_lem,
# one column per component, rounded to 5 decimals.
doc_topic = pd.DataFrame(
    data=doc_topic.round(5),
    index=dtm2_lem.index,
    columns=[f"component_{k}" for k in range(1, 6)],
)


## Non-Negative Matrix Factorization (NMF)

In [8]:
# Five-component NMF on the count matrix; doc_topic is rebound to the
# per-document weights returned by the fit.
nmf_model = NMF(n_components=5)
doc_topic = nmf_model.fit_transform(X=dtm2_lem)

In [9]:
# Component-by-term weights of the NMF model, rounded to 3 decimals.
topic_word = pd.DataFrame(
    data=nmf_model.components_.round(3),
    index=[f"component_{k}" for k in range(1, 6)],
    columns=dtm2_lem.columns,
)


In [10]:
# Top five terms for each NMF component.
display_topics(model=nmf_model, feature_names=dtm2_lem.columns, no_top_words=5)


Topic  1
beloved wife, happy birthday, little bit, feeling fine, special kiss

Topic  2
philippine island, overseas duty, war europe, discharged army, end war

Topic  3
looking forward, money order, united state, came overseas, war end

Topic  4
great deal, mr taylor, play golf, feeling fine, far away

Topic  5
mother aunt, aunt uncle, mother aunt uncle, gentle breeze, breeze blowing


## Topic Modeling with LDA

In [11]:
# Frame of letters; its `Lemmatized_letter` column is the text the
# vectorizers below are fitted on.
df_lem = pd.read_pickle('./pickles/lemmatized_df.pkl')

### Bring in custom stop words

In [12]:
# Custom stop-word list passed to every vectorizer below.
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# produced by this project.
with open('./pickles/stop_words_list.pkl', 'rb') as stop_words_file:
    stop_words = pickle.load(stop_words_file)

In [None]:

# LDA on 2- and 3-gram counts of the lemmatized letters.
count_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words=stop_words,
                                   min_df=3, max_df=0.7, token_pattern="\\b[a-z][a-z]+\\b")

# Learn the vocabulary and vectorize in one pass, then transpose so every
# column is one document, which is the orientation Sparse2Corpus reads by default.
doc_word = count_vectorizer.fit_transform(df_lem.Lemmatized_letter).transpose()

corpus = matutils.Sparse2Corpus(doc_word)

# gensim needs column id -> term; sklearn's vocabulary_ maps term -> column id.
id2word = {col_id: term for term, col_id in count_vectorizer.vocabulary_.items()}

# Seeded so the printed topics are reproducible across kernel restarts.
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=5,
                      random_state=0)
lda.print_topics()

# Topic modeling with TF-IDF

In [14]:
# Pickled matrix fed to the TF-IDF LSA and NMF models below; its columns are
# used as the term labels.
# NOTE(review): file name suggests TF-IDF weights over lemmatized n-grams — confirm.
df_tfidf2_lem = pd.read_pickle('./pickles/tfidf2_lemmatized.pkl')

## LSA

In [15]:
# Five-component truncated SVD on the TF-IDF matrix.
lsa = TruncatedSVD(n_components=5)
doc_topic = lsa.fit_transform(X=df_tfidf2_lem)

In [16]:
# Component-by-term loadings of the TF-IDF LSA model, rounded to 3 decimals.
topic_word = pd.DataFrame(
    data=lsa.components_.round(3),
    index=[f"component_{k}" for k in range(1, 6)],
    columns=df_tfidf2_lem.columns,
)


In [17]:
# Top five terms for each LSA component of the TF-IDF matrix.
display_topics(model=lsa, feature_names=df_tfidf2_lem.columns, no_top_words=5)


Topic  1
beloved wife, looking forward, philippine island, feeling fine, far away

Topic  2
western union telegram, western union, union telegram, san francisco, mr tontar

Topic  3
beloved wife, wife oh, battalion surgeon, wrote wrote, feeling fine

Topic  4
looking forward, beloved wife, pursuit happiness, came overseas, duty ask

Topic  5
early morning, looking forward, far away, happiest man, send money


## NMF

In [18]:
# Five-component NMF on the TF-IDF matrix.
nmf_model = NMF(n_components=5)
doc_topic = nmf_model.fit_transform(X=df_tfidf2_lem)

In [19]:
# Component-by-term weights of the TF-IDF NMF model, rounded to 3 decimals.
topic_word = pd.DataFrame(
    data=nmf_model.components_.round(3),
    index=[f"component_{k}" for k in range(1, 6)],
    columns=df_tfidf2_lem.columns,
)


In [20]:
# Top five terms for each NMF component of the TF-IDF matrix.
display_topics(model=nmf_model, feature_names=df_tfidf2_lem.columns, no_top_words=5)


Topic  1
feeling fine, mother aunt, breeze blowing, far away, aunt uncle

Topic  2
western union, western union telegram, union telegram, san francisco, mr tontar

Topic  3
beloved wife, duty ask, wife oh, adore beloved, adore beloved wife

Topic  4
looking forward, early morning, wife remember, wake morning, felt bad

Topic  5
philippine island, medical officer, near future, sky high, sorry disappoint


## LDA

In [None]:
# LDA on TF-IDF weights of 2- and 3-grams of the lemmatized letters.
# NOTE(review): LDA is usually fitted on raw counts; feeding TF-IDF weights
# (with binary=True) is unusual — confirm this is intended.
tfidf2 = TfidfVectorizer(ngram_range=(2,3), binary=True, stop_words=stop_words, min_df=3, max_df=0.7)

# Learn the vocabulary and vectorize in one pass, then transpose so every
# column is one document, which is the orientation Sparse2Corpus reads by default.
doc_word = tfidf2.fit_transform(df_lem.Lemmatized_letter).transpose()

corpus = matutils.Sparse2Corpus(doc_word)

# gensim needs column id -> term; sklearn's vocabulary_ maps term -> column id.
id2word = {col_id: term for term, col_id in tfidf2.vocabulary_.items()}

# Create lda model (equivalent to "fit" in sklearn); seeded so the printed
# topics are reproducible across kernel restarts.
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=10,
                      random_state=0)
lda.print_topics()

# Topic modeling with Nouns and Nouns/Adjectives only

## Nouns

In [22]:
# Frame whose `Lemmatized_letter` column feeds the nouns-only LDA vectorizer below.
# NOTE(review): presumably the letters reduced to nouns — confirm against the notebook that wrote it.
df_nouns_lem = pd.read_pickle('./pickles/nouns_df_lemmatize.pkl')

### LSA

In [23]:
# Matrix fed to the nouns-only LSA and NMF models below.
# NOTE(review): the variable says "tfidf1" but the file loaded is "tfidf2_..." —
# confirm which n-gram setting is intended.
df_tfidf1_nouns = pd.read_pickle('./pickles/tfidf2_nouns_lemmatized.pkl')

In [24]:
# Five-component truncated SVD on the nouns matrix.
lsa = TruncatedSVD(n_components=5)
doc_topic = lsa.fit_transform(X=df_tfidf1_nouns)

In [25]:
# Top five terms for each LSA component of the nouns matrix.
display_topics(model=lsa, feature_names=df_tfidf1_nouns.columns, no_top_words=5)


Topic  1
wife, feeling, happy, tell, war

Topic  2
island, jap, sea, ship, philippine

Topic  3
western, union, tontar, sea, telegram

Topic  4
western, union, tontar, telegram, francisco

Topic  5
birthday, native, tontar, weight, coconut


### NMF

In [26]:
# Five-component NMF on the nouns matrix.
nmf_model = NMF(n_components=5)
doc_topic = nmf_model.fit_transform(X=df_tfidf1_nouns)

In [27]:
# Top five terms for each NMF component of the nouns matrix.
display_topics(model=nmf_model, feature_names=df_tfidf1_nouns.columns, no_top_words=5)


Topic  1
christmas, wife, money, send, golf

Topic  2
sea, ship, island, land, hill

Topic  3
western, union, tontar, telegram, francisco

Topic  4
war, overseas, expect, hospital, end

Topic  5
birthday, native, feeling, work, happy


### LDA

In [None]:
# LDA on TF-IDF weights of the nouns-only letters.
# NOTE(review): LDA is usually fitted on raw counts; feeding TF-IDF weights is
# unusual — confirm this is intended.
tfidf1 = TfidfVectorizer(stop_words=stop_words, min_df=3, max_df=0.7)

# Learn the vocabulary and vectorize in one pass, then transpose so every
# column is one document, which is the orientation Sparse2Corpus reads by default.
doc_word = tfidf1.fit_transform(df_nouns_lem.Lemmatized_letter).transpose()

corpus = matutils.Sparse2Corpus(doc_word)

# gensim needs column id -> term; sklearn's vocabulary_ maps term -> column id.
id2word = {col_id: term for term, col_id in tfidf1.vocabulary_.items()}

# Seeded so the printed topics are reproducible across kernel restarts.
lda = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=10,
                      random_state=0)

lda.print_topics()

## Nouns and Adjectives

In [29]:
# Frame whose `Lemmatized_letter` column feeds the nouns/adjectives LDA vectorizer further down.
# NOTE(review): presumably the letters reduced to nouns and adjectives — confirm.
df_nouns_adj_lem = pd.read_pickle('./pickles/nouns_adj__df_lemmatize.pkl')

### LSA

In [30]:
# Matrix fed to the nouns/adjectives LSA and NMF models below; its index is
# later parsed as dates for the time-series view.
df_tfidf2_nouns_adj = pd.read_pickle('./pickles/tfidf2_nouns_adj_lemmatized.pkl')

In [31]:
# Five-component truncated SVD on the nouns/adjectives matrix.
lsa = TruncatedSVD(n_components=5)
doc_topic = lsa.fit_transform(X=df_tfidf2_nouns_adj)

In [32]:
# Top five terms for each LSA component of the nouns/adjectives matrix.
display_topics(model=lsa, feature_names=df_tfidf2_nouns_adj.columns, no_top_words=5)


Topic  1
wife, feeling, happy, tell, war

Topic  2
western union telegram, western union, union telegram, western, union

Topic  3
western union telegram, union telegram, western union, western, union

Topic  4
war, hospital, medical, money, expect

Topic  5
birthday, native, happy birthday, coconut, rat


This would be a fairly good result if it weren't for the overlap between topics 2 and 3, which are both dominated by the same "western union telegram" terms. Oh well.

### NMF

In [82]:
# Six-component NMF on the nouns/adjectives matrix, seeded for repeatable
# topics; this is the model interpreted and plotted below.
nmf_model = NMF(n_components=6, random_state=0)
doc_topic = nmf_model.fit_transform(X=df_tfidf2_nouns_adj)

In [83]:
# Top ten terms for each of the six NMF components.
display_topics(model=nmf_model, feature_names=df_tfidf2_nouns_adj.columns, no_top_words=10)


Topic  1
war, hospital, expect, medical, overseas, end, news, duty, officer, change

Topic  2
christmas, picture, morning, went, dinner, cold, golf, played, people, warm

Topic  3
union telegram, western union, western union telegram, western, union, tontar, telegram, francisco, san francisco, san

Topic  4
ship, sea, land, typhoon, calm, tonight, kure, morning, bay, island

Topic  5
jap, island, native, philippine, rain, philippine island, enemy, campaign, wet, landed

Topic  6
birthday, wife, beloved, send, happy, beloved wife, kiss, remember, feeling, able


## Human interpretation of topics

I think these are my clearest topics. After much thought, here is how I interpret them:

1. Duty
2. Comfort/Normalization
3. Money
4. Traveling at sea  
5. Inner Conflict/Struggle
6. Romance

In [84]:
# Component-by-term weights of the six-topic NMF model, rounded to 5 decimals.
topic_word = pd.DataFrame(
    data=nmf_model.components_.round(5),
    index=[f"component_{k}" for k in range(1, 7)],
    columns=df_tfidf2_nouns_adj.columns,
)

In [85]:
# Wrap the per-document NMF weights in a frame: one row per letter, one column
# per component, rounded to 5 decimals.
# np.asarray takes the values positionally, so re-running this cell after the
# index has been converted to datetimes rebuilds the same frame instead of
# realigning on mismatched labels and producing NaN.
doc_topic = pd.DataFrame(np.asarray(doc_topic).round(5),
             index = df_tfidf2_nouns_adj.index,
             columns = ["component_1","component_2", "component_3","component_4", "component_5",
                       "component_6"])


In [86]:
# Parse the index labels as datetimes so the frame can be resampled by time below.
doc_topic.index = pd.to_datetime(doc_topic.index)

## Plotting topics over time

In [87]:
# Average each topic's weight over 3-month bins.
doc_topic_qrts = doc_topic.resample('3M').mean()

# NOTE: captured before the columns are renamed in the next cell, so `topics`
# holds the original "component_N" labels.
topics = doc_topic_qrts.columns

In [90]:
# Replace the component_1..component_6 labels, in order, with the
# human-readable topic interpretations.
# NOTE: assigns exactly six labels, so this fails if run again after a column
# has been dropped.
doc_topic_qrts.columns = ['Duty', 'Comfort& Normalization', 'Money' , 'Traveling at Sea',
                          'Inner Conflict & Struggle', 'Romance']

In [92]:
# Exclude the 'Money' topic from the time-series view.
# Rebinding with errors='ignore' keeps the cell re-runnable: the in-place drop
# raised KeyError when run a second time because the column was already gone.
doc_topic_qrts = doc_topic_qrts.drop(columns='Money', errors='ignore')

In [94]:
# Save the 3-month topic averages to CSV for the Tableau visualization.
# The file is written to the current working directory.

doc_topic_qrts.to_csv('Letter_topics.csv')

In [None]:
# Line chart of each topic's 3-month average weight over time.
fig, ax = plt.subplots(figsize=(16,8))

# Iterate over the frame's current columns: `topics` was captured before the
# columns were renamed and 'Money' was dropped, so looking up those stale
# "component_N" labels raises KeyError and mislabels the legend.
for topic in doc_topic_qrts.columns:
    ax.plot(doc_topic_qrts[topic], label=topic)

ax.set_title('Topics seen though Time', fontsize=20, weight='bold', pad=20)
ax.set_xlabel('Date', fontsize=15, weight='bold',labelpad=10)
ax.set_ylabel('Prevelence of Topic', fontsize=15,weight='bold',labelpad=10)
# The legend picks up the label attached to each line above.
ax.legend();


In [None]:
# Date of the letter that is most representative of a topic: order the
# component_4 weights from largest to smallest and keep the first row.

doc_topic['component_4'].sort_values(ascending=False).iloc[:1]

## Trying a basic recommender system (not sure if this makes sense here)

In [42]:
# Letters reduced to nouns/adjectives (same pickle as df_nouns_adj_lem).
df_nouns_adj = pd.read_pickle('./pickles/nouns_adj__df_lemmatize.pkl')

# Vectorizer used to project free-text queries into the TF-IDF space.
tfidf2 = TfidfVectorizer(ngram_range=(1,3), stop_words=stop_words, min_df=3, max_df=0.7)

# Fit on the nouns/adjectives text loaded above. The original fitted on
# df_lem, leaving df_nouns_adj unused and producing features that do not
# correspond to the nouns/adjectives matrix the NMF model was trained on.
# NOTE(review): assumes the pickled tfidf2_nouns_adj matrix was built with
# these same vectorizer settings — confirm so the vocabularies line up.
tfidf2_nouns_adj = tfidf2.fit_transform(df_nouns_adj.Lemmatized_letter)

In [43]:
# Free-text query to match against the letters.
t = ["the war is over"]

# Vectorize the query with the vocabulary tfidf2 has already been fitted on.
vt = tfidf2.transform(t)

In [44]:
# Project the vectorized query into the topic space of whichever model
# `nmf_model` currently refers to.
# NOTE(review): requires vt to have the same feature columns the model was
# fitted on — verify the vectorizer vocabulary matches.
tt = nmf_model.transform(vt)

In [None]:
# Row positions of the letters ordered by cosine distance to the query in
# topic space, nearest first.
pairwise_distances(X=tt, Y=doc_topic, metric='cosine').argsort()

In [46]:
# Look up the index label (letter date) at row position 268.
# NOTE(review): 268 is hard-coded, presumably copied from the argsort output of
# the previous cell — it goes stale whenever the model or data change; confirm.
doc_topic.index[268]

Timestamp('1945-07-31 00:00:00')

### LDA

In [None]:
# LDA on TF-IDF weights of 2- and 3-grams of the nouns/adjectives letters.
# NOTE: rebinds `tfidf2`, replacing the vectorizer used by the recommender cells.
# NOTE(review): LDA is usually fitted on raw counts; feeding TF-IDF weights
# (with binary=True) is unusual — confirm this is intended.
tfidf2 = TfidfVectorizer(ngram_range=(2,3), binary=True, stop_words=stop_words, min_df=3, max_df=0.7)

# Learn the vocabulary and vectorize in one pass, then transpose so every
# column is one document, which is the orientation Sparse2Corpus reads by default.
doc_word = tfidf2.fit_transform(df_nouns_adj_lem.Lemmatized_letter).transpose()

corpus = matutils.Sparse2Corpus(doc_word)

# gensim needs column id -> term; sklearn's vocabulary_ maps term -> column id.
id2word = {col_id: term for term, col_id in tfidf2.vocabulary_.items()}

# Seeded so the printed topics are reproducible across kernel restarts.
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=10,
                      random_state=0)

lda.print_topics()

LDA probably does not work as well here because the documents are rather short. It makes sense that NMF (or LSA) would give clearer results.

# Topic modeling corpus by location

In [48]:
# Location labels, in the same order as the matrices collected in `locations`
# (the two lists are zipped together in the loop below).
unq_locations = ['Australia', 'Southwest Pacific', 'New Guinea',
       'Philippine Islands', 'Zamboanga', 'Japan']

In [49]:
# One pickled matrix per location; each file name embeds the location label.
# NOTE(review): presumably TF-IDF matrices built from the letters written at
# each location — confirm against the notebook that wrote them.
tfidf_aus = pd.read_pickle('./pickles/tfidf2_Australia.pkl')

tfidf_swp = pd.read_pickle('./pickles/tfidf2_Southwest Pacific.pkl')

tfidf_ng = pd.read_pickle('./pickles/tfidf2_New Guinea.pkl')

tfidf_pi = pd.read_pickle('./pickles/tfidf2_Philippine Islands.pkl')

tfidf_zam = pd.read_pickle('./pickles/tfidf2_Zamboanga.pkl')

tfidf_japan = pd.read_pickle('./pickles/tfidf2_Japan.pkl')

In [50]:
# Per-location matrices, listed in the same order as the labels in unq_locations.
locations = [tfidf_aus, tfidf_swp, tfidf_ng, tfidf_pi, tfidf_zam, tfidf_japan]

In [51]:
# Fit a separate four-topic NMF per location and print its top five terms.
# Each pass rebinds the notebook-level `nmf_model` and `doc_topic`, so after
# the loop they refer to the last location only.
for name, tfidf in zip(unq_locations, locations):

    nmf_model = NMF(n_components=4)
    doc_topic = nmf_model.fit_transform(X=tfidf)

    print(name)
    display_topics(model=nmf_model, feature_names=tfidf.columns, no_top_words=5)
    print('\n')

Australia

Topic  1
wife, birthday, golf, able, mother

Topic  2
tontar, western union, western union telegram, western, union telegram

Topic  3
school, trip, camp, train, course

Topic  4
christmas, picture, child, morning, holiday


Southwest Pacific

Topic  1
christmas, money, tonight, thing, island

Topic  2
native, child, sea, hut, patient

Topic  3
news, coffee, cup, meal, sheet

Topic  4
book, people, jap, great, girl


New Guinea

Topic  1
native, air, picture, book, husband

Topic  2
rat, trap, rat trap, chaplain, man

Topic  3
birthday, happy birthday, food, wonderful, happy

Topic  4
wife, able, anniversary, people, past


Philippine Islands

Topic  1
jap, point, campaign, rain, mud

Topic  2
war, happy, news, york, book

Topic  3
morning, native, care, hot, warm

Topic  4
dry, weather, men, wind, people


Zamboanga

Topic  1
hospital, world, medical, officer, doctor

Topic  2
future, philippine island, island, quiet, philippine

Topic  3
sure, work, great, usual, trip

Top

This is interesting, but maybe a bit too much for my purposes right now.