In [1]:
import nltk
nltk.download('wordnet')
import re
import string

import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pandas as pd
import numpy as np

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/darienpmt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing

## Read in the raw text 

In [123]:
# Load the letters CSV; its first column becomes the row index.
df = pd.read_csv('letters_df.csv', index_col=0)

## Pre-processing text

In [None]:
# Text preprocessing: remove numbers, capital letters and punctuation.

# Raw string literal: '\w' and '\d' are not valid Python string escapes, so the
# non-raw form triggers an "invalid escape sequence" warning on current Pythons.
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Return ``x`` with every run of word characters containing a digit replaced by a space."""
    return _DIGIT_TOKEN_RE.sub(' ', x)


def punc_lower(x):
    """Return ``x`` lower-cased, with each ``string.punctuation`` character replaced by a space."""
    return _PUNCTUATION_RE.sub(' ', x.lower())


df.Letter = df.Letter.map(alphanumeric).map(punc_lower)


### Removing their names

In [4]:
# Blank out the name 'silvio' wherever it occurs in the letter text.
df['Letter'] = df['Letter'].str.replace('silvio', '')

In [5]:
# Blank out the name 'annette' wherever it occurs in the letter text.
df['Letter'] = df['Letter'].str.replace('annette', '')

### Removing references to the month

In [6]:
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september',
          'october', 'november', 'december']

# Remove month names only when they stand alone as whole words. Plain substring
# replacement also ate the inside of ordinary words ('maybe' -> 'be',
# 'marching' -> 'ing'), corrupting the vocabulary seen by the vectorizers.
month_pattern = r'\b(?:{})\b'.format('|'.join(re.escape(month) for month in months))
df.Letter = df.Letter.str.replace(month_pattern, '', regex=True)

### Removing references to the location

In [7]:
# 'phillippine islands' (original spelling) is kept; the correctly spelt form is
# added because the Location values used later are spelt 'Philippine Islands'.
locations = ['australia', 'new guinea', 'southwest pacific', 'pacific', 'phillippine islands',
             'philippine islands', 'mindanao', 'zamboanga', 'somewhere in the pacific',
             'leyte gulf', 'okinawa', 'japan', 'hiro', 'california']

# Single pass with the longest names first, anchored on word boundaries:
#  * previously 'pacific' was stripped before 'somewhere in the pacific' was
#    tried, so that phrase could never match and left 'somewhere in the' behind;
#  * substring matches such as 'japan' inside 'japanese' or 'hiro' inside
#    'hiroshima' no longer leave word fragments in the text.
location_pattern = r'\b(?:{})\b'.format(
    '|'.join(re.escape(place) for place in sorted(locations, key=len, reverse=True))
)
df.Letter = df.Letter.str.replace(location_pattern, '', regex=True)


### Pickle unstemmed dataframe

In [8]:
# Save the cleaned (not yet stemmed) letters; later cells reload this file.
df.to_pickle('./pickles/unstemmed_df.pkl')

## Document Term Matrix of the plain text (no stemming or lemmatizing)

In [9]:
# Reload the unstemmed letters into their own frame for the plain-text matrix.
df_plain = pd.read_pickle('./pickles/unstemmed_df.pkl')

In [21]:
# Word counts over the unstemmed letters, using scikit-learn's built-in English
# stop-word list.
cv = CountVectorizer(stop_words='english')

df_plain_cv = cv.fit_transform(df_plain.Letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
plain_terms = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
               else cv.get_feature_names())

df_plain_dtm1 = pd.DataFrame(df_plain_cv.toarray(), columns=plain_terms)
df_plain_dtm1.index = df_plain.Date

In [11]:
# Save the plain-text document-term matrix.
df_plain_dtm1.to_pickle('./pickles/plain_df.pkl')

## Stemming

In [12]:
# NLTK's Lancaster stemmer; `stemmer` is applied to every letter in the next cell.
from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

In [13]:
def _stem_letter(text):
    """Stem each whitespace-separated token of ``text`` and re-join them with single spaces."""
    return ' '.join(stemmer.stem(token) for token in text.split())


df['Stemmed_letter'] = df['Letter'].apply(_stem_letter)

In [14]:
# From here on `df` carries only the stemmed text, not the original `Letter` column.
del df['Letter']

### Pickle Stemmed dataframe

In [15]:
# Save the frame that now holds `Stemmed_letter` in place of `Letter`.
df.to_pickle('./pickles/stemmed_df.pkl')

## Lemmatizing

In [16]:
# Start the lemmatizing branch from the pickled unstemmed letters.
df_lem = pd.read_pickle('./pickles/unstemmed_df.pkl')

In [17]:
# NLTK's WordNet lemmatizer; `lemmatizer` is applied to every letter in the next cell.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [18]:
def _lemmatize_letter(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join them with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


df_lem['Lemmatized_letter'] = df_lem['Letter'].apply(_lemmatize_letter)

In [19]:
# Keep only the lemmatized text in `df_lem`.
del df_lem['Letter']

### Pickle Lemmatized dataframe

In [26]:
# Save the frame that now holds `Lemmatized_letter` in place of `Letter`.
df_lem.to_pickle('./pickles/lemmatized_df.pkl')

## Extract Parts of Speech

### Nouns DataFrame

In [None]:
def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.

    Parameters
    ----------
    text : str
        The text to tokenize and tag.

    Returns
    -------
    str
        The tokens whose part-of-speech tag starts with 'NN', in their original
        order, joined by single spaces.
    '''
    # Imported here because the notebook's import cell never brings these two
    # names into scope, so the function previously failed with a NameError.
    # NOTE: NLTK's tokenizer and tagger models must already be downloaded; the
    # import cell only fetches 'wordnet'.
    from nltk import pos_tag, word_tokenize

    tagged = pos_tag(word_tokenize(text))
    return ' '.join(word for word, pos in tagged if pos.startswith('NN'))

In [None]:
# `df.Letter` no longer exists here (the column was dropped once the stemmed
# text had been built), so take the nouns from `df_plain`, the unstemmed copy
# that still carries both the cleaned `Letter` text and `Date`.
df_nouns = pd.DataFrame(df_plain.Letter.apply(nouns))
df_nouns.index = df_plain.Date

#### Pickle Nouns df

In [None]:
# Save the nouns-only text before it is lemmatized.
df_nouns.to_pickle('./pickles/nouns_df_rawtext.pkl')

### Lemmatize Nouns df

In [None]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_nouns(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join them with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


df_nouns['Lemmatized_letter'] = df_nouns['Letter'].apply(_lemmatize_nouns)

In [None]:
# Keep only the lemmatized text, then pickle the nouns frame.
del df_nouns['Letter']

df_nouns.to_pickle('./pickles/nouns_df_lemmatize.pkl')

### Adjectives and Nouns DataFrame

In [None]:
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.

    Parameters
    ----------
    text : str
        The text to tokenize and tag.

    Returns
    -------
    str
        The tokens whose part-of-speech tag starts with 'NN' or 'JJ', in their
        original order, joined by single spaces.
    '''
    # Imported here because the notebook's import cell never brings these two
    # names into scope, so the function previously failed with a NameError.
    # NOTE: NLTK's tokenizer and tagger models must already be downloaded; the
    # import cell only fetches 'wordnet'.
    from nltk import pos_tag, word_tokenize

    tagged = pos_tag(word_tokenize(text))
    # The result list gets its own name rather than shadowing the function.
    kept_words = [word for word, pos in tagged if pos[:2] in ('NN', 'JJ')]
    return ' '.join(kept_words)

In [None]:
# `df.Letter` no longer exists here (the column was dropped once the stemmed
# text had been built), so take the nouns/adjectives from `df_plain`, the
# unstemmed copy that still carries both the cleaned `Letter` text and `Date`.
df_nouns_adj = pd.DataFrame(df_plain.Letter.apply(nouns_adj))
df_nouns_adj.index = df_plain.Date

#### Pickle Nouns/Adjs df

In [None]:
# Save the nouns/adjectives-only text before it is lemmatized.
df_nouns_adj.to_pickle('./pickles/nouns_adj__df_rawtext.pkl')

### Lemmatize Nouns/Adj df

In [None]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_nouns_adj(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join them with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


df_nouns_adj['Lemmatized_letter'] = df_nouns_adj['Letter'].apply(_lemmatize_nouns_adj)

# Keep only the lemmatized text.
del df_nouns_adj['Letter']

In [None]:
# Pickle the lemmatized nouns/adjectives frame (its `Letter` column has
# already been dropped, so only the lemmatized text is stored).

df_nouns_adj.to_pickle('./pickles/nouns_adj__df_lemmatize.pkl')

## Location DataFrames

In [None]:
# np.array() strips the index, so the locations are assigned by position rather
# than aligned on index labels (`df_nouns_adj` is indexed by Date).
# NOTE(review): assumes both frames hold the letters in the same row order - confirm.
df_nouns_adj['Location'] = np.array(df['Location'])

### Function to make each new df

In [None]:
def loc_df(df, location):
    """Return the rows of ``df`` whose ``Location`` column equals ``location``."""
    matches_location = df['Location'].eq(location)
    return df.loc[matches_location]

### Make df for each location

In [None]:
# One slice of the nouns/adjectives frame per location, unpacked in the same
# order as the location names listed below.
aus_df, swp_df, ng_df, pi_df, z_df, j_df = [
    loc_df(df_nouns_adj, place)
    for place in (
        'Australia',
        'Southwest Pacific',
        'New Guinea',
        'Philippine Islands',
        'Zamboanga',
        'Japan',
    )
]

### Pickle each new df

In [None]:
# Location frames and their names, listed in the same order so the two lists
# can be zipped together when pickling.
loc_dfs = [aus_df, swp_df, ng_df, pi_df, z_df, j_df]

unq_locations = ['Australia', 'Southwest Pacific', 'New Guinea',
       'Philippine Islands', 'Zamboanga', 'Japan']

In [None]:
# Use a dedicated loop variable: calling it `df` rebound the notebook-wide `df`
# to the last location slice, which broke the later cells that read
# `df.Stemmed_letter` and `df.Date`.
for name, location_frame in zip(unq_locations, loc_dfs):
    location_frame.to_pickle('./pickles/{}.pkl'.format(name))

# Count Vectorizer

## Document term matrix from Stemmed text (tokenized by word)

### Bring in custom stop words

In [56]:
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('./pickles/stop_words_list.pkl', 'rb') as stop_words_file:
    stop_words = pickle.load(stop_words_file)

In [57]:
# Single-word counts over the stemmed letters, with the custom stop words and
# document-frequency limits (min_df=3, max_df=0.7).
cv1 = CountVectorizer(stop_words=stop_words, min_df=3, max_df=0.7)

df_cv1 = cv1.fit_transform(df.Stemmed_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
stemmed_terms = (cv1.get_feature_names_out() if hasattr(cv1, 'get_feature_names_out')
                 else cv1.get_feature_names())

df_dtm1 = pd.DataFrame(df_cv1.toarray(), columns=stemmed_terms)
df_dtm1.index = df.Date

In [58]:
# Save the single-word document-term matrix built from the stemmed text.
df_dtm1.to_pickle('./pickles/dtm1_stemmed.pkl')

## Document term matrix from Lemmatized text (tokenized by word)

In [59]:
# Single-word counts over the lemmatized letters, with the same vectorizer
# settings as the stemmed matrix.
cv1 = CountVectorizer(stop_words=stop_words, min_df=3, max_df=0.7)

df_lem_cv1 = cv1.fit_transform(df_lem.Lemmatized_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
lemmatized_terms = (cv1.get_feature_names_out() if hasattr(cv1, 'get_feature_names_out')
                    else cv1.get_feature_names())

df_lem_dtm1 = pd.DataFrame(df_lem_cv1.toarray(), columns=lemmatized_terms)
df_lem_dtm1.index = df_lem.Date

In [60]:
# Save the single-word document-term matrix built from the lemmatized text.
df_lem_dtm1.to_pickle('./pickles/dtm1_lemmatized.pkl')

## Document Term Matrix from Stemmed text (tokenized by 2,3-grams)

In [61]:
# Counts of 2- and 3-word sequences over the stemmed letters.
cv2 = CountVectorizer(ngram_range=(2, 3), stop_words=stop_words, min_df=3, max_df=0.7)

df_cv2 = cv2.fit_transform(df.Stemmed_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
stemmed_ngrams = (cv2.get_feature_names_out() if hasattr(cv2, 'get_feature_names_out')
                  else cv2.get_feature_names())

df_dtm2 = pd.DataFrame(df_cv2.toarray(), columns=stemmed_ngrams)
df_dtm2.index = df.Date

In [62]:
# Save the 2/3-gram document-term matrix built from the stemmed text.
df_dtm2.to_pickle('./pickles/dtm2_stemmed.pkl')

## Document Term Matrix from Lemmatized text (tokenized by 2,3-grams)

In [63]:
# Counts of 2- and 3-word sequences over the lemmatized letters.
cv2 = CountVectorizer(ngram_range=(2, 3), stop_words=stop_words, min_df=3, max_df=0.7)

df_lem_cv2 = cv2.fit_transform(df_lem.Lemmatized_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
lemmatized_ngrams = (cv2.get_feature_names_out() if hasattr(cv2, 'get_feature_names_out')
                     else cv2.get_feature_names())

df_lem_dtm2 = pd.DataFrame(df_lem_cv2.toarray(), columns=lemmatized_ngrams)
df_lem_dtm2.index = df_lem.Date

In [66]:
# Save the 2/3-gram document-term matrix built from the lemmatized text.
df_lem_dtm2.to_pickle('./pickles/dtm2_lemmatized.pkl')

# TF-IDF

## TF-IDF with lemmatization

In [67]:
# one word: TF-IDF weights for single terms of the lemmatized letters

tfidf1 = TfidfVectorizer(stop_words=stop_words, min_df=3, max_df=0.7)

tfidf1_lem = tfidf1.fit_transform(df_lem.Lemmatized_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
tfidf1_terms = (tfidf1.get_feature_names_out() if hasattr(tfidf1, 'get_feature_names_out')
                else tfidf1.get_feature_names())

df_tfidf1_lem = pd.DataFrame(tfidf1_lem.toarray(), columns=tfidf1_terms)
df_tfidf1_lem.index = df_lem.Date

In [68]:
# Save the single-word TF-IDF matrix of the lemmatized text.
df_tfidf1_lem.to_pickle('./pickles/tfidf1_lemmatized.pkl')

In [69]:
# two to three words: TF-IDF over 2- and 3-grams of the lemmatized letters
# (binary=True, so term counts are capped at 1 before weighting)

tfidf2 = TfidfVectorizer(ngram_range=(2, 3), binary=True, stop_words=stop_words, min_df=3, max_df=0.7)

tfidf2_lem = tfidf2.fit_transform(df_lem.Lemmatized_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
tfidf2_terms = (tfidf2.get_feature_names_out() if hasattr(tfidf2, 'get_feature_names_out')
                else tfidf2.get_feature_names())

df_tfidf2_lem = pd.DataFrame(tfidf2_lem.toarray(), columns=tfidf2_terms)
df_tfidf2_lem.index = df_lem.Date

In [70]:
# Save the 2/3-gram TF-IDF matrix of the lemmatized text.
df_tfidf2_lem.to_pickle('./pickles/tfidf2_lemmatized.pkl')

# TF-IDF for nouns and adj dataframe

## Bring in nouns df

In [99]:
# Reload the lemmatized nouns-only frame pickled earlier in the notebook.
df_nouns = pd.read_pickle('./pickles/nouns_df_lemmatize.pkl')

In [100]:
# one word: TF-IDF weights for single terms of the nouns-only text

tfidf1 = TfidfVectorizer(stop_words=stop_words, min_df=3, max_df=0.7)

# Fit on the nouns-only text. This cell used to fit on `df_lem`, which merely
# reproduced the full-text matrix already saved as tfidf1_lemmatized.pkl.
tfidf1_nouns = tfidf1.fit_transform(df_nouns.Lemmatized_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
noun_terms = (tfidf1.get_feature_names_out() if hasattr(tfidf1, 'get_feature_names_out')
              else tfidf1.get_feature_names())

df_tfidf1_nouns = pd.DataFrame(tfidf1_nouns.toarray(), columns=noun_terms)
df_tfidf1_nouns.index = df_nouns.index

In [101]:
# Save the single-word TF-IDF matrix for the nouns section.
df_tfidf1_nouns.to_pickle('./pickles/tfidf1_nouns_lemmatized.pkl')

## Bring in nouns/adj df

In [102]:
# Reload the lemmatized nouns/adjectives frame pickled earlier in the notebook.
df_nouns_adj = pd.read_pickle('./pickles/nouns_adj__df_lemmatize.pkl')

In [103]:
# TF-IDF over 1- to 3-grams of the nouns/adjectives-only text.
tfidf2 = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words, min_df=3, max_df=0.7)

# Fit on the nouns/adjectives text. This cell used to fit on `df_lem`, so the
# saved "nouns_adj" matrix was really computed from the full lemmatized letters.
tfidf2_nouns_adj = tfidf2.fit_transform(df_nouns_adj.Lemmatized_letter)

# scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
# when available and fall back on older releases.
noun_adj_terms = (tfidf2.get_feature_names_out() if hasattr(tfidf2, 'get_feature_names_out')
                  else tfidf2.get_feature_names())

df_tfidf2_nouns_adj = pd.DataFrame(tfidf2_nouns_adj.toarray(), columns=noun_adj_terms)
df_tfidf2_nouns_adj.index = df_nouns_adj.index

In [104]:
# Save the 1-3-gram TF-IDF matrix for the nouns/adjectives section.
df_tfidf2_nouns_adj.to_pickle('./pickles/tfidf2_nouns_adj_lemmatized.pkl')

# TF-IDF for each location

## Read in each df

In [83]:
# Reload the pickled Australia slice.
aus_df = pd.read_pickle('./pickles/Australia.pkl')

In [85]:
# Reload the pickled Southwest Pacific slice.
swp_df = pd.read_pickle('./pickles/Southwest Pacific.pkl')

In [87]:
# Reload the pickled New Guinea slice.
ng_df = pd.read_pickle('./pickles/New Guinea.pkl')

In [90]:
# Reload the pickled Philippine Islands slice.
pi_df = pd.read_pickle('./pickles/Philippine Islands.pkl')

In [92]:
# Reload the pickled Zamboanga slice.
z_df = pd.read_pickle('./pickles/Zamboanga.pkl')

In [94]:
# Reload the pickled Japan slice.
j_df = pd.read_pickle('./pickles/Japan.pkl')

In [117]:
# Reloaded location frames, in the same order as the names in `unq_locations`.
loc_dfs = [aus_df, swp_df, ng_df, pi_df, z_df, j_df]

In [118]:
# Location names, in the same order as the frames in `loc_dfs`; each name is
# used to build an output file name below.
unq_locations = ['Australia', 'Southwest Pacific', 'New Guinea',
       'Philippine Islands', 'Zamboanga', 'Japan']

## Make the TF-IDF for each location

In [119]:
tfidf2 = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words, min_df=3, max_df=0.7)

# The loop variable is deliberately not called `loc_df`: that name belongs to
# the location-filter function defined earlier, and reusing it here replaced
# the function with a DataFrame.
for name, location_frame in zip(unq_locations, loc_dfs):

    # fit_transform refits the vectorizer on every pass, so each location gets
    # its own vocabulary and weights.
    tfidf2_local = tfidf2.fit_transform(location_frame.Lemmatized_letter)

    # scikit-learn 1.2 removed `get_feature_names`; use `get_feature_names_out`
    # when available and fall back on older releases.
    local_terms = (tfidf2.get_feature_names_out() if hasattr(tfidf2, 'get_feature_names_out')
                   else tfidf2.get_feature_names())

    df_tfidf1_local = pd.DataFrame(tfidf2_local.toarray(), columns=local_terms)
    df_tfidf1_local.index = location_frame.index

    df_tfidf1_local.to_pickle('./pickles/tfidf2_{}.pkl'.format(name))

In [120]:
# Read the saved Australia TF-IDF matrix back in (presumably a spot check of the output).
test = pd.read_pickle('./pickles/tfidf2_Australia.pkl')