## Function to lemmatize

In [6]:
import nltk
import string
import re

nltk.download('stopwords')
nltk.download('wordnet')

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

def remove_stopwords(tokenized_list):
    stopword = nltk.corpus.stopwords.words('english')
    text = [word for word in tokenized_list if word not in stopword]
    return text

def lemmatizing(tokenized_text):
    wn = nltk.WordNetLemmatizer() #you'll need to download wordnet from nltk
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

def lemmatize(in_path, out_path, file):
    '''
    takes a text file, strips it of punctuation, then tokenizes the data.
    After tokenization, the stop words are removed.
    Then the lemmas of each word is found and returned as a list.
    '''
    with open(in_path + '/' + file) as f:
        rawData = f.read()
        cleanData = remove_punct(rawData)
        tokenized = tokenize(cleanData.lower())
        text_no_stop = remove_stopwords(tokenized)
        text_lemmatized = lemmatizing(text_no_stop)
    out_file = file[:-4] + '_lemmatized.txt' #way to create lemmatized file name
    with open(out_path + '/' + out_file, "w") as output:
        for word in text_lemmatized:
            output.write(str(word) + '\n')
    #in here for validation
    #return lemmatizing(text_no_stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabriellegustilo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gabriellegustilo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
with open('/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/state-of-the-union-corpus-1989-2017/Adams1_1797.txt') as f:
    rawData = f.read()
    print(rawData[:100])

Gentlemen of the Senate and Gentlemen of the House of Representatives:

I was for some time apprehen


## Looping through files in a directory

In [9]:
in_path = '/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/state-of-the-union-corpus-1989-2017'
out_path = '/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/state-of-the-union-lemmatized'

In [10]:
#need this to get the file names...probably more efficient way to do this, but ehhhh...
from pathlib import Path
path = Path("/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/state-of-the-union-corpus-1989-2017")

### Get all the file names

In [11]:
import os

files = [i for i in os.listdir(path) if i.endswith("txt")]

In [12]:
files

['Reagan_1982.txt',
 'Roosevelt_1902.txt',
 'Wilson_1914.txt',
 'Taft_1911.txt',
 'Madison_1814.txt',
 'Polk_1848.txt',
 'Jackson_1836.txt',
 'Johnson_1969.txt',
 'Hoover_1930.txt',
 'Pierce_1854.txt',
 'Pierce_1855.txt',
 'Hoover_1931.txt',
 'Johnson_1968.txt',
 'Madison_1815.txt',
 'Taft_1910.txt',
 'Buren_1837.txt',
 'Wilson_1915.txt',
 'Roosevelt_1903.txt',
 'Reagan_1983.txt',
 'Roosevelt_1901.txt',
 'Taft_1912.txt',
 'Wilson_1917.txt',
 'Jackson_1835.txt',
 'Pierce_1856.txt',
 'Hoover_1932.txt',
 'Adams1_1797.txt',
 'Jackson_1834.txt',
 'Madison_1816.txt',
 'Wilson_1916.txt',
 'Reagan_1984.txt',
 'Roosevelt_1904.txt',
 'Roosevelt_1938.txt',
 'Fillmore_1852.txt',
 'Madison_1812.txt',
 'Jackson_1830.txt',
 'Truman_1949.txt',
 'Pierce_1853.txt',
 'Truman_1948.txt',
 'Jackson_1831.txt',
 'Madison_1813.txt',
 'Wilson_1913.txt',
 'Roosevelt_1939.txt',
 'Roosevelt_1905.txt',
 'Reagan_1985.txt',
 'Harrison_1889.txt',
 'Reagan_1987.txt',
 'Roosevelt_1907.txt',
 'Adams1_1800.txt',
 'Madison

### Now loop through all of them and write to the folder!

In [14]:
for file in files:
    lemmatize(in_path, out_path, file)

In [15]:
# Importing modules
import pandas as pd

# Read data into papers
speeches = pd.read_csv('/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/speeches.csv')

# Print head
speeches.head()


Unnamed: 0.1,Unnamed: 0,year,name,text
0,1,1797,Adams1,GENTLEMEN OF THE CONGRESS:\n\nWhen I addressed...
1,2,1798,Adams1,GENTLEMEN OF THE CONGRESS:\n\nWhen I addressed...
2,3,1799,Adams1,GENTLEMEN OF THE CONGRESS:\n\nWhen I addressed...
3,4,1800,Adams1,GENTLEMEN OF THE CONGRESS:\n\nWhen I addressed...
4,5,1825,Adams2,GENTLEMEN OF THE CONGRESS:\n\nWhen I addressed...


In [None]:
import re
# Remove punctuation
speeches['text_processed'] = speeches['text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
speeches['text_processed'] = speeches['text_processed'].map(lambda x: x.lower())
# Print out the first rows of papers
speeches['text_processed'].head()

In [None]:
# Import the wordcloud library
import wordcloud
from wordcloud import WordCloud #need to pip install this
# Join the different processed titles together.

long_string = ','.join(list(speeches['text'].values))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

# Visualize the word cloud
wordcloud.to_image()

In [None]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(speeches['text_processed'])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

In [None]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 10
number_words = 30

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
from pyLDAvis import sklearn as sklearn_lda #need to pip install this
import pickle 
import pyLDAvis
LDAvis_data_filepath = os.path.join('../ldavis_prepared_'+str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, '../ldavis_prepared_'+ str(number_topics) +'.html')

In [None]:
# DO ALL OF THE ABOVE BUT WITH LEMMATIZED DATA

# Importing modules
import pandas as pd

# Read data into papers
speeches = pd.read_csv('/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/speeches.csv')

# Print head
speeches.head()

import re
# Remove punctuation
speeches['lemmatizedText'] = speeches['lemmatizedText'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
speeches['lemmatizedText'] = speeches['lemmatizedText'].map(lambda x: x.lower())
# Print out the first rows of papers
speeches['lemmatizedText'].head()

# Import the wordcloud library
import wordcloud
from wordcloud import WordCloud #need to pip install this
# Join the different processed titles together.

long_string = ','.join(list(speeches['lemmatizedText'].values))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

# Visualize the word cloud
wordcloud.to_image()

In [None]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(speeches['lemmatizedText'])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

In [4]:
# we can see that the lemmatized data doesn't do anything different, so we stop here
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Import the wordcloud library
import wordcloud
from wordcloud import WordCloud #need to pip install this
# Join the different processed titles together.

# Importing modules
import pandas as pd
from sklearn.feature_extraction import text 
import re

# Read data into papers
speeches = pd.read_csv('/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/speeches.csv')

# Print head
speeches.head()

my_stop_words = ["government", "congress", "faith", "democracy", "year", "annual", "people", "utter"]
stop_words = frozenset(text.ENGLISH_STOP_WORDS.union(my_stop_words))

# Remove punctuation
speeches['text_processed'] = speeches['text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
speeches['text_processed'] = speeches['text_processed'].map(lambda x: x.lower())
# Print out the first rows of papers
speeches['text_processed'].head()

def analyze_each_speech():
    for i in range(len(speeches['text_processed'])):
        print(speeches['name'][i])
        print(speeches['year'][i])
        # Create a WordCloud object
        wordcloud = WordCloud(stopwords=stop_words, background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

        # Generate a word cloud
        wordcloud.generate(speeches['text_processed'][i])
    

        # Visualize the word cloud
        wordcloud.to_image()
        filename = "./wordcloud_img_stopwords/%s_%s_wordcloud.png" %(speeches['name'][i], speeches['year'][i])
        wordcloud.to_file(filename)

analyze_each_speech()

Adams1
1797
Adams1
1798
Adams1
1799
Adams1
1800
Adams2
1825
Adams2
1826
Adams2
1827
Adams2
1828
Arthur
1881
Arthur
1882
Arthur
1883
Arthur
1884
Buchanan
1857
Buchanan
1858
Buchanan
1859
Buchanan
1860
Buren
1837
Buren
1838
Buren
1839
Buren
1840
Bush1
1989
Bush1
1990
Bush1
1991
Bush1
1992
Bush2
2001
Bush2
2002
Bush2
2003
Bush2
2004
Bush2
2005
Bush2
2006
Bush2
2007
Bush2
2008
Carter
1978
Carter
1979
Carter
1980
Carter
1981
Cleveland
1885
Cleveland
1886
Cleveland
1887
Cleveland
1888
Cleveland
1893
Cleveland
1894
Cleveland
1895
Cleveland
1896
Clinton
1993
Clinton
1994
Clinton
1995
Clinton
1996
Clinton
1997
Clinton
1998
Clinton
1999
Clinton
2000
Coolidge
1923
Coolidge
1924
Coolidge
1925
Coolidge
1926
Coolidge
1927
Coolidge
1928
Eisenhower
1954
Eisenhower
1955
Eisenhower
1956
Eisenhower
1957
Eisenhower
1958
Eisenhower
1959
Eisenhower
1960
Eisenhower
1961
Fillmore
1850
Fillmore
1851
Fillmore
1852
Ford
1975
Ford
1976
Ford
1977
Grant
1869
Grant
1870
Grant
1871
Grant
1872
Grant
1873
Grant
1874
Gr

In [17]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
import re
# Importing modules
import pandas as pd

# Read data into papers
speeches = pd.read_csv('/Users/gabriellegustilo/Dev/personal/machine_learning/final_project/speeches.csv')

# Print head
speeches.head()

# Remove punctuation
speeches['text_processed'] = speeches['text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
speeches['text_processed'] = speeches['text_processed'].map(lambda x: x.lower())
# Print out the first rows of papers
speeches['text_processed'].head()

# Tweak the two parameters below
number_topics = 10
number_words = 5

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)

my_stop_words = ["government", "congress", "faith", "democracy", "year", "annual", "people", "utter"]

# Helper function
def print_topics(model, count_vectorizer, n_top_words, filename):
    words = count_vectorizer.get_feature_names()
    with open(filename, 'w') as f:
        for topic_idx, topic in enumerate(model.components_):
            f.write(''.join(("\n", " ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))))  


# Helper function
def save_topics_from_most_common_words(count_data, count_vectorizer, name):    
    lda.fit(count_data)

    # Print the topics found by the LDA model

    print_topics(lda, count_vectorizer, number_words, name)
    

def get_topics_for_each_speech():
    for i in range(len(speeches['text_processed'])):
        print(speeches['name'][i])
        print(speeches['year'][i])
        #print(speeches['text_processed'][i])
        
        stop_words = frozenset(text.ENGLISH_STOP_WORDS.union(my_stop_words))
        
        # Initialise the count vectorizer with the English stop words
        count_vectorizer = CountVectorizer(stop_words=stop_words)

        # Fit and transform the processed titles
        count_data = count_vectorizer.fit_transform(speeches['text_processed'][i].splitlines())
        
        # get filename
        filename = "./topics/%s_%s_topics.txt" %(speeches['name'][i], speeches['year'][i])

        # Visualise the 10 most common words
        save_topics_from_most_common_words(count_data, count_vectorizer, filename)


get_topics_for_each_speech()

Adams1
1797
Adams1
1798
Adams1
1799
Adams1
1800
Adams2
1825
Adams2
1826
Adams2
1827
Adams2
1828
Arthur
1881
Arthur
1882
Arthur
1883
Arthur
1884
Buchanan
1857
Buchanan
1858
Buchanan
1859
Buchanan
1860
Buren
1837
Buren
1838
Buren
1839
Buren
1840
Bush1
1989
Bush1
1990
Bush1
1991
Bush1
1992
Bush2
2001
Bush2
2002
Bush2
2003
Bush2
2004
Bush2
2005
Bush2
2006
Bush2
2007
Bush2
2008
Carter
1978
Carter
1979
Carter
1980
Carter
1981
Cleveland
1885
Cleveland
1886
Cleveland
1887
Cleveland
1888
Cleveland
1893
Cleveland
1894
Cleveland
1895
Cleveland
1896
Clinton
1993
Clinton
1994
Clinton
1995
Clinton
1996
Clinton
1997
Clinton
1998
Clinton
1999
Clinton
2000
Coolidge
1923
Coolidge
1924
Coolidge
1925
Coolidge
1926
Coolidge
1927
Coolidge
1928
Eisenhower
1954
Eisenhower
1955
Eisenhower
1956
Eisenhower
1957
Eisenhower
1958
Eisenhower
1959
Eisenhower
1960
Eisenhower
1961
Fillmore
1850
Fillmore
1851
Fillmore
1852
Ford
1975
Ford
1976
Ford
1977
Grant
1869
Grant
1870
Grant
1871
Grant
1872
Grant
1873
Grant
1874
Gr