# Setup

In [1]:
# data wrangling
import pandas as pd

# regular expressions
import re

# lemmatisation 
import spacy 

# n-grams 
import gensim
from gensim.models import Phrases 
from gensim.models import phrases
from gensim.utils import simple_preprocess

# stopwords
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS 

# word counter
from collections import Counter

# Data Pre-Processing

In [2]:
# load the speeches spreadsheet; its first column becomes the row index
df = pd.read_excel("obama_speeches.xlsx", index_col=0)

## Noise Removal

In [3]:
def remove_document_formatting(text):
    """Strip document-formatting noise from the raw text of one speech.

    The steps run in this order:

    1. insert a space between a line break and an adjacent word, so that words
       on consecutive lines do not fuse once line breaks are dropped;
    2. delete the page footers;
    3. delete everything up to and including the line holding the speech date;
    4. delete everything up to and including the AUTHENTICITY statement;
    5. drop line breaks, collapse whitespace runs and lower-case the text.

    Parameters
    ----------
    text : str
        Raw text of a single speech.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # All patterns are raw strings: sequences such as "\s" or "\d" inside a
    # normal string literal are invalid string escapes and trigger warnings.

    # separate line breaks from words
    text = re.sub(r'(\n)([A-Za-z\\[])', r"\1 \2", text)
    text = re.sub(r'([A-Za-z])(\n)', r"\1 \2", text)

    # remove footer ("(?:.|\n)*?" lazily spans any characters, including
    # line breaks, up to the end marker of the footer)
    text = re.sub(r'(AAm|AmericanRhetoric\.com)\s((?:.|\n)*?)\sPage\s\d{1,2}', '', text)
    text = re.sub(r'(meerriiccaannR)\s((?:.|\n)*?)\s(Property)', '', text)

    # remove everything up until (and including) the sentence with the date of the speech
    # NOTE(review): the leading group is greedy, so if several date-like lines
    # exist the cut happens at the last one - confirm this is intended.
    text = re.sub(r'^((.|\n)*)\s(\d{1,2}\s{1,2}[a-zA-Z]{3,9},?\s\d{4},?)\s.*\s\n', '', text)

    # remove everything up until (and including) the statement about transcription
    text = re.sub(r'^((.|\n)*)\s(\[?AUTHENTICITY)\s.*\s\n', '', text)

    # remove line breaks
    text = re.sub(r"\n", "", text)

    # remove multiple white spaces
    text = re.sub(r"\s+", " ", text)

    # lower case
    return text.lower()

In [4]:
# clean every entry of the 'speech' column with remove_document_formatting
no_noise = list(map(remove_document_formatting, df['speech']))

## N-Grams

In [5]:
def tokenisation(texts):
    """Tokenise each text with gensim's ``simple_preprocess`` (``deacc=True``).

    Parameters
    ----------
    texts : iterable of str
        The documents to tokenise.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    return [simple_preprocess(document, deacc=True) for document in texts]


def find_n_grams(texts, min_count=5, threshold=120):
    """Tokenise the texts and merge detected bigrams and trigrams into single tokens.

    A bigram ``Phrases`` model is trained on the tokenised texts; a second
    ``Phrases`` model is then trained on the bigram-transformed corpus so that
    it can extend bigrams to trigrams. Both models are applied to every text.

    Parameters
    ----------
    texts : iterable of str
        The documents to process; they are tokenised with ``tokenisation``.
    min_count : int, optional
        ``min_count`` passed to both ``Phrases`` models. Defaults to 5, the
        value the bigram model used before; the trigram model previously
        relied on the gensim default (documented as 5).
    threshold : float, optional
        ``threshold`` passed to both ``Phrases`` models. Defaults to 120.

    Returns
    -------
    list of list of str
        One token list per text, with detected phrases merged.
    """
    tokenised = tokenisation(texts)

    bigram_phrases = Phrases(tokenised, min_count=min_count, threshold=threshold,
                             connector_words=phrases.ENGLISH_CONNECTOR_WORDS)

    # the trigram model is trained on the bigram-transformed corpus
    trigram_phrases = Phrases(bigram_phrases[tokenised], min_count=min_count,
                              threshold=threshold,
                              connector_words=phrases.ENGLISH_CONNECTOR_WORDS)

    bigram = phrases.Phraser(bigram_phrases)
    trigram = phrases.Phraser(trigram_phrases)

    # a single pass is enough: the former bigram-only pass was overwritten
    # immediately and never used
    return [trigram[bigram[tokens]] for tokens in tokenised]

In [6]:
# tokenise the cleaned speeches and merge detected bigrams/trigrams
n_grams = find_n_grams(no_noise)

## Text Normalisation

In [7]:
def text_normalisation(texts):
    """Lemmatise tokenised documents and keep only selected parts of speech.

    Each document (a list of tokens) is joined into one space-separated string
    and run through the spaCy ``en_core_web_sm`` pipeline with the parser and
    NER components disabled. Only lemmas of tokens tagged NOUN, ADJ, VERB or
    ADV are kept.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents.

    Returns
    -------
    list of list of str
        One list of lemmas per document, in input order.
    """
    nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])
    postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    def lemmatise(tokens):
        # spaCy expects a string, so the token list is re-joined first
        doc = nlp(" ".join(tokens))
        return [token.lemma_ for token in doc if token.pos_ in postags]

    return [lemmatise(tokens) for tokens in texts]

In [8]:
# lemmatise the n-gram tokens, keeping nouns, adjectives, verbs and adverbs
normalised = text_normalisation(n_grams)

## Stopword Removal

In [9]:
# additional stopwords; starts empty and is rebound further down with the
# low- and high-frequency words
extension = []

def stopword_removal(texts, extra_stopwords=None):
    """Remove stopwords and punctuation from tokenised documents.

    The stopword set is gensim's ``STOPWORDS`` without "bill", extended by
    ``extra_stopwords``.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents.
    extra_stopwords : iterable of str, optional
        Additional words to remove. When omitted, the module-level
        ``extension`` list is used, so existing calls behave as before.

    Returns
    -------
    list of str
        One space-joined string per document, with stopwords removed and every
        character other than ASCII letters, digits, whitespace, "/" and "_"
        stripped.
    """
    if extra_stopwords is None:
        # looked up at call time, so a later rebinding of `extension` is seen
        extra_stopwords = extension

    # "bill" is kept in the texts although it is in gensim's stopword list
    stopwords = STOPWORDS.difference({"bill"}).union(extra_stopwords)

    no_stopwords = []
    for tokens in texts:
        # remove stopwords
        kept = " ".join(token for token in tokens if token not in stopwords)

        # remove punctuation (raw string: "\d" and "\s" are regex escapes,
        # not valid string escapes)
        no_stopwords.append(re.sub(r"[^a-zA-Z\d\s/_]", "", kept))

    return no_stopwords

In [10]:
# first pass: `extension` is still empty here, so only the base stopword set applies
no_stopwords = stopword_removal(normalised)

## Removal of Low and High Frequency Words

In [11]:
def count_words(texts):
    """Count token occurrences across all texts.

    The texts are first split into tokens with ``tokenisation``.

    Parameters
    ----------
    texts : iterable of str
        The documents to count tokens in.

    Returns
    -------
    collections.Counter
        Mapping of token to its total number of occurrences.
    """
    # NOTE(review): tokenisation() re-runs simple_preprocess, which may filter
    # tokens by its own rules (e.g. length) - confirm long n-gram tokens are
    # still counted.
    frequencies = Counter()
    for tokens in tokenisation(texts):
        frequencies.update(tokens)
    return frequencies

def print_vocab_size(texts):
    """Print the number of entries in ``texts`` as the total vocabulary size.

    Parameters
    ----------
    texts : sized collection
        Any object supporting ``len()``, e.g. a word-count mapping.
    """
    vocabulary_size = len(texts)
    print(f"Total Vocabulary Size: {vocabulary_size}")

In [12]:
# compute vocabulary size (number of distinct tokens) after the first
# stopword pass, i.e. before low/high-frequency words are filtered out
count_words_pre_cleaning = count_words(no_stopwords)
print_vocab_size(count_words_pre_cleaning)

Total Vocabulary Size: 14781


In [13]:
# one row per token: absolute frequency and its share of all tokens in percent
dict_df = pd.DataFrame.from_dict(count_words_pre_cleaning, orient='index', columns=['freq'])
dict_df['perc'] = dict_df['freq'].div(dict_df['freq'].sum()).mul(100)

# display the 15 tokens with the largest share, rounded to two decimals
dict_df.sort_values('perc', ascending=False).head(15).round(2)

Unnamed: 0,freq,perc
people,6275,1.36
ve,5192,1.12
work,4081,0.88
country,3472,0.75
know,3189,0.69
want,3060,0.66
year,2977,0.64
time,2973,0.64
world,2903,0.63
think,2762,0.6


In [14]:
# frequency cut-offs: tokens occurring at most min_freq times or at least
# max_freq times are treated as additional stopwords
min_freq = 15
max_freq = 1800

# stopword_removal reads the module-level `extension` list, so it has to be
# rebound before the function is called again
extension = dict_df.loc[
    (dict_df['freq'] <= min_freq) | (dict_df['freq'] >= max_freq)
].index.tolist()

# second pass over the lemmatised tokens, now with the extended stopword list
final_clean = stopword_removal(normalised)

# vocabulary size after frequency filtering
count_words_post_cleaning = count_words(final_clean)
print_vocab_size(count_words_post_cleaning)

Total Vocabulary Size: 3198


In [15]:
# inspect the first fully cleaned speech
final_clean[0]

'behalf great state crossroad land let express deep gratitude privilege address convention tonight particular honor let face presence stage pretty unlikely father foreign student bear raise small village grow school father grandfather cook domestic british grandfather large dream son hard perseverance father scholarship study place shine beacon freedom opportunity study father meet mother bear town father oil rig farm depression day grandfather sign duty join march home grandmother raise baby line war study bill buy later west search opportunity big dream daughter common dream bear continent parent share love share abide faith possibility african barack bless believe barrier success imagine imagine school land weren rich generous rich achieve potential pass night look great pride stand stand grateful diversity heritage aware parent dream live precious daughter stand story large story owe debt earth story possible tonight gather affirm greatness height power military size economy pride 

In [16]:
# save cleaned data as txt file, one speech per line
# (explicit encoding keeps the output independent of the platform default)
with open("clean_obama_speeches.txt", "w", encoding="utf-8") as file:
    for text in final_clean:
        file.write("%s\n" % text) # add line break after each speech to keep speeches separate
# no explicit close(): the with-block closes the file when it exits