In [1]:
# set up 
import pandas as pd

# cleaning 
import re
import string 

#lemmatisation 
import spacy 

# n-grams 
import gensim
from gensim.models import Phrases 
from gensim.models import phrases
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS 

In [2]:
# Download the speeches workbook from GitHub. The first spreadsheet column is
# used as the index and only the column labelled 0 is kept, so `df` is a
# pandas Series rather than a DataFrame.
# NOTE(review): that column presumably holds the raw speech transcripts that
# clean_texts() consumes -- confirm against the workbook.
df = pd.read_excel(
    io="https://github.com/vtdinh13/testrepo/blob/master/obama_speeches.xlsx?raw=true",
    index_col=0,
)[0]

In [3]:
def remove_document_formatting(text):
    """Strip transcript layout artefacts from one raw speech and normalise it.

    The steps run in this order:

    1. Insert a space between a line break and an adjacent letter (or ``[`` /
       backslash after the break) so words are not glued together once the
       line breaks are deleted.
    2. Delete page footers: everything from ``AAm`` / ``AmericanRhetoric.com``
       up to the following ``Page <n>``, and everything from
       ``meerriiccaannR`` up to the following ``Property``.
    3. Delete everything from the start of the text through the end of the
       line that contains a date such as ``27 July 2004``.
    4. Delete everything from the start of the text through the end of the
       line that contains ``AUTHENTICITY`` (optionally preceded by ``[``).
    5. Delete remaining line breaks, collapse whitespace runs to one space and
       lower-case the result.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned, lower-cased text. Leading/trailing single spaces are not
        stripped.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by ``re.sub``).
    """
    # All patterns are raw strings: the previous non-raw literals relied on
    # invalid escape sequences (``\s``, ``\d``, ``\.``), which Python reports
    # as a warning and will eventually reject.

    # separate line breaks from words
    text = re.sub(r'(\n)([A-Za-z\\[])', r"\1 \2", text)
    text = re.sub(r'([A-Za-z])(\n)', r"\1 \2", text)

    # remove footer; ``[\s\S]*?`` is "any character including newline, as few
    # as possible" and replaces the old ``(.||\n)*?`` alternation, which had a
    # stray empty branch
    text = re.sub(r'(AAm|AmericanRhetoric\.com)\s([\s\S]*?)\sPage\s\d{1,2}', '', text)
    text = re.sub(r'(meerriiccaannR)\s([\s\S]*?)\s(Property)', '', text)

    # remove everything up until (and including) the sentence with the date of the speech
    # NOTE(review): the prefix is greedy, so when several dates appear the cut
    # happens at the LAST one in the document -- confirm that is intended.
    text = re.sub(r'^[\s\S]*\s(\d{1,2}\s{1,2}[a-zA-Z]{3,9},?\s\d{4},?)\s.*\s\n', '', text)

    # remove everything up until (and including) the statement about transcription
    text = re.sub(r'^[\s\S]*\s(\[?AUTHENTICITY)\s.*\s\n', '', text)

    # remove line breaks (no space is inserted here; step 1 already added one
    # next to letters)
    text = text.replace("\n", "")

    # remove multiple white spaces
    text = re.sub(r"\s+", " ", text)

    # lower case
    return text.lower()

# Loaded spaCy pipelines keyed by model name, so a model is loaded once per
# kernel session instead of once per document.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name`` (parser disabled), loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=['parser'])
    return _SPACY_PIPELINES[model_name]


def lemmatisation(text, postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')):
    """Lemmatise ``text`` and keep only tokens with selected part-of-speech tags.

    Parameters
    ----------
    text : str
        Text of a single document.
    postags : collection of str, optional
        Coarse part-of-speech tags (``token.pos_`` values) to keep. Tokens
        with any other tag are dropped.

    Returns
    -------
    list of str
        The ``lemma_`` of every kept token, in document order.
    """
    # Loading the model is by far the most expensive step; reuse the cached
    # pipeline rather than calling spacy.load() for every document.
    nlp = _get_spacy_pipeline()
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.pos_ in postags]


def clean_texts(texts):
    """Clean every document in ``texts`` and return the cleaned strings.

    Each document goes through, in order: ``remove_document_formatting``,
    ``lemmatisation`` (lemmas re-joined with single spaces), stop-word
    removal, and removal of every character that is not a letter, digit,
    whitespace or ``/``.

    Parameters
    ----------
    texts : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. Removing a
        punctuation-only token can leave two consecutive spaces behind.
    """
    # "bill" is taken out of the stop-word set so that it survives filtering.
    remove_sw_from_list = {"bill"}
    stopwords = STOPWORDS.difference(remove_sw_from_list)

    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences ``\d`` and ``\s``. Compiled once, outside the loop.
    unwanted_chars = re.compile(r"[^a-zA-Z\d\s/]")

    final_texts = []
    for text in texts:
        # clean formatting
        text = remove_document_formatting(text)

        # lemmatization, make back into text from list
        text = " ".join(lemmatisation(text))

        # remove stop words (done before punctuation removal, so a word with
        # punctuation attached is compared as-is)
        no_stops_text = " ".join(word for word in text.split() if word not in stopwords)

        # remove punctuation
        final_texts.append(unwanted_chars.sub("", no_stops_text))

    return final_texts

In [4]:
def prepare_n_grams(texts):
    """Tokenise each document for phrase detection.

    Every document is passed through ``gensim.utils.simple_preprocess`` with
    ``deacc=True``.

    Parameters
    ----------
    texts : iterable of str
        Documents to tokenise.

    Returns
    -------
    list
        One ``simple_preprocess`` result per document, in input order.
    """
    return [gensim.utils.simple_preprocess(document, deacc=True) for document in texts]

def find_n_grams(texts):
    """Detect bigram and trigram phrases across ``texts`` and merge them in place.

    The documents are tokenised with ``prepare_n_grams``. A bigram ``Phrases``
    model is trained on the tokens, then a second ``Phrases`` model is trained
    on the bigram-transformed corpus so that it can join an existing bigram
    with a further token. Both models are wrapped in ``phrases.Phraser`` and
    applied to every document.

    Parameters
    ----------
    texts : iterable of str
        Cleaned documents.

    Returns
    -------
    list of str
        One string per document: its tokens, with detected phrases merged into
        single tokens, joined by single spaces.
    """
    new_data = prepare_n_grams(texts)
    connector_words = phrases.ENGLISH_CONNECTOR_WORDS

    bigram_phrases = Phrases(new_data, min_count = 5, threshold = 120, connector_words=connector_words)
    # min_count is not passed here, so the library default applies.
    trigram_phrases = Phrases(bigram_phrases[new_data], threshold = 120, connector_words=connector_words)

    bigram = phrases.Phraser(bigram_phrases)
    trigram = phrases.Phraser(trigram_phrases)

    # Apply bigrams then trigrams in a single pass. The earlier separate
    # bigram-only pass was discarded immediately and has been removed.
    return [" ".join(trigram[bigram[tokens]]) for tokens in new_data]

In [6]:
# Run the cleaning pipeline over every entry of `df` and print the first
# cleaned document as a spot check.
clean = clean_texts(df)
print(clean[0])

behalf great state illinois crossroad nation land lincoln let express deep gratitude privilege address convention tonight particular honor let face presence stage pretty unlikely father foreign student bear raise small village kenya grow herd goat school tin roof shack father grandfather cook domestic servant british grandfather large dream son hard work perseverance father scholarship study magical place america shine beacon freedom opportunity come study father meet mother bear town world kansas father work oil rig farm depression day pearl harbor grandfather sign duty join patton army march europe home grandmother raise baby work bomber assembly line war study gi  bill buy house fha later west way hawaii search opportunity big dream daughter common dream bear continent parent share improbable love share abide faith possibility nation african barack bless believe tolerant america barrier success imagine imagine good school land rich generous america rich achieve potential pass know n

In [7]:
# Merge detected bigram/trigram phrases in the cleaned documents, then display
# the first document; merged phrases appear as underscore-joined tokens
# (e.g. "pearl_harbor" in the output below).
final_clean = find_n_grams(clean)
final_clean[0]

'behalf great state illinois crossroad nation land lincoln let express deep gratitude privilege address convention tonight particular honor let face presence stage pretty unlikely father foreign student bear raise small village kenya grow herd goat school tin roof shack father grandfather cook domestic servant british grandfather large dream son hard work perseverance father scholarship study magical place america shine beacon freedom opportunity come study father meet mother bear town world kansas father work oil rig farm depression day pearl_harbor grandfather sign duty join patton_army march europe home grandmother raise baby work bomber_assembly_line war study gi_bill buy house fha later west way hawaii search opportunity big dream daughter common dream bear continent parent share improbable love share abide faith possibility nation african barack bless believe tolerant america barrier success imagine imagine good school land rich generous america rich achieve potential pass know n

In [8]:
# Persist the cleaned corpus, one speech per line. writelines() adds no
# separators itself, so a newline is appended to each document; without it all
# speeches are written back-to-back with no document boundary.
# The encoding is explicit so the file does not depend on the platform default.
# The `with` block closes the file, so no explicit close() is needed.
with open("clean_obama_speeches.txt", "w", encoding="utf-8") as file:
    file.writelines(speech + "\n" for speech in final_clean)