# Setup

In [None]:
# This code chunk can be removed in the final notebook since the requirements.txt file lists all used packages!
!pip install requests openpyxl PyMuPDF glob2 nltk spacy pandas gensim Counter matplotlib seaborn wordcloud

In [None]:
!python3 -m spacy download en_core_web_sm
nltk.download('omw-1.4')
nltk.download('wordnet')

In [None]:
# accessing APIs and URLs
import requests

# static web scraping
from urllib.request import urlopen
from lxml.html import parse, fromstring

# regular expressions
import re

# downloading files
import urllib.request

# operating system
import os

# looping through folder
import glob

# reading pdfs
import fitz 

# disabling warnings
import warnings
warnings.filterwarnings('ignore')

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# nlp
import spacy

# data wrangling
import pandas as pd

# topic modelling
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# counting words
from collections import Counter

# data viz
import matplotlib.pyplot as plt

# graphs
import seaborn as sns

# word cloud
from wordcloud import WordCloud

# centering plots
from IPython.core.display import HTML

In [None]:
# center plots
# Returns an HTML object as the cell result, which injects the CSS rule below into the page.
# NOTE(review): the rule targets the `output_png` class - presumably the class the notebook
# front end puts on image outputs; confirm for the front end in use.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

# Data Collection and Import

In [None]:
# extract all links from website
# the context manager closes the HTTP response once the page has been parsed
with urlopen("https://www.americanrhetoric.com/barackobamaspeeches.htm") as response:
    tree = parse(response)

# only select anchors that carry an href attribute; anchors without one
# would otherwise raise a KeyError in the list comprehension below
linkelements = tree.xpath("//a[@href]")
list_links = [e.attrib["href"] for e in linkelements]

In [None]:
print("Number of links: " + str(len(list_links)))

In [None]:
# only retain pdf links
p = re.compile('.*pdf$')
# filter() keeps exactly those links for which p.match returns a match object
pdf_links = list(filter(p.match, list_links))

In [None]:
print("Number of pdf links: " + str(len(pdf_links)))

In [None]:
# concatenate base URL and path of every pdf link
# NOTE(review): assumes every entry of pdf_links is a path relative to the site root - confirm
baseurl = "https://www.americanrhetoric.com/"
full_pdf_links = [baseurl + link for link in pdf_links]

In [None]:
# create new folder
# exist_ok keeps the cell re-runnable once the folder has been created
folder_name = "obama_speeches"
os.makedirs(folder_name, exist_ok = True)

In [None]:
# download all pdf files
def download_files(links, folder):
    """Download every URL in `links` into `folder`.

    The files are stored as file_1.pdf, file_2.pdf, ... following the order
    of `links`. Nothing is returned.
    """
    for position, url in enumerate(links, start = 1):
        target = folder + "/file_" + str(position) + ".pdf"
        urllib.request.urlretrieve(url, target)

In [None]:
download_files(full_pdf_links, folder_name)

In [None]:
# import files
def import_pdfs(folder):
    """Read every pdf in `folder` and return a list with one text string per file.

    Files are visited in the natural order of the digit runs in their paths
    (file_2.pdf before file_10.pdf). The text of a file is the concatenation
    of the text of all of its pages.
    """
    digit_runs = re.compile(r'(\d+)')

    def numericalSort(value):
        # split the path into alternating text / digit chunks and compare the digit chunks as integers
        chunks = digit_runs.split(value)
        chunks[1::2] = [int(chunk) for chunk in chunks[1::2]]
        return chunks

    pdf_paths = glob.glob(folder + "/*.pdf")
    pdf_paths.sort(key = numericalSort)

    speech = []
    for pdf_path in pdf_paths:
        with fitz.open(pdf_path) as doc:
            pages = [page.get_text() for page in doc]
        speech.append("".join(pages))

    return speech

In [None]:
list_speeches = import_pdfs(folder_name)

# Data Pre-Processing

## Noise Removal

In [None]:
# noise removal and standardisation
def noise_removal(texts):
    """Strip transcript boilerplate from raw speech texts and standardise them.

    For every text, in this order: pad line breaks that touch a letter with a
    space, remove the page footers, remove everything up to and including the
    line with the speech date, remove everything up to and including the line
    with the AUTHENTICITY statement, drop line breaks, collapse whitespace,
    lower-case, and remove every character that is not a letter, digit,
    whitespace or '/'.

    Parameters
    ----------
    texts : iterable of str
        Raw texts, one per speech.

    Returns
    -------
    list of str
        The cleaned texts, in input order.
    """
    # All patterns are raw strings (no invalid escape sequences such as '\s' in a
    # normal string literal) and are compiled once instead of once per text.
    break_before_word = re.compile(r'(\n)([A-Za-z\\[])')
    break_after_word = re.compile(r'([A-Za-z])(\n)')
    footer_page = re.compile(r'(AAm|AmericanRhetoric\.com)\s((.||\n)*?)\sPage\s\d{1,2}')
    footer_property = re.compile(r'(meerriiccaannR)\s((.||\n)*?)\s(Property)')
    up_to_date_line = re.compile(r'^((.|\n)*)\s(\d{1,2}\s{1,2}[a-zA-Z]{3,9},?\s\d{4},?)\s.*\s\n')
    up_to_authenticity = re.compile(r'^((.|\n)*)\s(\[?AUTHENTICITY)\s.*\s\n')
    whitespace_run = re.compile(r'\s+')
    unwanted_chars = re.compile(r'[^a-zA-Z\d\s/]')

    no_noise = []
    for text in texts:

        # separate line breaks from words
        text = break_before_word.sub(r"\1 \2", text)
        text = break_after_word.sub(r"\1 \2", text)

        # remove footer
        text = footer_page.sub('', text)
        text = footer_property.sub('', text)

        # remove everything up until (and including) the sentence with the date of the speech
        text = up_to_date_line.sub('', text)

        # remove everything up until (and including) the statement about transcription
        text = up_to_authenticity.sub('', text)

        # remove line breaks
        text = text.replace("\n", "")

        # remove multiple white spaces
        text = whitespace_run.sub(" ", text)

        # lower case
        text = text.lower()

        # remove punctuation and most special characters
        text = unwanted_chars.sub("", text)
        no_noise.append(text)

    return no_noise

In [None]:
no_noise = noise_removal(list_speeches)

In [None]:
# stopword removal
stop_words = stopwords.words("english")
# look tokens up in a set (constant-time membership test); the list keeps its
# original name because later cells use it
stop_word_lookup = set(stop_words)
no_stopwords = [[word for word in word_tokenize(text) if word not in stop_word_lookup] for text in no_noise]

## N-Grams

In [None]:
#create n-grams
def n_grams(texts, min_count, threshold):
    """Merge frequent word pairs and triples in tokenised documents.

    Parameters
    ----------
    texts : list of list of str
        Tokenised documents.
    min_count : int
        Passed as `min_count` to the bigram Phrases model.
    threshold : float
        Passed as `threshold` to both Phrases models.

    Returns
    -------
    list
        One entry per document: the result of applying the bigram model and
        then the trigram model to that document.
    """
    # setup
    bigram = gensim.models.Phrases(texts, min_count = min_count, threshold = threshold)
    # NOTE(review): min_count is not passed on here, so the trigram model falls back
    # to the library default - confirm that this is intended
    trigram = gensim.models.Phrases(bigram[texts], threshold = threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # apply the bigram model first and the trigram model on top of it; the
    # bigram-only list that used to be built here was overwritten before use
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [None]:
bigrams_trigrams = n_grams(no_stopwords, 5, 100)

## Text Normalisation

In [None]:
# text normalisation with pos tags
def text_normalisation(texts):
    """Lemmatise tokenised documents, keeping only nouns, adjectives, verbs and adverbs.

    `texts` is an iterable of token lists; the result is a list with one list
    of lemmas per document.
    """
    nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])
    postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    def lemmas_of(tokens):
        # the pipeline is called on a string, so the tokens are joined back into one text
        parsed = nlp(" ".join(tokens))
        return [token.lemma_ for token in parsed if token.pos_ in postags]

    return [lemmas_of(tokens) for tokens in texts]

In [None]:
normalised_data = text_normalisation(bigrams_trigrams)

## Removal of Low and High Frequency Words

In [None]:
# compute vocabulary size
def count_words(texts):
    """Return a Counter with the number of occurrences of every token over all documents."""
    frequencies = Counter()
    for document in texts:
        for token in document:
            frequencies[token] += 1
    return frequencies

def print_vocab_size(texts):
    """Print the number of entries in `texts` (called with the Counter from count_words)."""
    size = len(texts)
    print("Total Vocabulary Size: " + str(size))

In [None]:
count_words_pre_cleaning = count_words(normalised_data)
print_vocab_size(count_words_pre_cleaning)

In [None]:
# compute word frequencies
# one row per distinct token (the Counter keys become the index); 'freq' holds its count
dict_df = pd.DataFrame.from_dict(count_words_pre_cleaning, orient = 'index', columns = ['freq'])
# share of each token in the sum of all counts, in percent
dict_df['perc'] = (dict_df['freq'] / dict_df['freq'].sum()) * 100
# show the 15 rows with the highest share
dict_df.sort_values('perc', ascending = False).head(15).round(2)

In [None]:
# collect low and high frequency words plus a few hand-picked filler words
min_freq = 20; max_freq = 2300
extension = dict_df[(dict_df.freq <= min_freq) | (dict_df.freq >= max_freq)].index.tolist()
extension = extension + ["lot", "thing", "let", "use", "sure", "look", "tell", "many", "much", "thank"]

# remove stopwords
# the extended set is built without mutating stop_words, so re-running this
# cell no longer keeps appending the same words to that list
stop_words_extended = set(stop_words).union(extension)
cleaned_data = [[token for token in text if token not in stop_words_extended] for text in normalised_data]

# compute vocabulary size
count_words_post_cleaning = count_words(cleaned_data)
print_vocab_size(count_words_post_cleaning)

In [None]:
# create term document frequency
# id2word: gensim Dictionary built from the tokens in cleaned_data
id2word = corpora.Dictionary(cleaned_data)
# corpus: one doc2bow (bag-of-words) result per document in cleaned_data
corpus = [id2word.doc2bow(text) for text in cleaned_data]

# Topic Modelling

## Determining the Number of Topics

In [None]:
# 'elbow' method
def coherence_values(corpus, id2word, start, limit, step, texts = None):
    """Fit one LDA model per candidate number of topics and return the c_v coherence of each.

    Parameters
    ----------
    corpus
        Bag-of-words corpus passed to LdaModel.
    id2word
        Dictionary passed to LdaModel and CoherenceModel.
    start, limit, step : int
        The candidate topic numbers are range(start, limit, step).
    texts : optional
        Tokenised documents passed to CoherenceModel. When omitted, the
        notebook-level `cleaned_data` is used, which is what the function
        read before this parameter existed.

    Returns
    -------
    list of float
        One coherence value per candidate, in the order of the range.
    """
    if texts is None:
        texts = cleaned_data

    scores = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus = corpus, id2word = id2word, num_topics = num_topics,
                                                random_state = 100, update_every = 1, chunksize = 100,
                                                passes = 10, alpha = 'auto', per_word_topics = True)
        coherencemodel = CoherenceModel(model = model, texts = texts, dictionary = id2word,
                                        coherence = 'c_v')
        # only the score is kept; the fitted models are not returned, so they are not collected
        scores.append(coherencemodel.get_coherence())
    return scores

In [None]:
start = 2; limit = 20; step = 2
# NOTE: the result is bound to the same name as the function. After this line
# `coherence_values` is the list of scores, so re-running this cell requires
# re-running the cell that defines the function first.
coherence_values = coherence_values(corpus, id2word, start, limit, step)

In [None]:
# plot results
plt.rcParams['figure.figsize'] = [14, 7]
x = range(start, limit, step)
plt.plot(x, coherence_values, label = "Coherence Values")
plt.xlabel("Number of Topics", fontsize = 20)
plt.ylabel("Coherence Values", fontsize = 20)
plt.legend(loc="upper right", fontsize = 15)
plt.title('Elbow Method', fontsize = 25, pad = 20)
plt.show()

In [None]:
# examine coherence values
for number_topics, cv in zip(x, coherence_values):
    print(number_topics, 'topics have a coherence value of', round(cv, 3))

Based on the coherence values, 8 topics seem to be the best choice for our data. However, a coherence value of 0.462 is rather low, which suggests that more pre-processing may be needed.

## Latent Dirichlet Allocation

In [None]:
# build topic model
number_topics = 8
topic_model_lda = gensim.models.ldamodel.LdaModel(corpus = corpus, id2word = id2word, num_topics = number_topics, 
                                                  random_state = 100, update_every = 1, chunksize = 100, 
                                                  passes = 14, alpha = 'auto', per_word_topics = True)

In [None]:
# compute perplexity
print('Perplexity: ', round(topic_model_lda.log_perplexity(corpus), 3))

# compute coherence value
coherence_topic_model_lda = CoherenceModel(model = topic_model_lda, texts = cleaned_data, dictionary = id2word, 
                                           coherence = 'c_v')
coherence_topic_model_lda_values = coherence_topic_model_lda.get_coherence()
print('Coherence Value: ', round(coherence_topic_model_lda_values, 3))

**TODO:** add an interpretation of the perplexity and coherence value reported above.

In [None]:
# create data frame of topics with corresponding keywords
lda_topics = [[(term, round(weight, 3)) for term, weight in topic_model_lda.show_topic(n, topn = 20)] 
              for n in range(0, topic_model_lda.num_topics)]
lda_topics_df = pd.DataFrame([', '.join([term for term, weight in topic]) 
                              for topic in lda_topics], columns = ['keywords'],
                             index = ['topic_'+str(t) for t in range(1, topic_model_lda.num_topics + 1)] )

In [None]:
# replace generic index with topic names
index_names = lda_topics_df.index.values.tolist()
topic_names = ["defence", "national pride", "labour market", "health care", "financial sector", 
               "support system", "future", "political ambitions"]
lda_topics_df = lda_topics_df.rename(index = dict(zip(index_names, topic_names)))

In [None]:
# print data frame
# None is the documented setting for "do not truncate cell contents"
pd.set_option('display.max_colwidth', None)
lda_topics_df

In [None]:
# reset display settings
pd.reset_option('^display.', silent = True)

In [None]:
# plot settings
wc = WordCloud(background_color = "white", colormap = "tab10",
               max_font_size = 150, random_state = 42)
plt.rcParams['figure.figsize'] = [20, 8]

# create wordcloud for each topic (2 x 4 grid, one panel per topic)
for i in range(topic_model_lda.num_topics):
    # the Series is indexed by topic labels, so the i-th entry is taken by position with .iloc
    wc.generate(text = lda_topics_df["keywords"].iloc[i])
    plt.subplot(2, 4, i + 1)
    plt.imshow(wc, interpolation = "bilinear")
    plt.axis("off")
    plt.title(lda_topics_df.index[i], fontsize = 22, y = 1.2)

## Topic Distribution by Years

In [None]:
# extract distribution of topics by speech
topic_distribution_speeches = [topic_model_lda.get_document_topics(item, 
                                                                   minimum_probability = 0.0) for item in corpus]

In [None]:
# create data frame of topic distributions by speech
# every row of topic_distribution_speeches is a sequence of (topic id, probability) pairs
records = [dict(row) for row in topic_distribution_speeches]
lda_df_proportions = pd.DataFrame.from_records(records)
lda_df_proportions.columns = topic_names
# 1-based running number, used as the key for merging with the speeches
lda_df_proportions['file'] = lda_df_proportions.reset_index().index + 1
lda_df_proportions = lda_df_proportions.set_index('file')

In [None]:
# create data frame of speeches
d = {'speech': list_speeches}
speeches_df = pd.DataFrame(d)

# add file name (1-based running number, matching the file_<n>.pdf numbering)
speeches_df['file'] = speeches_df.reset_index().index + 1
speeches_df = speeches_df.set_index('file')

# extract dates of speeches (first match per speech)
date_reg = r'(\d{1,2}\s{1,2}[a-zA-Z]{3,9},?\s?\d{4}?)'
speeches_df['date'] = speeches_df['speech'].str.extract(date_reg, expand = False)

# manually fix dates that were not picked up by regex
speeches_df.at[271, 'date'] = '2014-07-18'
speeches_df.at[329, 'date'] = '2015-07-15'
speeches_df.at[377, 'date'] = '2016-02-26'
speeches_df['date'] = pd.to_datetime(speeches_df['date'])

# drop speech column
# the keyword form replaces drop('speech', 1): newer pandas versions no longer
# accept the axis as a positional argument
speeches_df = speeches_df.drop(columns = 'speech')

In [None]:
# merge both data frames
df_merged = pd.merge(lda_df_proportions, speeches_df, on = 'file')

In [None]:
# extract year of speeches
df_merged['year'] = df_merged['date'].dt.year.convert_dtypes()

# average topic distribution per year
topic_distribution_df = df_merged.groupby('year', as_index = False)[topic_names].mean().copy()

# transform data frame from wide to long format
topic_distribution_df_melt = topic_distribution_df.melt(id_vars = 'year', value_vars = topic_names, 
                                                        var_name = 'topic', value_name = 'prevelance')

In [None]:
# plot topic distribution by year
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = [14, 7]
topics_by_year = sns.lineplot(data = topic_distribution_df_melt, x = "year", y = "prevelance", hue = "topic",
                              linewidth = 2.5)
topics_by_year.set_xlabel("Year", fontsize = 20)
topics_by_year.set_ylabel("Prevalence", fontsize = 20)
plt.legend(ncol = 2, loc = 'upper center', fontsize = 13, title = "Topics", title_fontsize = 15, markerscale = 1.5)
plt.title('Topic Distribution by Year', fontsize = 25, pad = 20)
plt.show()

# Topic Prediction

## Predictors

In [None]:
# ADD CAPTIONS

from pprint import pprint
from gensim.models import CoherenceModel
from wordcloud import WordCloud
from collections import Counter
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import pyLDAvis
import pyLDAvis.gensim_models as LDAgensim


In [None]:
# We construct our LDA model

warnings.filterwarnings('ignore')
model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics = 6, 
                                            random_state = 100, update_every = 1, chunksize = 100, passes = 14, alpha = 'auto', per_word_topics=True) # num_topics = 6: six topics are fitted
pprint(model.print_topics())
# result of indexing the model with the whole corpus
model_cor = model[corpus]

In [None]:
# Now we calculate coherence score and perplexity

model_coher = CoherenceModel(model=model, texts=cleaned_data, dictionary=id2word, coherence='c_v')
coher_s = model_coher.get_coherence()
# print the value computed above (the name printed before was never defined)
print('Coherence Score: ', coher_s)
print('Perplexity: ', model.log_perplexity(corpus))

To find the most important words for each topic, we first find the dominant topics by taking the distribution of the topics per document

In [None]:
# topic distribution of every document; minimum_probability=0.0 is passed so that
# low-probability topics are not filtered out
topic_dist = [model.get_document_topics(item, minimum_probability=0.0) for item in corpus]
# preview only - the full list has one entry per speech
topic_dist[:5]

In [None]:
# (topic id, probability) pair with the highest probability for every document
top_cor = [sorted(topics, key=lambda record: -record[1])[0] for topics in topic_dist]
# preview only - the full list has one entry per speech
top_cor[:5]

In [None]:
model.num_topics

In [None]:
topics = [[(term, round(wt, 3)) for term, wt in model.show_topic(n, topn=20)] for n in range(0, model.num_topics)]
topics

Next, we construct a dataframe matrix for the topics and keywords

In [None]:
topics_mat = pd.DataFrame([[term for term, wt in topic] for topic in topics], columns = ['Keyword '+str(i) for i in range(1, 21)],
                         index=['Topic '+str(t) for t in range(1, model.num_topics+1)]).T
topics_mat.head()

The keywords per topic are now viewable

In [None]:
# None is the documented setting for "do not truncate cell contents";
# the former value -1 is deprecated and rejected by newer pandas versions
pd.set_option('display.max_colwidth', None)

# one row per topic with its keywords joined into a single string
topics_mat = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics], columns = ['Topic Keywords'],
                         index=['Topic'+str(t) for t in range(1, model.num_topics+1)] )
topics_mat

### Wordcloud

We construct a wordcloud for our LDA model from the keywords for each topic

In [None]:
cloud = WordCloud(background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)
plt.rcParams['figure.figsize'] = [20, 10]

# We also construct subplots per topic (2 x 3 grid, one panel per topic)
for i in range(model.num_topics):

    # the Series is indexed by topic labels, so the i-th entry is taken by position with .iloc
    cloud.generate(text=topics_mat["Topic Keywords"].iloc[i])

    plt.subplot(2, 3, i+1)
    # show the cloud generated in this loop, not the `wc` object of the earlier model
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(topics_mat.index[i])

plt.show()

### Word count and word weights or significance

In [None]:
# we create the data frame for the word count and keyword weights
tops = model.show_topics(formatted=False)
flat_data = [w for w_list in cleaned_data for w in w_list]
# number of occurrences of every token over all cleaned documents
counts = Counter(flat_data)

# one row per (topic, keyword): keyword, topic id, keyword weight, corpus-wide count
output = []
for i, topic in tops:
    for word, weight in topic:
        output.append([word, i , weight, counts[word]])

df = pd.DataFrame(output, columns=['word', 'topic_id', 'weights', 'word_count'])

In [None]:
# now we plot the word count and the keyword weights
fig, axes = plt.subplots(2, 3, figsize=(16,10), sharey=True, dpi=160)
colors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    # rows of the current topic, selected once per panel
    topic_rows = df.loc[df.topic_id==i, :]
    ax.bar(x='word', height="word_count", data=topic_rows, color=colors[i], width=0.5, alpha=0.3, label='Word Count')
    # second y axis on the same panel for the keyword weights
    axtwin = ax.twinx()
    axtwin.bar(x='word', height="weights", data=topic_rows, color=colors[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=colors[i])
    axtwin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=colors[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); axtwin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Weights of Topic Keywords', fontsize=22, y=1.05)
plt.show()

### We investigate the number of speeches corresponding to a topic

In [None]:
def speeches_per_topic (model, corpus, start=0, end=1):
    """Return the dominant topic and the topic weights of the documents in corpus[start:end].

    Parameters
    ----------
    model
        Object that returns three values when indexed with a document; the
        first one is a sequence of (topic id, weight) pairs.
        NOTE(review): presumably an LdaModel built with per_word_topics=True - confirm.
    corpus
        Sliceable collection of documents.
    start, end
        Slice bounds applied to `corpus`.

    Returns
    -------
    tuple
        (dominant_topics, percentage_topic): a list of
        (position within the slice, id of the highest-weighted topic) pairs and
        a list with the (topic id, weight) pairs of every document.
    """
    full_corpus = corpus[start:end]
    dominant_topics = []
    percentage_topic = []
    for i, corp in enumerate(full_corpus):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        # highest weight first; the topic id of the first pair is the dominant topic
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        percentage_topic.append(topic_percs)
    return(dominant_topics, percentage_topic)

# call the function defined above with the model of this section; end=len(corpus)
# covers every speech (end=-1 would leave out the last one)
domtopics, percentage_topic = speeches_per_topic(model=model, corpus=corpus, end=len(corpus))

# Dominant Topics per speech
# stored under its own name so that the word-count frame `df` is not overwritten
dominant_topic_df = pd.DataFrame(domtopics, columns=['Document_Id', 'Dominant_Topic'])
speech_dom_top = dominant_topic_df.groupby('Dominant_Topic').size()
df_speech_dom_top = speech_dom_top.to_frame(name='count').reset_index()

# Distribution of topics by weight
doc_weight = pd.DataFrame([dict(t) for t in percentage_topic])
df_doc_weight = doc_weight.sum().to_frame(name='count').reset_index()

# 3 main keywords per topic
keywords3 = [(topic_id, word) for topic_id, terms in model.show_topics(formatted=False)
                              for word, wt in terms[:3]]

stacked_df_keywords3 = pd.DataFrame(keywords3, columns=['topic_id', 'words'])
df_keywords3 = stacked_df_keywords3.groupby('topic_id').agg(', \n'.join)
df_keywords3.reset_index(level=0,inplace=True)

In [None]:
# Plot speeches per dominant topic
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10), dpi=120, sharey=True)

# Dominant topic distribution
ax1.bar(x='Dominant_Topic', height='count', data=df_speech_dom_top, width=.5, color='firebrick')
ax1.set_xticks(range(len(df_speech_dom_top.Dominant_Topic.unique())))
# tick label: 'Topic <tick value>' followed by the keywords stored for that topic id in df_keywords3
formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_keywords3.loc[df_keywords3.topic_id==x, 'words'].values[0])
ax1.xaxis.set_major_formatter(formatter)
ax1.set_title('Speeches by dominant topic', fontdict=dict(size=10))
ax1.set_ylabel('Speeches')
ax1.set_ylim(0, 1000)

# Topic weights distribution
ax2.bar(x='index', height='count', data=df_doc_weight, width=.5, color='steelblue')
ax2.set_xticks(range(len(df_doc_weight.index.unique())))
ax2.xaxis.set_major_formatter(formatter)
ax2.set_title('Speeches by topic weights', fontdict=dict(size=10))
plt.show()

### LDA interactive visualization

Shows the topics and their keywords. 

In [None]:
pyLDAvis.enable_notebook()
# pyLDAvis.gensim_models is imported under the alias LDAgensim in this notebook
interactive = LDAgensim.prepare(model, corpus, id2word)
interactive

### Speech dominant topics

In [None]:
# one row per speech, built from the (topic id, probability) pair of its dominant topic in top_cor;
# the keywords come from topics_mat, the keyword frame of the model used in this section
df_cor = pd.DataFrame()
df_cor['Dominant Topic'] = [item[0]+1 for item in top_cor]
df_cor['Contribution %'] = [round(item[1]*100, 2) for item in top_cor]
df_cor['Topic Terms'] = [topics_mat.iloc[t[0]]['Topic Keywords'] for t in top_cor]

df_cor.head(5)

### Percentages of dominant topics

In [None]:
# number of speeches per dominant topic; the string 'size' names the group-size
# aggregation, so numpy does not have to be imported
dom_df = df_cor.groupby('Dominant Topic').agg(
                                  Doc_Count = ('Dominant Topic', 'size')).reset_index()

# share of all speeches in percent, derived from the count column
dom_df['Total speech %'] = dom_df['Doc_Count'].apply(lambda row: round((row*100) / len(corpus), 2))

dom_df.sort_values('Total speech %')

### LDA weights by topic

In [None]:
# one row per speech, one column per topic of the model used in this section
weights = pd.DataFrame.from_records([dict(row) for row in topic_dist])
# derive the number of column labels from the model instead of a fixed range,
# which produced one label too many
weights.columns = ['Topic ' + str(i) for i in range(1, model.num_topics + 1)]
weights

In [None]:
# df_merged is the frame that carries the 'year' column read from df2 in the following cells;
# the name `speech` used here before is not defined at notebook level
df2 = df_merged.copy()

In [None]:
df2['year'].reset_index(drop=True)

In [None]:
weights['year'] = df2.year.reset_index(drop=True)

In [None]:
weights.head(5)

In [None]:
weights['prevalent'] = weights.drop('year', axis=1).idxmax(axis=1)

In [None]:
weights.head(8)

In [None]:
weights.groupby('year')['prevalent'].value_counts(normalize=True)

In [None]:
weight_dominance = weights.groupby('year')['prevalent'].value_counts(normalize=True).unstack().fillna(0)
weight_dominance

### LDA Topic Distribution from 2004 to 2016

In [None]:
weight_dom_year = weights.groupby('year')['prevalent'].value_counts(normalize=True).unstack().fillna(0).reset_index().copy()
weight_dom_year.head(5)

In [None]:
topic_columns = ['Topic ' + str(i) for i in range(1, model.num_topics + 1)]
# a topic that is never the prevalent one has no column after unstack();
# reindex adds it with zeros so that melt() does not fail on a missing column
weight_melted_year = (weight_dom_year
                      .reindex(columns = ['year'] + topic_columns, fill_value = 0)
                      .melt(id_vars = 'year', value_vars = topic_columns, var_name = 'Topic', value_name = 'prevelance'))
weight_melted_year

### Export and Plot of the Topic Trend

In [None]:
# write to the working directory instead of a user-specific absolute path,
# so the cell also runs on other machines
weight_melted_year.to_excel("topic_trend.xlsx")

In [None]:
sns.set_style("whitegrid")
# plot the long-format frame created above (weight_melted_year)
_ = sns.lineplot(data=weight_melted_year, x="year", y="prevelance", hue="Topic", style="Topic", palette='Dark2')