# Milestone 2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, date, time
from dateutil.relativedelta import relativedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

##### Extracting quotes from politicians
In the following cell we load the provided metadata file, available [here](https://drive.google.com/drive/folders/1VAFHacZFh0oxSxilgNByb1nlNsqznUf0). This is used for obtaining a list of names of politicians, which we later use for filtering the QuoteBank dataset

In [None]:
df_meta = pd.read_parquet('speaker_attributes.parquet')
df_meta = df_meta[df_meta['occupation'].notna()]

# The identifier for the occupation politician in Wikidata is Q82955
politicians = df_meta[df_meta.occupation.apply(lambda x: 'Q82955' in x)]
politicians = politicians[politicians['aliases'].notna()]
politicians = politicians[politicians['label'].notna()]

# Make a combined list that contains both the aliases and the label names for the politicians
list_of_politicians = list(politicians.aliases.explode())
list_of_politicians += list(politicians.label)

We use the provided [Colab notebook](https://colab.research.google.com/drive/1NqLFrAWAzKxr2dAWHI7m6Ml3gWGF72cA?usp=sharing) for loading the QuoteBank dataset. This script creates a compressed file that is opened in the following cell

In [None]:
df_reader = pd.read_json('quotes-2020-domains.json.bz2', lines=True, compression='bz2', chunksize=10000)

df_politicians = pd.concat([
    chunk[chunk['speaker'].isin(list_of_politicians)]
    for chunk in df_reader])

##### Save to disk
Since it takes quite some time to load and filter the data, let's save it to a compressed .csv file that we can se.

In [None]:
compression_opts = dict(method='zip',archive_name='politicians.csv')  
df_politicians.to_csv('politicians.zip', index=False, compression=compression_opts)
compression_opts = dict(method='zip',archive_name='meta.csv')  
df_meta.to_csv('meta.zip', index=False, compression=compression_opts)

##### Read from disk

In [4]:
df_politicians = pd.read_csv("politicians.csv")
df_meta = pd.read_csv("meta.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Quote time correlation
##### Using quotes as signal
##### We retrieve all of Joe Biden's and Donald Trump's quotes for the year 2020 and the associated dates. We create a signal for each politician corresponding to the number of speeches recorded each week of the year. And finally we can visualise the correlation of the two signals but also the time lag between the two signals

In [None]:
df_politicians.date = pd.to_datetime(df_politicians.date)

In [None]:
signal_Joe_Biden = np.zeros([53,]) # 53 weeks in 2020 because of leap year
signal_Donald_Trump = np.zeros([53,]) # 53 weeks in 2020 because of leap year
signal_macron = np.zeros([53,]) # 53 weeks in 2020 because of leap year


biden_aliases = list(df_meta[df_meta['label'] == 'Joe Biden']['aliases'].values[0]) # We extract different ways to call him in the df

chunk_Joe_Biden = df_politicians[(df_politicians.speaker == "Joe Biden") | (df_politicians.speaker.isin(biden_aliases))] # We extract all his quotes
weeks_Joe_Biden = chunk_Joe_Biden.date.apply(lambda x: x.week).values # We ectract all the weeks of eachs quotes
signal_Joe_Biden, _ = np.histogram(weeks_Joe_Biden, range(1, signal_Joe_Biden.shape[0]+2)) # We create an histogram of number of quotes per weeks

# Same for Trump
trump_aliases = list(df_meta[df_meta['label'] == 'Donald Trump']['aliases'].values[0])

chunk_Donald_Trump = df_politicians[(df_politicians.speaker == "Donald Trump") | (df_politicians.speaker.isin(trump_aliases))] 
weeks_Donald_Trump = chunk_Donald_Trump.date.apply(lambda x: x.week).values
signal_Donald_Trump, _ = np.histogram(weeks_Donald_Trump, range(1, signal_Donald_Trump.shape[0]+2))

# Same for another politician
macron_aliases = list(df_meta[df_meta['label'] == 'Emmanuel Macron']['aliases'].values[0])

chunk_macron = df_politicians[(df_politicians.speaker == "Emmanuel Macron") | (df_politicians.speaker.isin(macron_aliases))] 
weeks_macron = chunk_macron.date.apply(lambda x: x.week).values
signal_macron, _ = np.histogram(weeks_macron, range(1, signal_macron.shape[0]+2))

##### Plot the results

In [None]:
plt.plot(signal_Joe_Biden, label="Joe Biden")
plt.plot(signal_Donald_Trump, label="Donald Trump")
plt.plot(signal_macron, label="Emmanuel Macron")
plt.xlim(1,signal_Donald_Trump.shape[0])
plt.xlabel("Weeks of 2020")
plt.ylabel("Number of speeches per week")
plt.title("Comparison of the number of speeches per week\nof Donald Trump and Joe Biden")
plt.legend();

In [None]:
np.correlate(signal_Donald_Trump/signal_Donald_Trump.sum(), signal_Joe_Biden/signal_Joe_Biden.sum())[0] # We anaylse the correlation between signals, normalized

In [None]:
np.correlate(signal_Donald_Trump/signal_Donald_Trump.sum(), signal_macron/signal_macron.sum())[0]

# Quote similarity
### In this section we will demonstrate how we can compute quote similarity using Tfidf. This will be demonstrated using quotes from Donald Trump and Joe Biden.

##### Extract and save Trump and Biden quote 

In [None]:
chunk_Donald_Trump.to_csv("Trump_2020_v2")
chunk_Joe_Biden.to_csv("Biden_2020_v2")

##### Read the csv saved of Trump and Biden

In [2]:
Trump = pd.read_csv("Trump_2020_v2")
Biden = pd.read_csv("Biden_2020_v2")

##### Learn the vocabulary from the quotes of all politcians in the data set 

In [5]:
corpus = df_politicians['quotation'].to_list()
vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words=['english'])
vectorizer = vectorizer.fit(corpus)

###### Pick a random quote from Donald Trump

In [6]:
random = 25
ref_Trump = Trump.loc[random]
trump_quote = [ref_Trump.quotation]

###### Extract quotes of Joe Biden within the following month after the quote by Donald Trump

In [7]:
ref_date = datetime.strptime(ref_Trump.date, '%Y-%m-%d %H:%M:%S')
end_date = ref_date + relativedelta(months=1)
Biden.date = Biden.date.apply(lambda d: datetime.strptime(d, '%Y-%m-%d %H:%M:%S'))
Biden_target = Biden[Biden.date>ref_date]
Biden_target = Biden_target[Biden_target.date<end_date]
Biden_target = Biden_target[["date", "quotation"]].sort_values(ascending=True, by="date")

##### Vectorize the quotes using Tfidf

In [9]:
biden_quotes = Biden_target.quotation.to_list()

trump_vec = vectorizer.transform(trump_quote)
biden_vec = vectorizer.transform(biden_quotes)

##### Rank the quotes of Joe Biden according to the cosine similarity to the quote by Donald Trump and select the most similar

In [10]:
similar_index = np.argmax(cosine_similarity(trump_vec, biden_vec))
print("Cosine similarity: %.3f" % cosine_similarity(trump_vec, biden_vec[similar_index])[0][0])
print("-------")
similar_quote = biden_quotes[similar_index]
print("Quote by Donald Trump: ",trump_quote[0])
print("-------")
print("Quote by Joe Biden: ",biden_quotes[similar_index])

Cosine similarity: 0.195
-------
Quote by Donald Trump:  In our discussions, Prime Minister Modi and I affirmed our two countries' commitment to protecting our citizens from radical Islamic terrorism. In this effort, the United States is also working productively with Pakistan to confront terrorists who operate on its soil.
-------
Quote by Joe Biden:  It is important to get through this crisis, protecting our public health and our democracy,
