In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pyarrow.parquet as pq
import seaborn as sns

## Adam Partie

Le but de cette tâche est de construire un graphique montrant la fréquence des quotes par parti politique pour chacun des Topics définis.

In [76]:
# Wikidata labels/descriptions lookup table, indexed by its 'QID' column.
QID = pd.read_csv(
    'wikidata_labels_descriptions_quotebank.csv.bz2',
    index_col='QID',
    compression='bz2',
)

# Speaker attributes, keeping only the first row for each distinct 'label'.
# NOTE(review): presumably 'label' is the speaker's name, so namesakes collapse
# into a single arbitrary row here - confirm this is acceptable.
speaker_df = (
    pd.read_parquet("speaker_attributes.parquet", engine='pyarrow')
    .drop_duplicates(subset=['label'])
)

In [77]:
# Load the quotes with their associated topic (produced at the end of the LDA file).
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
df = pd.read_pickle(filepath_or_buffer='quotes_topic_associated.pkl')

In [78]:
# Align the key name with the speaker attributes table ('label') for the next merge.
df = df.rename({'speaker': 'label'}, axis='columns')

In [79]:
# Peek at the first row to check the columns after the rename.
df.head(n=1)

Unnamed: 0,quoteID,quotation,label,date,numOccurrences,class
5265,2015-08-19-056730,"[it, s, very, important, to, understand, how, ...",Helen McGregor,2015-08-19 11:46:00,2,2


In [80]:
# Attach the speaker attributes to every quote (left join on 'label') so that
# the speaker's political party can be retrieved.
df_merged = df.merge(speaker_df, on='label', how='left')

In [81]:
# Discard quotes whose speaker has no party, then emit one row per
# (quote, party) pair for speakers associated with several parties.
df_merged = (
    df_merged
    .dropna(subset=['party'])
    .explode('party')
)

In [82]:
# Drop the attributes that are not needed for the party analysis.
df_merged = df_merged.drop(
    columns=[
        'nationality', 'aliases', 'date_of_birth', 'gender', 'lastrevid',
        'ethnic_group', 'US_congress_bio_ID', 'academic_degree', 'religion',
        'date', 'type', 'candidacy',
    ]
)

In [83]:
# The party column holds the key used by the QID table; name it accordingly
# so the two frames can be merged on 'QID'.
df_merged = df_merged.rename({'party': 'QID'}, axis=1)


In [84]:
# Inner join with the QID lookup table: the 'QID' column of df_merged is
# matched against the 'QID' index level of the lookup table.
df_new = df_merged.merge(QID, on='QID', how='inner')


In [85]:
# Keep only rows that have the party 'Label' and 'Description', the two
# looked-up columns used by the following cells.
# A bare dropna() would also discard quotes whose speaker merely has a
# missing value in an unrelated column, silently shrinking the counts.
df_new = df_new.dropna(subset=['Label', 'Description'])


In [86]:
# Keep only American parties, i.e. rows whose description mentions 'United States'.
is_us_party = df_new['Description'].str.contains('United States')
df_new = df_new.loc[is_us_party]


In [87]:
# Main step: within each class (i.e. topic), compute the share of quotes per
# political party. normalize=True yields frequencies that sum to 1 inside each
# class, sorted in descending order (the value_counts default).
# The result is stored in a single column named 'Frequency of Quotes'.
df_frequencies = (
    df_new
    .groupby('class')['Label']
    .value_counts(normalize=True)
    .to_frame(name='Frequency of Quotes')
)

In [88]:
# Parties holding at most this share of a topic's quotes are treated as marginal.
MIN_PARTY_SHARE = 0.015  # 1.5%

# NOTE: this cell overwrites df_frequencies, so it is not idempotent: re-run
# the previous cell before re-running this one, otherwise 'Topic' is shifted
# a second time.
df_frequencies = (
    df_frequencies
    # Turn the ('class', 'Label') index levels into regular columns.
    .reset_index()
    # Remove marginal parties.
    .loc[lambda freq: freq['Frequency of Quotes'] > MIN_PARTY_SHARE]
    # Give meaningful names to the attributes.
    .rename(columns={'class': 'Topic', 'Label': 'Party'})
    # Shift by one to recover the topic numbers
    # (presumably classes are 0-based while topics are numbered from 1).
    .assign(Topic=lambda freq: freq['Topic'] + 1)
)

In [89]:
# Display the per-topic party frequencies table.
df_frequencies

Unnamed: 0,Topic,Party,Frequency of Quotes
0,1,Democratic Party,0.666667
1,1,Republican Party,0.333333
2,2,Republican Party,0.777778
3,2,Democratic Party,0.222222
4,3,Republican Party,0.809211
5,3,Democratic Party,0.184211
7,4,Democratic Party,0.827149
8,4,Republican Party,0.159276
11,5,Democratic Party,0.64
12,5,Republican Party,0.36


In [90]:
# Bar chart of the quote frequencies: one facet per topic, one coloured bar
# per party.
fig = px.bar(
    df_frequencies,
    x="Party",
    y="Frequency of Quotes",
    color="Party",
    barmode="group",
    facet_col="Topic",
    width=900,
    height=500,
    title="Distribution of quotes by Party in each Topic",
)
# Only reposition the title here: update_layout merges nested properties, so
# the title text set by px.bar above is preserved.
fig.update_layout(
    title={
        'y': 0.9,
        'x': 0.45,
        'xanchor': 'center',
        'yanchor': 'top',
    }
)
fig.show()
# Uncomment to export the interactive figure as a standalone HTML file.
#fig.write_html("frequencies_per_topic.html")