In [148]:
import numpy as np
import pandas as pd

## Sentiment weighting w.r.t. the number of occurrences of the speaker

In [150]:
# Directory holding the yearly quote files (us_<year>.pkl.bz2)
path_us = 'data/'
dfs = []

cols_of_interest = ['quoteID', 'speaker', 'sentiment']

# Read the dataset for each year, tag every row with its year and merge them into "df_full".
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
for i in range(2015, 2021):
    # .assign returns a new frame, so no column is written onto a slice of the
    # unpickled frame and no extra .copy() is needed before appending.
    df = pd.read_pickle(path_us + "us_" + str(i) + ".pkl.bz2", compression='bz2')[cols_of_interest].assign(year=i)
    dfs.append(df)
    print(i)

# `df` intentionally stays bound to the last year loaded: the inspection cells
# that follow call len(df) and df.columns, which fail if it is reset to None.
# pd.concat keeps each yearly frame's own row labels (no ignore_index).
df_full = pd.concat(dfs)

2015
2016
2017
2018
2019
2020


In [151]:
# Row counts: last yearly frame loaded vs. all years combined.
# The yearly frame is read from `dfs` instead of relying on the loop variable
# `df` still holding a frame when this cell runs.
print(len(dfs[-1]))
print(len(df_full))

343004
4230846


In [153]:
# Columns of the merged frame; does not depend on the loader's loop variable `df`.
df_full.columns

Index(['quoteID', 'speaker', 'sentiment', 'year'], dtype='object')

In [154]:
# Normalise speaker strings so that one entity is not mapped to two different keys.

# Static rules written after analysing the dataset: full-name variant -> canonical name.
# NOTE(review): some rules fold what look like different people into one key
# ('george h.w. bush' / 'george w. bush', 'donald trump jr.', 'nicolás maduro guerra')
# -- confirm this is intended.
pres_dict_sur = {
    'george w. bush': 'george bush',
    'george h.w. bush': 'george bush',
    'donald j. trump': 'donald trump',
    'donald trump jr.': 'donald trump',
    'nicolás maduro guerra': 'nicolas maduro',
    'nicolás maduro': 'nicolas maduro',
}

# Surname-only forms (what is left of "President <surname>" once the title is
# stripped) -> full name. The result is still passed through pres_dict_sur.
pres_dict = {
    'trump': 'donald trump',
    'obama': 'barack obama',
    'clinton': 'bill clinton',
    'bush': 'george w. bush',
    'carter': 'jimmy carter',
    'moon': 'moon jae-in',
    'maduro': 'nicolas maduro',
}

# Title removed from the start of a speaker name (trailing space included).
_TITLE_PREFIX = 'president '


def format_string(s):
    """Return a canonical lower-case key for a speaker name.

    Example: "Donald Trump" and "President Donald J. Trump" both map to
    "donald trump".

    Parameters
    ----------
    s : str or missing value
        Raw speaker name.

    Returns
    -------
    The lower-cased name with a leading "president " title removed and the
    static alias rules applied. Anything that is not a string (None, NaN, ...)
    is returned unchanged so missing values stay missing instead of raising.
    """
    if not isinstance(s, str):
        return s

    a = s.lower()
    # Strip the title only when it leads the name. Removing it anywhere in the
    # string would turn "vice president mike pence" into "vice mike pence".
    if a.startswith(_TITLE_PREFIX):
        a = a[len(_TITLE_PREFIX):]
        a = pres_dict.get(a, a)

    return pres_dict_sur.get(a, a)

In [155]:
# Normalised speaker name for every quote
df_temp = df_full['speaker'].map(format_string)

# Copy of the full frame whose speaker column holds the normalised names
df_speak = df_full.assign(speaker=df_temp)
df_speak

Unnamed: 0,quoteID,speaker,sentiment,year
0,2015-06-18-018819,chris matthews,-0.228475,2015
1,2015-10-25-000242,bernie sanders,0.798322,2015
2,2015-06-26-015863,gary peters,-0.941794,2015
3,2015-06-12-017391,rachel dolezal,-0.978019,2015
4,2015-10-23-001330,michelle goldberg,0.283719,2015
...,...,...,...,...
60889,2020-01-09-095402,donald trump,0.277540,2020
60892,2020-01-24-095953,none,-0.606498,2020
60893,2020-02-20-082429,none,-0.804123,2020
60894,2020-02-24-068286,donald trump,0.996957,2020


In [156]:
# Number of quotes per normalised speaker
count_occ = df_speak.groupby('speaker').size()

# Same counts as a two-column frame: speaker, speaker_occ
df_count_occ = count_occ.reset_index(name="speaker_occ")
df_count_occ.dtypes

speaker        object
speaker_occ     int64
dtype: object

In [157]:
# Spot check: occurrence count of a single speaker
df_count_occ.loc[df_count_occ['speaker'].eq('donald trump')]

Unnamed: 0,speaker,speaker_occ
53762,donald trump,119110


In [158]:
# Add to the speaker dataframe the number of occurrences of the speaker.
# join() defaults to a left join, so every quote of df_speak is kept.
df_speaker_occ = (
    df_speak.set_index('speaker')
    .join(df_count_occ.set_index('speaker'), on='speaker')
    .reset_index()
)

# One row per quoteID (the first occurrence wins).
final_df = df_speaker_occ.drop_duplicates('quoteID').copy()

# Quotes whose speaker has no count (e.g. a missing speaker, which a groupby
# skips by default) fall back to a single occurrence, so their sentiment is
# left unweighted instead of becoming NaN. Appending a second frame with
# speaker_occ = 1 and de-duplicating cannot provide this fallback: the joined
# rows always come first and are the ones drop_duplicates keeps.
final_df['speaker_occ'] = final_df['speaker_occ'].fillna(1).astype('int64')

# Weight the sentiment w.r.t. the number of occurrences of the speaker
final_df['weighted_sentiment'] = final_df['sentiment'] / final_df['speaker_occ']

In [162]:
# Quotes attributed to the placeholder speaker 'none' keep their raw sentiment
none_ind = final_df['speaker'].eq('none')
final_df.loc[none_ind, 'weighted_sentiment'] = final_df.loc[none_ind, 'sentiment']

In [166]:
# Store the results: one bz2-compressed pickle per year
for i in range(2015, 2021):
    df_year = final_df[final_df["year"] == i]
    # The name follows the input scheme (us_<year>.pkl.bz2).
    # NOTE: files written with the earlier name "us_weighted_sent_<year>pkl.bz2"
    # (no dot before "pkl") are not overwritten; readers must use the dotted name.
    name = "data/us_weighted_sent_" + str(i) + '.pkl.bz2'
    df_to_store = df_year[["quoteID", 'speaker', "speaker_occ", 'weighted_sentiment']]
    # Rename (not add) the speaker column: it holds the formatted speaker name
    df_to_store = df_to_store.rename(columns={"speaker": "speaker_l"})

    df_to_store.to_pickle(name, compression='bz2')