In [None]:
# if you dont have chartmodels pip install it (uncomment next line of code)
#!pip install chart_studio
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
import numpy as np
import pandas as pd
from helpers import *
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import chart_studio
import plotly.express as px
import chart_studio.plotly as py
import chart_studio.tools as tls

In [37]:
def send_figure_to_internet(fig,filename):
    """uploads a figure to pyplot express api service in order to include them in our website"""
    username = 'ogim'
    api_key = 'z9ZRECEZuhRpmGQjGcMX' # your api key - go to profile > settings > regenerate key
    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
    url = py.plot(fig, filename = filename, auto_open=True)
    print(url)
    print('link:')
    print(tls.get_embed(url)) #change to your url

In [3]:
# data import
file = QUOTES_LABELED_CLEANED_PREDICTED
data = pd.read_json(file, lines=True, compression='bz2')
display(data.head(5))

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use,prob_dem
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...,train,0.167571
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...,test,0.250535
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...,train,0.791943
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...,train,0.901145
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...,test,0.980576
...,...,...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...,train,0.284423
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...,train,0.466911
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward,train,0.605841
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor,train,0.191033


In [4]:
#Loading of the pretrained model 
with open(MODEL_FINAL, 'rb') as file:
    clf_loaded = pickle.load(file)
#Loading of the pretrained vectorizer
with open(VECTORIZER_FINAL, 'rb') as file:
    vectorizer_loaded = pickle.load(file)
    
#Predicting function
def predict(quotes):
    y_proba = clf_loaded.predict_proba(vectorizer_loaded.transform(quotes))
    return y_proba[:,1] 

## How does a person's politicial opinions fluctuate over time?
We have assigned a score to each quotation, now we can see how the political opinion of a politician or a group of politicians varies over time.
#### Who is the influential Republican who has a score that is more polarized in the period (2015-2020)? 
To find out, we can fit a linear regression to the quotes scores and look at the slope of the time series.

In [45]:
#Just list of influential American politicians with theris Qids (from wikidata)
influential_R = ['Donald Trump','Arnold Schwarzenegger','Mike Pence','Ted Cruz','Sarah Palin','Mitch McConnel','Mitt Romney','Dick Cheney','Jeb Bush','Chris Christie','Rand Paul','Ben Carson','Henry Kissinger','Marco Rubio'] #source https://today.yougov.com/ratings/politics/fame/Republicans/all
id_inf_R = ['Q22686','Q2685','Q24313','Q2036942','Q43144','Q355522','Q4496','Q48259','Q221997','Q63879','Q463557','Q816459','Q66107','Q324546']
influential_D = ['Hilary Clinton','Barack Obama','Bernie Sanders','Bill Clinton','Joe Biden','Kamala Harris','Nancy Pelosi','Al Gore','Andrew Cuomo','Michael Bloomberg','Alexandria Ocasio-Cortez','Charles Schumer']
id_inf_D = ['Q6294','Q76','Q359442','Q1124','Q6279','Q10853588','Q170581','Q19673','Q11673','Q607','Q55223040','Q380900']

In [23]:
def get_quotes_by_speaker_id(ids,path,min_length_quote = 0,keep_RD = False , keep_exclusives = True):
    """given a set of speaker ids, extract all quotes of these speakers from dataset"""
    DF_PATH = path
    result = []
    i = 0
    with pd.read_json(DF_PATH,lines = True, compression= 'bz2',chunksize= 1000000) as df_reader:
        for df in df_reader:
            if not keep_RD:
                df = df[df.party_label != 'RD']
            if not keep_exclusives:
                df = df[df.party_label == 'RD']
            if min_length_quote >0:
                df = df[df.quotation_clean.apply(lambda x: len(x.split())) >= min_length_quote]
            df = df[df.id.isin(ids)]
            if df.shape[0] > 0:
                result.append(df)
            print(i, end='  ')
            i+=1
    return pd.concat(result)

In [48]:
# Create a dataframe with the information of influential politicians usefull for plotting
ls = (id_inf_R+id_inf_D)
#input file path
path = QUOTES_LABELED_CLEANED_PREDICTED
#retrieve data
df = get_quotes_by_speaker_id(ls,path,0,True,True)

0  1  2  3  4  5  6  7  

In [49]:
names = influential_D+influential_R
ids = id_inf_D+id_inf_R
dictionary = dict(zip(ids,names))
df['speaker'] = df['id'].apply(lambda x: dictionary[x])
parties = ['D' for i in range(len(id_inf_D))] + ['R' for i in range(len(influential_R))]

dictionary = dict(zip(ids,parties))
display(df)
# list with one df per politician
politiciansdfs = [df[df['speaker']== name] for name in names]
#list for storing resampled dfs of politicians
resampleddfs = []

#for each politician resample df and put it in listdf2
for dataf in politiciansdfs:
    #Save infos before resampling
    pid = dataf['id'].iloc[0]
    speaker = names[ids.index(pid)]
    #resample
    dataf = dataf[['date','prob_dem']].set_index('date').resample('3M').mean().reset_index()
    #Resetting infos after resampling
    dataf['speaker'] = speaker
    dataf['id'] = pid
    dataf['party'] = dataf['id'].apply(lambda x: dictionary[x])
    #append resampled df to list that will be used fot plotting
    resampleddfs.append(dataf)
df = pd.concat(resampleddfs)

{'Q6294': 'D', 'Q76': 'D', 'Q359442': 'D', 'Q1124': 'D', 'Q6279': 'D', 'Q10853588': 'D', 'Q170581': 'D', 'Q19673': 'D', 'Q11673': 'D', 'Q607': 'D', 'Q55223040': 'D', 'Q380900': 'D', 'Q22686': 'R', 'Q2685': 'R', 'Q24313': 'R', 'Q2036942': 'R', 'Q43144': 'R', 'Q355522': 'R', 'Q4496': 'R', 'Q48259': 'R', 'Q221997': 'R', 'Q63879': 'R', 'Q463557': 'R', 'Q816459': 'R', 'Q66107': 'R', 'Q324546': 'R'}


Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use,prob_dem
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...,test,0.972383
5,2015-10-15-015673,But if you talk about why the middle class is ...,Bernie Sanders,2015-10-15 10:07:01,Q359442,D,talk middle class disappearing almost income w...,train,0.999552
6,2015-04-15-038299,"If they destroy you, who is there left?",Bernie Sanders,2015-04-15 17:49:00,Q359442,D,destroy left,none,0.471059
7,2015-07-30-060129,It would make everybody in America poorer.,Bernie Sanders,2015-07-30 01:49:15,Q359442,D,would make everybody america poorer,train,0.993584
8,2015-10-14-073757,"labor is the source of all wealth,",Bernie Sanders,2015-10-14 20:09:23,Q359442,D,labor source wealth,none,0.270337
...,...,...,...,...,...,...,...,...,...
7089847,2020-01-31-016764,Did you see it as a request for a political fa...,Rand Paul,2020-01-31 19:48:31,Q463557,R,see request political favor coming election so...,train,0.042876
7089848,2020-01-07-066219,The Iranians will not be able to approach us o...,Rand Paul,2020-01-07 13:00:55,Q463557,R,iranian able approach u diplomacy revenge adeq...,test,0.017127
7090699,2020-03-05-051586,our past has set a framework which we must tra...,Henry Kissinger,2020-03-05 10:10:36,Q66107,R,past set framework must transcend fate past in...,train,0.068452
7090735,2020-03-05-014789,"everybody to be taken care of,",Ben Carson,2020-03-05 23:06:34,Q816459,R,everybody taken care,none,0.257053


Unnamed: 0,date,prob_dem,speaker,id,party
0,2015-01-31,0.777486,Hilary Clinton,Q6294,D
1,2015-04-30,0.821875,Hilary Clinton,Q6294,D
2,2015-07-31,0.851274,Hilary Clinton,Q6294,D
3,2015-10-31,0.845122,Hilary Clinton,Q6294,D
4,2016-01-31,0.848509,Hilary Clinton,Q6294,D
...,...,...,...,...,...
17,2019-04-30,0.157306,Marco Rubio,Q324546,R
18,2019-07-31,0.177612,Marco Rubio,Q324546,R
19,2019-10-31,0.160227,Marco Rubio,Q324546,R
20,2020-01-31,0.165417,Marco Rubio,Q324546,R


In [50]:
#plotting
fig = px.line(df, x="date", y= 'prob_dem',hover_data=['speaker'],line_group = 'speaker',color='party',title='Influential politicians timeseries and our model prediction',labels = {'prob_dem' :'Political score'})
fig.add_hline(y = 0.5, line_dash = 'dash',annotation_text = 'neutral')
fig.show()

In [51]:
send_figure_to_internet(fig,'general_trend')

https://plotly.com/~ogim/19/
link:
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plotly.com/~ogim/19.embed" height="525" width="100%"></iframe>


In [62]:
#prepare data for better timeserie plotting 
def treat_df_timeseries_plot(df):
    # predict and add column
    df['proba_dem'] = predict(df.quotation_clean)
    # drop unecessary columns
    df = df[['date','proba_dem']]
    # Resample with a given frequency and add a numerical line (otherwise no linear regression =( )
    df = df.set_index('date')
    df = df.resample('1M').mean()
    df['monthsfrom'] = (df.index - df.index[0]).days/30
    df = df.reset_index()
    return df
    

dfls = []
#for every politician
for i in range(len(names)):
#do a single df resampled and ready for plotting+
    name = names[i]
    tempdf = data[data['id'] == ids[i]]
    if not tempdf.empty:
        tempdf = treat_df_timeseries_plot(tempdf)
        tempdf['speaker'] = name
        #append the df to the list
        dfls.append(tempdf)
        
#concat the single politicians resampled dfs: now it's ready for plotting
finaldf = pd.concat(dfls)    

In [63]:
#finaldf['proba_dem'] = 1 - finaldf['proba_dem']
finaldf = finaldf.rename(columns={"monthsfrom": "months"})

In [64]:
#plotting 
fig = px.scatter(finaldf,x='months',y='proba_dem',color = 'speaker',title='Change of political score for the 20 most influential politician and with linear regressions', trendline="ols",labels = {'proba_dem' :'Political score','months' :'time'})
fig.update_xaxes(tickvals = finaldf.date,ticktext = finaldf.date,)
display(fig)

#optional presenting statistical results of the statsmodels linear regressions (equivalent to model.summary)
results = px.get_trendline_results(fig)
print(results)

                     speaker  \
0               Donald Trump   
1      Arnold Schwarzenegger   
2                 Mike Pence   
3                   Ted Cruz   
4                Sarah Palin   
5             Mitch McConnel   
6                Mitt Romney   
7                Dick Cheney   
8                   Jeb Bush   
9             Chris Christie   
10                 Rand Paul   
11                Ben Carson   
12           Henry Kissinger   
13               Marco Rubio   
14            Hilary Clinton   
15              Barack Obama   
16            Bernie Sanders   
17              Bill Clinton   
18                 Joe Biden   
19             Kamala Harris   
20              Nancy Pelosi   
21                   Al Gore   
22              Andrew Cuomo   
23         Michael Bloomberg   
24  Alexandria Ocasio-Cortez   
25           Charles Schumer   

                                       px_fit_results  
0   <statsmodels.regression.linear_model.Regressio...  
1   <statsmodels.regres

In [65]:
send_figure_to_internet(fig,'top20trend')

https://plotly.com/~ogim/14/
link:
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plotly.com/~ogim/14.embed" height="525" width="100%"></iframe>


What are the overall parties trends?

In [66]:
# create datatframes: Republicans vs Democrats
rep = data[data['party_label'] == 'R']
dem = data[data['party_label'] == 'D']
dfs = [rep,dem]

In [67]:
dfls = []
republican = True
#Same procedure as before to do a single df resampled and ready for plotting
for dataframe in dfs:
    dataframe = treat_df_timeseries_plot(dataframe)
    #relabel resampled data
    if republican:
        dataframe['party'] = 'R'
        republican = False
    else:
        dataframe['party'] = 'D'
    #append resampled dataframe
    dfls.append(dataframe)
    
#concat the two resampled dataframes
finaldf = pd.concat(dfls)    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [84]:
#Fixing params for the plot
finaldf = finaldf.rename(columns={"monthsfrom": "months"})
finaldf['proba_dem'] = 1 -finaldf['proba_dem']

In [87]:
#plotting 
color_code = ["red", "blue"] if finaldf.party.values[0] == 'R' else [ "blue","red"]
fig = px.scatter(finaldf,x='months',y='proba_dem',color = 'party',title='Overall political score trend over time: we observe that the parties avg scores are diverging  ', trendline="ols",labels = {'proba_dem' :'Political Score','months' :'time'},color_discrete_sequence=color_code)
fig.update_xaxes(tickvals = finaldf.date,ticktext = finaldf.date,)
display(fig)

#optional presenting statistical results of the statsmodels linear regressions (equivalent to model.summary)
# results = px.get_trendline_results(fig)
# print(results)

In [88]:
send_figure_to_internet(fig,'overallparties')

https://plotly.com/~ogim/16/
link:
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plotly.com/~ogim/16.embed" height="525" width="100%"></iframe>


As we have seen previously, our simple model is able to grasp the political inclination starting from the quotations. It is therefore evident to us that a company with important resources can easily develop a more sophisticated model, which is able, starting from the information (eg posts) we publish on social networks, to trace our political orientation. Therefore a political party can easily target the right people with advertisements so that they can influence the results of political elections as much as possible. Or worse still, a foreign body could influence another country's political elections. These machine learing models could make these phenomena economically feasible.

The final part of our analyses is in the this [notebook](part3_5-topic_and_model_evaluation.ipynb)