In [42]:
import pandas as pd
import numpy as np
import bz2
import json
import findspark
# NOTE(review): absolute, machine-specific Spark location; consider reading it
# from configuration so the notebook can run on another machine.
findspark.init('/Users/tatianacogne/spark')
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from datetime import datetime
import plotly.express as px
# `dt` is datetime.timedelta here (not the datetime module or the pandas `.dt` accessor).
from datetime import timedelta as dt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

#### Lookup tables and DataFrames used in this notebook

In [2]:
# QID -> label lookup tables, one JSON file per speaker attribute.
# The files are read in the order listed and bound to the names on the left.
(
    from_qids_to_label_occupation,
    from_qids_to_label_ethnics,
    from_qids_to_label_religion,
    from_qids_to_label_nationality,
    from_qids_to_label_party,
    from_qids_to_label_gender,
) = [
    pd.read_json(json_path)
    for json_path in (
        'data/occupations.json',
        'data/ethnic.json',
        'data/religion.json',
        'data/nationality.json',
        'data/party.json',
        'data/gender.json',
    )
]

# Attributes of every speaker (one row per speaker — TODO confirm uniqueness of `id`).
df_speakers_attributes = pd.read_parquet('data/speaker_attributes.parquet')

In [54]:
# Identifier of the first speaker whose label is exactly 'Elon Musk'
# (raises IndexError if no speaker carries that label).
id_musk = df_speakers_attributes.loc[df_speakers_attributes['label'] == 'Elon Musk', 'id'].values[0]

### Processing the speaker attributes

In [3]:
def map_qids_labels(list_qids, df_):
    """Translate a collection of QIDs into their labels.

    Parameters
    ----------
    list_qids : iterable or int
        QIDs to translate. An ``int`` (the ``0`` placeholder that replaces
        missing values in the speakers dataframe) means "no QIDs".
    df_ : pandas.DataFrame
        Lookup table with a ``qids`` column. The label is read from the
        second column (position 1) of the first row matching the QID.

    Returns
    -------
    list or int
        The labels, in input order. QIDs equal to ``0`` or absent from
        ``df_`` are skipped. ``0`` is returned when ``list_qids`` is an int.
    """
    if isinstance(list_qids, int):
        return 0

    labels = []
    for qid in list_qids:
        if qid == 0:
            continue
        # Filter the lookup table once per QID and reuse the result both for
        # the existence check and for reading the label.
        matches = df_[df_.qids == qid].values
        if len(matches) >= 1:
            labels.append(matches[0][1])
    return labels

In [4]:
def process_speakers_attributes(df_speakers_attributes):
    """Build the speakers table with renamed attribute columns and their labels.

    Keeps the birth date, nationality, gender, ethnic group, occupation, id,
    label, religion and party columns of ``df_speakers_attributes``, renames
    them, replaces missing values with 0, and adds one ``*_label`` column for
    occupation, ethnics, religion, nationality, party and gender, computed
    with ``map_qids_labels`` and the matching module-level lookup table.

    Returns a new dataframe; the input is not modified.
    """
    # Source column -> new name, in the order the columns are selected.
    column_names = {
        'date_of_birth': 'birth',
        'nationality': 'nationality_qids',
        'gender': 'gender_qids',
        'ethnic_group': 'ethnics_qids',
        'occupation': 'occupations_qids',
        'id': 'name_qids',
        'label': 'name_label',
        'religion': 'religion_qids',
        'party': 'party_qids',
    }
    df = df_speakers_attributes[list(column_names)].rename(columns=column_names)
    df = df.fillna(0)

    # (new label column, QID column it is derived from, lookup table)
    label_sources = (
        ('occupation_label', 'occupations_qids', from_qids_to_label_occupation),
        ('ethnics_label', 'ethnics_qids', from_qids_to_label_ethnics),
        ('religion_label', 'religion_qids', from_qids_to_label_religion),
        ('nationality_label', 'nationality_qids', from_qids_to_label_nationality),
        ('party_label', 'party_qids', from_qids_to_label_party),
        ('gender_label', 'gender_qids', from_qids_to_label_gender),
    )
    for label_column, qids_column, lookup in label_sources:
        df[label_column] = df[qids_column].apply(lambda qids: map_qids_labels(qids, lookup))
    return df
    

#### Select only the QIDS needed in the analysis

In [5]:
def select_speakers(list_qids):
    """Return the rows of ``df_speakers_attributes`` whose ``id`` is in ``list_qids``."""
    is_selected = df_speakers_attributes['id'].isin(list_qids)
    return df_speakers_attributes.loc[is_selected]

### Processing the quotes dataframe

In [35]:
def process_dataframe_quotes(df_sent, speakers):
    """Parse the quote dates and attach the speaker attributes.

    Parameters
    ----------
    df_sent : pandas.DataFrame
        Quotes with at least a ``date`` column (datetimes, or strings in the
        ``'%Y-%m-%d %H:%M:%S'`` format) and a ``name_qids`` column.
    speakers : pandas.DataFrame
        Speaker attributes with a ``name_qids`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``date`` is a datetime column, ``year_month``
        holds the first day of the quote's month, and the speaker columns are
        joined on ``name_qids``. The merge is an inner join, so quotes whose
        speaker is absent from ``speakers`` are dropped.

    The input ``df_sent`` is left untouched.
    """
    # Work on a copy so the caller's dataframe is not modified as a side effect.
    quotes = df_sent.copy()

    # Vectorised parsing; values that are already datetimes pass through unchanged.
    quotes['date'] = pd.to_datetime(quotes['date'], format='%Y-%m-%d %H:%M:%S')
    # Truncate each date to the first day of its month.
    quotes['year_month'] = quotes['date'].dt.to_period('M').dt.to_timestamp()

    # Join the quotes with the speaker attributes on the speaker QID.
    return quotes.merge(speakers, on='name_qids')
  

#### Chevrolet

In [43]:
# Chevrolet quotes, as stored in the sentiment JSON file.
df_chevrolet = pd.read_json('data/chevrolet_sentiment.json')

# Each quoted speaker's QID once, in order of first appearance.
list_qids_chevrolet = df_chevrolet['name_qids'].drop_duplicates().values
df = select_speakers(list_qids_chevrolet)
df_speakers_att_chevrolet = process_speakers_attributes(df)

# Only the speaker columns that are joined onto the quotes.
speakers = df_speakers_att_chevrolet.loc[:, ['name_qids', 'name_label', 'occupations_qids']]
df_chevrolet = process_dataframe_quotes(df_chevrolet, speakers)
df_chevrolet.head(3)

Unnamed: 0,date,name_qids,quotation,quote_qids,Vader_Sentiment,emotion,review,year_month,name_label_x,occupations_qids_x,occupation_label,nationality_label,name_label_y,occupations_qids_y
0,2015-02-04 05:35:55,Q5178606,I've had to go back 10 years of my program fin...,53298,0.0,Surprise,5,2015-02-01,Courtney Force,[Q378622],[racing driver],[United States of America],Courtney Force,[Q378622]
1,2015-10-31 09:27:41,Q5178606,Robert Hight stepped up with Mike Neff and tha...,50268,0.0,Happy,5,2015-10-01,Courtney Force,[Q378622],[racing driver],[United States of America],Courtney Force,[Q378622]
2,2015-07-25 11:08:06,Q5178606,I saw (crew chief Jon) Schaffer running for th...,21150,0.9834,Happy,4,2015-07-01,Courtney Force,[Q378622],[racing driver],[United States of America],Courtney Force,[Q378622]


#### Tesla

In [55]:
# Tesla quotes, as stored in the sentiment JSON file.
df_tesla = pd.read_json('data/tesla_sentiment.json')

# Each quoted speaker's QID once, excluding Elon Musk.
list_qids_tesla = [qid for qid in df_tesla['name_qids'].drop_duplicates().values if qid != id_musk]

df = select_speakers(list_qids_tesla)
df_speakers_att_tesla = process_speakers_attributes(df)

# Only the speaker columns that are joined onto the quotes.
speakers = df_speakers_att_tesla[['name_qids', 'name_label', 'occupations_qids']]
# The join on 'name_qids' is an inner merge, so quotes from speakers missing
# from `speakers` (Elon Musk included) are dropped here.
df_tesla = process_dataframe_quotes(df_tesla, speakers)
# Only the last expression of a cell is rendered; the intermediate `.head(3)`
# calls displayed nothing and were removed.
df_tesla.head(3)

Unnamed: 0,date,name_qids,quotation,quote_qids,Vader_Sentiment,emotion,review,year_month,name_label_x,occupations_qids_x,occupation_label,nationality_label,name_label_y,occupations_qids_y
0,2015-04-17 17:55:00,Q6751189,The policy will concentrate on `A' category st...,76279,0.0,Fear,4,2015-04-01,Manoj Sinha,[Q82955],[politician],[India],Manoj Sinha,[Q82955]
1,2015-10-29 16:31:31,Q4934,As Sergey and I wrote in the original founders...,10100,0.6908,Surprise,4,2015-10-01,Larry Page,"[Q131524, Q82594, Q81096]","[entrepreneur, computer scientist, engineer]",[United States of America],Larry Page,"[Q131524, Q82594, Q81096]"
2,2015-04-20 23:38:36,Q4934,"In the first week of March 2013, Musk reached ...",30429,0.5423,Fear,1,2015-04-01,Larry Page,"[Q131524, Q82594, Q81096]","[entrepreneur, computer scientist, engineer]",[United States of America],Larry Page,"[Q131524, Q82594, Q81096]"


### General Plot

In [56]:
# Monthly mean of the numeric columns. numeric_only=True keeps the text/list
# columns (quotation, emotion, ...) out of the aggregation; without it
# pandas >= 2.0 raises a TypeError on them.
df_chevrolet_plot = df_chevrolet.groupby(by='year_month').mean(numeric_only=True).reset_index()
# .copy() so that adding 'count_' writes to an independent frame rather than a slice.
df_chevrolet_plot_review = df_chevrolet_plot[['year_month', 'review']].copy()
# Number of quotes per month. Both groupbys sort on 'year_month', so the
# counts line up positionally with the means.
df_chevrolet_plot_review['count_'] = df_chevrolet.groupby(by='year_month')['date'].count().values

# Top panel: monthly number of quotes; bottom panel: monthly mean review.
fig = make_subplots(rows=2, cols=1, row_heights=[2, 1], vertical_spacing=0.1)
fig.add_trace(
    go.Bar(x=df_chevrolet_plot_review.year_month, y=df_chevrolet_plot_review.count_, name='Number of quotes'),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df_chevrolet_plot_review.year_month, y=df_chevrolet_plot_review.review, name='Mean review'),
    row=2, col=1,
)
fig.update_yaxes(title_text='Number of quotes', row=1, col=1)
fig.update_yaxes(title_text='Mean review', row=2, col=1)
fig.update_layout(title_text='Chevrolet: quotes per month and mean review')
fig.show()

In [57]:
# Monthly mean of the numeric columns. numeric_only=True keeps the text/list
# columns (quotation, emotion, ...) out of the aggregation; without it
# pandas >= 2.0 raises a TypeError on them.
df_tesla_plot = df_tesla.groupby(by='year_month').mean(numeric_only=True).reset_index()
# .copy() so that adding 'count_' writes to an independent frame rather than a slice.
df_tesla_plot_review = df_tesla_plot[['year_month', 'review']].copy()
# Number of quotes per month. Both groupbys sort on 'year_month', so the
# counts line up positionally with the means.
df_tesla_plot_review['count_'] = df_tesla.groupby(by='year_month')['date'].count().values

# Top panel: monthly number of quotes; bottom panel: monthly mean review.
fig = make_subplots(rows=2, cols=1, row_heights=[2, 1], vertical_spacing=0.1)
fig.add_trace(
    go.Bar(x=df_tesla_plot_review.year_month, y=df_tesla_plot_review.count_, name='Number of quotes'),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df_tesla_plot_review.year_month, y=df_tesla_plot_review.review, name='Mean review'),
    row=2, col=1,
)
fig.update_yaxes(title_text='Number of quotes', row=1, col=1)
fig.update_yaxes(title_text='Mean review', row=2, col=1)
fig.update_layout(title_text='Tesla: quotes per month and mean review')
fig.show()

In [None]:
def find_review_event(sortie, df, n_day):
    """Select the rows of ``df`` dated within ``n_day`` days of an event.

    Parameters
    ----------
    sortie : datetime.datetime
        Date of the event (the centre of the window).
    df : pandas.DataFrame
        Frame with a ``date`` column (datetimes or date-like strings).
    n_day : int
        Half-width of the window, in days.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose day lies in
        ``[sortie - n_day days, sortie + n_day days]``, both bounds included.
        Rows with a missing date are excluded.

    Side effect: a ``year_month_day`` column (the date truncated to midnight)
    is added to ``df`` itself, and is therefore also present in the result.
    """
    # Day-level timestamp, computed for the whole column at once.
    df['year_month_day'] = pd.to_datetime(df['date']).dt.normalize()

    half_window = dt(days=n_day)
    in_window = df['year_month_day'].between(sortie - half_window, sortie + half_window)
    # Copy so that later assignments on the result never touch `df`.
    return df[in_window].copy()

#### Plot with n days before and after the event (days)

In [None]:
# Event studied: Chevrolet Centennial Edition trucks announcement.
# https://www.automoblog.net/chevrolet-introduces-centennial-edition-trucks-kicks-off-100-day-celebration/
sortie = datetime.strptime('2018-12-16', '%Y-%m-%d')  # event date
n_day = 30  # half-width of the window around the event, in days
df_event = find_review_event(sortie, df_chevrolet, n_day)

# Daily mean of the numeric columns. numeric_only=True keeps the text/list
# columns out of the aggregation; without it pandas >= 2.0 raises a TypeError.
df_plot = df_event.groupby(by='year_month_day').mean(numeric_only=True).reset_index()
# Number of quotes per day, in the same (sorted) day order as df_plot.
x = df_event.groupby(by='year_month_day')['date'].count().values
# .copy() so that adding the count column writes to an independent frame.
df_plot_review = df_plot[['year_month_day', 'Vader_Sentiment']].copy()
# NOTE: the 'date' column holds the number of quotes of the day, not a date.
df_plot_review['date'] = x
fig = px.bar(
    df_plot_review,
    x='year_month_day',
    y='Vader_Sentiment',
    color='date',
    title="Mean Vader sentiment of the Chevrolet quotes between " + str(n_day) + " days before and after the event.",
    # Keys must match the plotted column names for the labels to apply.
    labels={'year_month_day': 'Time', 'Vader_Sentiment': 'Mean Vader sentiment', 'date': 'Number of quotes'},
)
fig.show()