In [101]:
import pandas as pd
import numpy as np
import bz2
import json
from datetime import datetime
import plotly.express as px
from datetime import timedelta as dt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pd.options.mode.chained_assignment = None

#### Lookup tables and DataFrames used throughout this notebook

In [102]:
# One lookup table per speaker attribute; map_qids_labels reads their `qids`
# column and takes the label from the second column.
(
    from_qids_to_label_occupation,
    from_qids_to_label_ethnics,
    from_qids_to_label_religion,
    from_qids_to_label_nationality,
    from_qids_to_label_party,
    from_qids_to_label_gender,
) = map(pd.read_json, (
    'data/occupations.json',
    'data/ethnic.json',
    'data/religion.json',
    'data/nationality.json',
    'data/party.json',
    'data/gender.json',
))

# Table of speaker attributes (one row per speaker, presumably; verify against the parquet file).
df_speakers_attributes = pd.read_parquet('data/speaker_attributes.parquet')

In [103]:
# Identifier of the first speaker whose label is exactly 'Elon Musk'
# (raises IndexError if no such speaker exists).
_is_musk = df_speakers_attributes['label'] == 'Elon Musk'
id_musk = df_speakers_attributes.loc[_is_musk, 'id'].values[0]

### Processing the speaker attributes

In [104]:
def map_qids_labels(list_qids, df_):
    """Translate a collection of QIDs into the list of their labels.

    Parameters
    ----------
    list_qids : iterable or int
        QIDs to translate. An integer (the 0 written by ``fillna(0)`` for a
        missing attribute) means "no QIDs".
    df_ : pandas.DataFrame
        Lookup table with a ``qids`` column; the label is read from the
        second column of the first matching row.

    Returns
    -------
    list or int
        The labels of the QIDs found in ``df_``, in input order. QIDs equal to
        0 or absent from ``df_`` are skipped. Returns 0 when ``list_qids`` is
        an integer.
    """
    # Integer placeholder (Python or numpy) => nothing to translate.
    if isinstance(list_qids, (int, np.integer)):
        return 0

    labels = []
    for qid in list_qids:
        if qid == 0:
            continue
        # Filter the lookup table once per QID and reuse the result.
        matches = df_[df_.qids == qid].values
        if len(matches) >= 1:
            labels.append(matches[0][1])
    return labels

In [105]:
def process_speakers_attributes(df_speakers_attributes):
    """Rename the speaker attribute columns and add one label column per attribute.

    Parameters
    ----------
    df_speakers_attributes : pandas.DataFrame
        Selected speakers; must contain the columns ``date_of_birth``,
        ``nationality``, ``gender``, ``ethnic_group``, ``occupation``, ``id``,
        ``label``, ``religion`` and ``party``.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to those columns (renamed to ``birth``,
        ``*_qids``, ``name_qids``, ``name_label``), with missing values
        replaced by 0 and the columns ``occupation_label``, ``ethnics_label``,
        ``religion_label``, ``nationality_label``, ``party_label`` and
        ``gender_label`` appended. Each label cell is the result of
        ``map_qids_labels`` (a list of labels, or 0 when the attribute is missing).
    """
    # Original column -> new name; key order is also the output column order.
    renamed_columns = {
        'date_of_birth': 'birth',
        'nationality': 'nationality_qids',
        'gender': 'gender_qids',
        'ethnic_group': 'ethnics_qids',
        'occupation': 'occupations_qids',
        'id': 'name_qids',
        'label': 'name_label',
        'religion': 'religion_qids',
        'party': 'party_qids',
    }
    df = df_speakers_attributes[list(renamed_columns)].rename(columns=renamed_columns)

    # Missing attributes become 0, the placeholder map_qids_labels understands.
    df = df.fillna(0)

    # (new label column, QID column, lookup table) for each attribute.
    label_sources = (
        ('occupation_label', 'occupations_qids', from_qids_to_label_occupation),
        ('ethnics_label', 'ethnics_qids', from_qids_to_label_ethnics),
        ('religion_label', 'religion_qids', from_qids_to_label_religion),
        ('nationality_label', 'nationality_qids', from_qids_to_label_nationality),
        ('party_label', 'party_qids', from_qids_to_label_party),
        ('gender_label', 'gender_qids', from_qids_to_label_gender),
    )
    for label_column, qids_column, lookup in label_sources:
        # Bind `lookup` as a default so each lambda keeps its own table.
        df[label_column] = df[qids_column].apply(
            lambda qids, lookup=lookup: map_qids_labels(qids, lookup)
        )
    return df
    

#### Select only the QIDS needed in the analysis

In [106]:
def select_speakers(list_qids):
    """Return the rows of the global ``df_speakers_attributes`` whose ``id`` is in ``list_qids``."""
    is_selected = df_speakers_attributes['id'].isin(list_qids)
    return df_speakers_attributes.loc[is_selected]

### Process for the quotes dataframe

In [None]:
def process_dataframe_quotes(df_sent, speakers):
    """Parse the quote dates, add a month column and attach the speaker attributes.

    Parameters
    ----------
    df_sent : pandas.DataFrame
        Quotes with at least a ``date`` column (datetimes, or strings in the
        ``'%Y-%m-%d %H:%M:%S'`` format) and a ``name_qids`` column.
        It is not modified.
    speakers : pandas.DataFrame
        Speaker attributes with a ``name_qids`` column.

    Returns
    -------
    pandas.DataFrame
        The quotes with ``date`` converted to datetimes, a ``year_month``
        column holding the first day of each quote's month, and the columns of
        ``speakers`` joined on ``name_qids``. The join is an inner join, so
        quotes whose speaker is absent from ``speakers`` are dropped. Columns
        present in both frames get pandas' ``_x``/``_y`` suffixes.
    """
    # Vectorized parsing; already-parsed datetimes pass through unchanged.
    parsed_dates = pd.to_datetime(df_sent['date'], format='%Y-%m-%d %H:%M:%S')

    # assign() builds a new frame, so the caller's quotes stay untouched.
    quotes = df_sent.assign(
        date=parsed_dates,
        year_month=parsed_dates.dt.to_period('M').dt.to_timestamp(),
    )

    return quotes.merge(speakers, on='name_qids')
  

#### Chevrolet

In [None]:
# Chevrolet quotes with their sentiment scores.
df_chevrolet = pd.read_json('data/chevrolet_sentiment.json')

# Distinct speaker QIDs, in order of first appearance.
list_qids_chevrolet = df_chevrolet['name_qids'].drop_duplicates().values

# Attributes (with labels) of the speakers quoted about Chevrolet.
df = select_speakers(list_qids_chevrolet)
df_speakers_att_chevrolet = process_speakers_attributes(df)

# Only these speaker columns are joined onto the quotes.
speakers = df_speakers_att_chevrolet.loc[:, ['name_qids', 'name_label', 'occupations_qids']]
df_chevrolet = process_dataframe_quotes(df_chevrolet, speakers)
df_chevrolet.head(3)

#### Tesla

In [126]:
# Tesla quotes with their sentiment scores.
df_tesla = pd.read_json('data/tesla_sentiment.json')

# Distinct speaker QIDs, in order of first appearance. Every speaker is kept,
# Elon Musk included (the id_musk exclusion is not applied here).
list_qids_tesla = df_tesla['name_qids'].unique().tolist()

# Attributes (with labels) of the speakers quoted about Tesla.
df = select_speakers(list_qids_tesla)
df_speakers_att_tesla = process_speakers_attributes(df)

# Only these speaker columns are joined onto the quotes.
speakers = df_speakers_att_tesla[['name_qids', 'name_label', 'occupations_qids']]
df_tesla = process_dataframe_quotes(df_tesla, speakers)

# Only the last expression of a cell is displayed, so preview once, at the end.
df_tesla.head(3)

Unnamed: 0,date,name_qids,quotation,quote_qids,Vader_Sentiment,emotion,review,year_month,name_label_x,occupations_qids_x,occupation_label,nationality_label,name_label_y,occupations_qids_y
0,2015-04-17 17:55:00,Q6751189,The policy will concentrate on `A' category st...,76279,0.0,Fear,4,2015-04-01,Manoj Sinha,[Q82955],[politician],[India],Manoj Sinha,[Q82955]
1,2015-10-29 16:31:31,Q4934,As Sergey and I wrote in the original founders...,10100,0.6908,Surprise,4,2015-10-01,Larry Page,"[Q131524, Q82594, Q81096]","[entrepreneur, computer scientist, engineer]",[United States of America],Larry Page,"[Q131524, Q82594, Q81096]"
2,2015-04-20 23:38:36,Q4934,"In the first week of March 2013, Musk reached ...",30429,0.5423,Fear,1,2015-04-01,Larry Page,"[Q131524, Q82594, Q81096]","[entrepreneur, computer scientist, engineer]",[United States of America],Larry Page,"[Q131524, Q82594, Q81096]"


#### Toyota

In [108]:
# Toyota quotes with their sentiment scores.
df_toyota = pd.read_json('data/toyota_sentiment.json')

# Distinct speaker QIDs, in order of first appearance, without Elon Musk.
# The inner join below then drops his quotes from the Toyota analysis.
list_qids_toyota = [qid for qid in df_toyota['name_qids'].unique() if qid != id_musk]

# Attributes (with labels) of the remaining speakers quoted about Toyota.
df = select_speakers(list_qids_toyota)
df_speakers_att_toyota = process_speakers_attributes(df)

# Only these speaker columns are joined onto the quotes.
speakers = df_speakers_att_toyota[['name_qids', 'name_label', 'occupations_qids']]
df_toyota = process_dataframe_quotes(df_toyota, speakers)

# Only the last expression of a cell is displayed, so preview once, at the end.
df_toyota.head(3)

Unnamed: 0,date,name_qids,quotation,quote_qids,Vader_Sentiment,emotion,review,year_month,name_label_x,occupations_qids_x,occupation_label,nationality_label,name_label_y,occupations_qids_y
0,2015-06-22 19:28:00,Q1701249,It's like seeing your father cry for the first...,23803,-0.1531,Fear,4,2015-06-01,John Oliver,"[Q36834, Q158852]","[composer, conductor]",[United States of America],John Oliver,"[Q36834, Q158852]"
1,2015-08-20 01:36:57,Q1701249,as morally dubious as Toyota's short-lived slo...,9746,-0.4767,Angry,1,2015-08-01,John Oliver,"[Q36834, Q158852]","[composer, conductor]",[United States of America],John Oliver,"[Q36834, Q158852]"
2,2015-11-17 03:05:31,Q1701249,Toyota is famous for making dependable vehicle...,116071,0.2732,Surprise,3,2015-11-01,John Oliver,"[Q36834, Q158852]","[composer, conductor]",[United States of America],John Oliver,"[Q36834, Q158852]"


### TO DO (TATY)
- add the axes and titles for the general plots (Tesla, Chevrolet, Toyota)
- proportion of quotes/removed
- box plots of the events (Tesla 2x, Chevrolet 2x), with comments
- pie charts of the occupations/nationalities (Tesla vs Chevrolet)


### General Plot

In [127]:
# Monthly aggregates of the Chevrolet quotes.
chevrolet_by_month = df_chevrolet.groupby(by='year_month')
# numeric_only=True: since pandas 2.0, mean() raises a TypeError on text/list
# columns instead of silently dropping them.
df_chevrolet_plot = chevrolet_by_month.mean(numeric_only=True).reset_index()
# .copy() so the new column is not written into a slice of df_chevrolet_plot.
df_chevrolet_plot_review = df_chevrolet_plot[['year_month', 'Vader_Sentiment']].copy()
# Same grouping as above, hence the same month order: positional assignment is safe.
df_chevrolet_plot_review['count_'] = chevrolet_by_month['date'].count().to_numpy()

# Top panel: number of quotes per month; bottom panel: monthly mean sentiment.
fig = make_subplots(rows=2, cols=1, row_heights=[2, 1], vertical_spacing=0.1)
fig.add_trace(
    go.Bar(x=df_chevrolet_plot_review.year_month, y=df_chevrolet_plot_review.count_,
           name="Number of quotes groupy by months"),
    row=1, col=1)
fig.add_trace(
    go.Scatter(x=df_chevrolet_plot_review.year_month, y=df_chevrolet_plot_review.Vader_Sentiment,
               name="Vader sentiment score by month (mean)"),
    row=2, col=1)
fig.update_yaxes(title_text="Number of quotes", row=1, col=1)
fig.update_yaxes(title_text="Mean Vader sentiment", row=2, col=1)
fig.update_xaxes(title_text="Month", row=2, col=1)
fig.update_layout(
    title="General Analysis on the Chevrolet's quotes ")
fig.show()

In [128]:
# Monthly aggregates of the Tesla quotes.
tesla_by_month = df_tesla.groupby(by='year_month')
# numeric_only=True: since pandas 2.0, mean() raises a TypeError on text/list
# columns instead of silently dropping them.
df_tesla_plot = tesla_by_month.mean(numeric_only=True).reset_index()
# .copy() so the new column is not written into a slice of df_tesla_plot.
df_tesla_plot_review = df_tesla_plot[['year_month', 'Vader_Sentiment']].copy()
# Same grouping as above, hence the same month order: positional assignment is safe.
df_tesla_plot_review['count_'] = tesla_by_month['date'].count().to_numpy()

# Top panel: number of quotes per month; bottom panel: monthly mean sentiment.
fig = make_subplots(rows=2, cols=1, row_heights=[2, 1], vertical_spacing=0.1)
fig.add_trace(
    go.Bar(x=df_tesla_plot_review.year_month, y=df_tesla_plot_review.count_,
           name="Number of quotes groupy by months"),
    row=1, col=1)
fig.add_trace(
    go.Scatter(x=df_tesla_plot_review.year_month, y=df_tesla_plot_review.Vader_Sentiment,
               name="Vader sentiment score by month (mean)"),
    row=2, col=1)
fig.update_yaxes(title_text="Number of quotes", row=1, col=1)
fig.update_yaxes(title_text="Mean Vader sentiment", row=2, col=1)
fig.update_xaxes(title_text="Month", row=2, col=1)
fig.update_layout(
    title="General Analysis on the Tesla's quotes ")
fig.show()

In [129]:
# Monthly aggregates of the Toyota quotes.
toyota_by_month = df_toyota.groupby(by='year_month')
# numeric_only=True: since pandas 2.0, mean() raises a TypeError on text/list
# columns instead of silently dropping them.
df_toyota_plot = toyota_by_month.mean(numeric_only=True).reset_index()
# .copy() so the new column is not written into a slice of df_toyota_plot.
df_toyota_plot_review = df_toyota_plot[['year_month', 'Vader_Sentiment']].copy()
# Same grouping as above, hence the same month order: positional assignment is safe.
df_toyota_plot_review['count_'] = toyota_by_month['date'].count().to_numpy()

# Top panel: number of quotes per month; bottom panel: monthly mean sentiment.
fig = make_subplots(rows=2, cols=1, row_heights=[2, 1], vertical_spacing=0.1)
fig.add_trace(
    go.Bar(x=df_toyota_plot_review.year_month, y=df_toyota_plot_review.count_,
           name="Number of quotes groupy by months"),
    row=1, col=1)
fig.add_trace(
    go.Scatter(x=df_toyota_plot_review.year_month, y=df_toyota_plot_review.Vader_Sentiment,
               name="Vader sentiment score by month (mean)"),
    row=2, col=1)
fig.update_yaxes(title_text="Number of quotes", row=1, col=1)
fig.update_yaxes(title_text="Mean Vader sentiment", row=2, col=1)
fig.update_xaxes(title_text="Month", row=2, col=1)
fig.update_layout(
    title="General Analysis on the Toyota's quotes ")
fig.show()

In [130]:
def find_review_event(sortie, df, n_day, num_day_event):
    """Label the quotes around an event as 'before', 'during' or 'after'.

    Parameters
    ----------
    sortie : datetime.datetime
        Date of the event (e.g. a product release).
    df : pandas.DataFrame
        Quotes with a ``date`` column. It is not modified.
    n_day : int
        Extent, in days, of the 'before' and 'after' periods beyond the event window.
    num_day_event : int
        Half-width, in days, of the event window centred on ``sortie``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall in one of the three periods, concatenated
        in the order before, during, after, with two extra columns:
        ``year_month_day`` (the quote date truncated to midnight) and ``type``
        (the period name). Rows outside all three periods are dropped.

    Notes
    -----
    The event window is inclusive on both ends (``sortie`` +/- ``num_day_event``).
    The 'before' and 'after' periods are exclusive on both ends, so each spans
    ``n_day - 1`` calendar days.
    NOTE(review): confirm the exclusive outer bounds are intended.
    """
    # assign() returns a new frame, so the caller's `df` does not gain a column.
    quotes = df.assign(year_month_day=pd.to_datetime(df['date']).dt.normalize())
    day = quotes['year_month_day']

    event_half_width = dt(days=num_day_event)
    outer_reach = dt(days=n_day + num_day_event)
    start_event = sortie - event_half_width
    end_event = sortie + event_half_width
    n_days_ago = sortie - outer_reach
    n_days_after = sortie + outer_reach

    df_before_event = quotes[(day < start_event) & (day > n_days_ago)].assign(type='before')
    df_event = quotes[(day <= end_event) & (day >= start_event)].assign(type='during')
    df_after_event = quotes[(day < n_days_after) & (day > end_event)].assign(type='after')

    return pd.concat([df_before_event, df_event, df_after_event])

## BoxPlot Event

### Chevrolet

In [114]:
# Released of the Silverado 2019
# Sentiment before / during / after the release window.
n_day = 20
num_day_event = 10
sortie_chev_1 = datetime.strptime('2019-01-15', '%Y-%m-%d')
df = find_review_event(sortie_chev_1, df_chevrolet, n_day, num_day_event)

fig = px.box(df, x="type", y='Vader_Sentiment')
fig.update_traces(quartilemethod="exclusive")
fig.update_layout(
    title="Released of the Silverado 2019",
    xaxis_title='Periods compared to the event',
    yaxis_title='Vader Sentiment Score',
)
fig.show()

In [137]:
# Released of the Camaro (6th generation), April 2018
# Sentiment before / during / after the release window.
n_day = 20
num_day_event = 10
sortie_chev_1 = datetime.strptime('2018-04-15', '%Y-%m-%d')
df = find_review_event(sortie_chev_1, df_chevrolet, n_day, num_day_event)

fig = px.box(df, x="type", y='Vader_Sentiment')
fig.update_traces(quartilemethod="exclusive")
fig.update_layout(
    title="Released of the Camaro 2018 ",
    xaxis_title='Periods compared to the event',
    yaxis_title='Vader Sentiment Score',
)
fig.show()

In [145]:
# Toyota Yaris (February, 2020)
# Sentiment before / during / after the event window, measured on the Toyota
# quotes (this cell previously analysed df_tesla under a Toyota title).
sortie_toyota_1 = datetime.strptime('2020-02-10', '%Y-%m-%d')
n_day = 20
num_day_event = 10
df = find_review_event(sortie_toyota_1, df_toyota, n_day, num_day_event)

fig = px.box(df, x="type", y='Vader_Sentiment')
fig.update_traces(quartilemethod="exclusive")
fig.update_layout(title="Toyota Yaris (February, 2020)",
    yaxis_title='Vader Sentiment Score', xaxis_title='Periods compared to the event')
fig.show()

In [135]:
# Release Cyber Truck 2019 (November)
# Sentiment before / during / after the release window.
n_day = 20
num_day_event = 10
sortie_tesla_1 = datetime.strptime('2019-11-21', '%Y-%m-%d')
df = find_review_event(sortie_tesla_1, df_tesla, n_day, num_day_event)

fig = px.box(df, x="type", y='Vader_Sentiment')
fig.update_traces(quartilemethod="exclusive")
fig.update_layout(
    title="Release Cyber Truck 2019 (November)",
    xaxis_title='Periods compared to the event',
    yaxis_title='Vader Sentiment Score',
)
fig.show()

In [136]:
# Release Model X (September 2015)
# Sentiment before / during / after the release window.
n_day = 20
num_day_event = 10
sortie_tesla_1 = datetime.strptime('2015-09-15', '%Y-%m-%d')
df = find_review_event(sortie_tesla_1, df_tesla, n_day, num_day_event)

fig = px.box(df, x="type", y='Vader_Sentiment')
fig.update_traces(quartilemethod="exclusive")
fig.update_layout(
    title="Release Model X (September 2015)",
    xaxis_title='Periods compared to the event',
    yaxis_title='Vader Sentiment Score',
)
fig.show()

### PieChart

In [141]:
def update_qids_freq(df_old, percentage, type_):
    """Count the rows per value of a list-valued column and keep the frequent values.

    ``df_old`` is exploded on ``type_`` (one row per list element) and grouped
    by that column; the per-group count of the ``date`` column is the frequency.
    A ``percentage`` column gives each value's share of the total, where the
    total still includes the 0 placeholder rows. Only values whose share is
    strictly greater than ``percentage`` and that are not the 0 placeholder
    are returned.
    """
    counts = df_old.explode(type_).groupby(by=type_).count().reset_index()
    total = counts['date'].sum()
    counts['percentage'] = counts['date'] * 100 / total

    is_frequent = counts['percentage'] > percentage
    is_labelled = counts[type_] != 0
    return counts[is_frequent & is_labelled]

#### Occupations 

In [142]:
# Keep only the occupations whose share is above this threshold (in %).
percentage = 1

# Occupation frequencies per brand.
df_t = update_qids_freq(df_tesla, percentage, 'occupation_label')      # Tesla
df_c = update_qids_freq(df_chevrolet, percentage, 'occupation_label')  # Chevrolet

# Create Figure: one pie per brand, side by side.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=('Tesla',  'Chevrolet'),
)
for pie_col, brand_counts in enumerate((df_t, df_c), start=1):
    fig.add_trace(
        go.Pie(labels=brand_counts.occupation_label, values=brand_counts.date),
        row=1, col=pie_col,
    )

fig.show()

#### Nationality

In [144]:
# Keep only the nationalities whose share is above this threshold (in %).
percentage = 0.5

# Nationality frequencies per brand.
df_t = update_qids_freq(df_tesla, percentage, 'nationality_label')       # Tesla
df_c = update_qids_freq(df_chevrolet, percentage, 'nationality_label')   # Chevrolet
df_toyo = update_qids_freq(df_toyota, percentage, 'nationality_label')   # Toyota

# One pie per brand, side by side.
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "pie"}, {"type": "pie"}, {"type": "pie"}]],
    subplot_titles=('Tesla',  'Chevrolet', 'Toyota'),
)
for pie_col, brand_counts in enumerate((df_t, df_c, df_toyo), start=1):
    fig.add_trace(
        go.Pie(labels=brand_counts.nationality_label, values=brand_counts.date),
        row=1, col=pie_col,
    )
fig.show()