In [None]:
import pandas as pd
import numpy as np
import bz2
import json
import findspark
findspark.init('/Users/tatianacogne/spark')
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from datetime import datetime

#### Lookup tables and DataFrames used in this notebook

In [None]:
# QID -> label lookup tables, one per speaker attribute. They are handed to
# map_qids_labels, which matches on a `qids` column and reads the label from
# the second column (by position).
from_qids_to_label_occupation = pd.read_json('data/occupations.json') 
from_qids_to_label_ethnics = pd.read_json('data/ethnic.json' ) 
from_qids_to_label_religion = pd.read_json('data/religion.json') 
from_qids_to_label_nationality = pd.read_json('data/nationality.json') 
from_qids_to_label_party = pd.read_json('data/party.json') 
from_qids_to_label_gender = pd.read_json('data/gender.json') 

# Raw speaker attributes. The processing below reads the columns date_of_birth,
# nationality, gender, ethnic_group, occupation, id, label, religion and party.
df_speakers_attributes = pd.read_parquet('data/speaker_attributes.parquet')

#### Create a spark context 

In [None]:
# Reuse the active SparkSession if there is one, otherwise create it, and keep
# a handle on its SparkContext.
# NOTE(review): neither `spark` nor `sc` is referenced by the cells visible in
# this notebook — confirm they are still needed.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

### Processing the speaker attributes

In [None]:
def map_qids_labels(list_qids, df_):
    """Translate a list of QIDs into the list of their labels.

    Parameters
    ----------
    list_qids : list-like or int
        QIDs to translate. An integer stands for "no value": missing
        attributes are filled with 0 before this function is applied.
    df_ : pandas.DataFrame
        Lookup table with a ``qids`` column; the label is read from the
        second column (by position).

    Returns
    -------
    list or int
        The labels in input order, skipping any ``0`` placeholder found inside
        the list. A QID that is absent from ``df_`` is kept as-is, so a single
        gap in the lookup table does not abort the whole mapping. Returns ``0``
        when ``list_qids`` itself is an integer.
    """
    # isinstance also catches NumPy integers, which `type(x) != int` let
    # through and which then failed when iterated.
    if isinstance(list_qids, (int, np.integer)):
        return 0

    # Build the lookup once per call instead of scanning the frame for every
    # QID. setdefault keeps the first row of a duplicated QID, which is the row
    # a top-down scan of the frame would have returned.
    labels_by_qid = {}
    for qid, label in zip(df_['qids'], df_.iloc[:, 1]):
        labels_by_qid.setdefault(qid, label)

    return [labels_by_qid.get(qid, qid) for qid in list_qids if qid != 0]

In [None]:
def process_speakers_attributes(df_speakers_attributes):
    """Rename the speaker attribute columns and attach human-readable labels.

    Takes the dataframe of the selected speakers and returns a new dataframe
    holding the birth date, the speaker name and the QID columns (missing
    values replaced by 0), plus one ``*_label`` column for occupation, ethnic
    group, religion, nationality, party and gender.
    """
    # Source column -> name used in the rest of the notebook. The key order is
    # also the column order of the returned frame.
    renamed_columns = {
        'date_of_birth': 'birth',
        'nationality': 'nationality_qids',
        'gender': 'gender_qids',
        'ethnic_group': 'ethnics_qids',
        'occupation': 'occupations_qids',
        'id': 'name_qids',
        'label': 'name_label',
        'religion': 'religion_qids',
        'party': 'party_qids',
    }
    df = (
        df_speakers_attributes[list(renamed_columns)]
        .rename(columns=renamed_columns)
        .fillna(0)
    )

    # (label column, QID column, lookup table), applied in this order so the
    # label columns are appended in a fixed sequence.
    label_specs = (
        ('occupation_label', 'occupations_qids', from_qids_to_label_occupation),
        ('ethnics_label', 'ethnics_qids', from_qids_to_label_ethnics),
        ('religion_label', 'religion_qids', from_qids_to_label_religion),
        ('nationality_label', 'nationality_qids', from_qids_to_label_nationality),
        ('party_label', 'party_qids', from_qids_to_label_party),
        ('gender_label', 'gender_qids', from_qids_to_label_gender),
    )
    for label_col, qids_col, lookup in label_specs:
        df[label_col] = df[qids_col].apply(
            lambda qids, lookup=lookup: map_qids_labels(qids, lookup)
        )
    return df
    
    

#### Select only the speakers (QIDs) needed in the analysis

In [None]:
def select_speakers(list_qids):
    """Restrict the speaker attributes to the speakers listed in ``list_qids``.

    Reads the module-level ``df_speakers_attributes`` and returns its rows
    whose ``id`` is one of the given QIDs.
    """
    is_selected = df_speakers_attributes['id'].isin(list_qids)
    return df_speakers_attributes.loc[is_selected]

### Process for the quotes dataframe

#### Load the quotes selected, processed and analysed in parts 1 & 2 (all years)

In [None]:
# Quotes with their sentiment/emotion annotations. The processing below relies
# on the columns qids, quoteID, Emotion, date, quotation, Vader_Sentiment and
# review.
df_sent = pd.read_json('data/sentiment_quotes.json')
df_sent.head(3)

#### Split quotes by timeslots

In [None]:
def select_period(x, time_range):
    """Classify ``x`` relative to the interval ``[time_range[0], time_range[1]]``.

    Returns 'before' when ``x`` is strictly below the lower bound, 'during'
    when it lies inside the interval (both bounds included) and 'after' in
    every other case.
    """
    lower = time_range[0]
    if x < lower:
        return 'before'
    return 'during' if lower <= x <= time_range[1] else 'after'

#### Parse the dates and label each quote's period relative to the event

In [None]:
def process_dataframe_quotes(df_sent, time_range, speakers, explode_by):
    """Prepare the quotes dataframe and attach the speaker attributes.

    Steps:
      1. Rename ``qids`` / ``quoteID`` / ``Emotion`` and keep only the columns
         needed downstream.
      2. Parse ``date`` (format ``'%Y-%m-%d %H:%M:%S'``) into datetime objects
         and add ``year_month``, the first day of the quote's month.
      3. Add ``period`` ('before', 'during' or 'after') by comparing every date
         with ``time_range`` through ``select_period``.
      4. Join with ``speakers`` on ``name_qids`` (inner join, the pandas
         default), so quotes without a matching speaker are dropped.

    Parameters
    ----------
    df_sent : pandas.DataFrame
        Quotes with the columns ``date``, ``qids``, ``quotation``, ``quoteID``,
        ``Vader_Sentiment``, ``Emotion`` and ``review``. It is not modified.
    time_range : sequence of two datetimes, or None
        ``[start, end]`` of the event, both bounds included. With ``None`` no
        ``period`` column is added.
    speakers : pandas.DataFrame
        Speaker attributes; must contain ``name_qids``.
    explode_by : str or list of str
        Accepted for interface compatibility but not applied: exploding here
        would repeat each quote once per list element and inflate per-quote
        counts. Explode the returned frame explicitly when a per-value view is
        needed.
        NOTE(review): the explode step was already disabled in this function —
        confirm it should stay that way.

    Returns
    -------
    pandas.DataFrame
        The processed quotes merged with the speaker columns.
    """
    # Rename the columns and keep only the ones needed. The explicit copy makes
    # the assignments below write to an independent frame instead of a slice
    # (this is what triggers pandas' SettingWithCopyWarning).
    quotes = df_sent.rename(columns={'qids':'name_qids','quoteID':'quote_qids','Emotion':'emotion'})
    quotes = quotes[['date','name_qids','quotation','quote_qids','Vader_Sentiment','emotion','review']].copy()

    # Turn the timestamps into datetime objects; year_month truncates to the month.
    quotes['date'] = quotes['date'].apply(lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S'))
    quotes['year_month'] = quotes['date'].apply(lambda x: datetime.strptime(str(x)[:7], '%Y-%m'))

    # Label each quote relative to the event window; the docstring always
    # promised this column and the period plot depends on it.
    if time_range is not None:
        quotes['period'] = quotes['date'].apply(lambda x: select_period(x, time_range))

    # Join the quotes with the speaker attributes on name_qids.
    return quotes.merge(speakers, on='name_qids')
  
  

### Run the pipeline

In [None]:
# Attributes (with labels) of every speaker that appears in the quotes.
list_qids = df_sent['qids'].drop_duplicates().values
df = select_speakers(list_qids)
df_speakers_att = process_speakers_attributes(df)
df_speakers_att.head(3)

In [None]:
# The event window is the single day 2017-07-28, from its first to its last second.
start = datetime(2017, 7, 28, 0, 0, 0)
end = datetime(2017, 7, 28, 23, 59, 59)
time_range = [start, end]

# Only the speaker name and occupations are joined onto the quotes.
explode_by = 'occupations_qids'
speakers = df_speakers_att[['name_qids', 'name_label', 'occupations_qids']]

# NOTE: this rebinds df_sent to the processed frame, so re-running the cell
# feeds an already-processed frame back into the function.
df_sent = process_dataframe_quotes(df_sent, time_range, speakers, explode_by)
df_sent

In [None]:
# One row per (quote, occupation QID): explode repeats a quote for each element
# of its `occupations_qids` value.
# assumes `occupations_qids` holds list-likes (0 where the attribute was
# missing) — TODO confirm
test = df_sent.explode('occupations_qids')
test

#### Plot 

In [None]:
import plotly.express as px

# The chart needs a `period` column. Derive it from the quote dates when the
# processed frame does not carry one, so the cell does not fail on a missing
# column.
if 'period' in df_sent.columns:
    df_period = df_sent
else:
    df_period = df_sent.assign(
        period=df_sent['date'].apply(lambda date: select_period(date, time_range))
    )

# Bars per period, coloured by emotion and ordered before -> during -> after.
fig = px.bar(df_period, x="period", color="emotion", category_orders={"period": ["before",'during','after']})
fig.show()

In [None]:
# Number of quotes per speaker and emotion, counted as the non-null `date`
# values in each group.
df = (
    df_sent
    .groupby(by=['name_label', 'emotion'])['date']
    .count()
    .reset_index(name='count')
)

In [None]:
# Show the exploded quotes frame built above (one row per quote and occupation QID).
test

In [None]:
# Horizontal stacked bar chart: one bar per speaker, one segment per emotion,
# each segment annotated with its emotion name.
fig = px.bar(
    df,
    x="count",
    y='name_label',
    orientation="h",
    barmode="stack",
    color="emotion",
    text="emotion",
    hover_name="name_label",
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.Vivid,
)

# Treat the speakers as categories and order them by their total count.
fig.update_yaxes(type="category", categoryorder="total ascending")

# Fixed canvas size. As the last expression of the cell, the figure returned
# by update_layout is what gets displayed.
fig.update_layout(autosize=False, width=1500, height=800)