In [9]:
#importing the required modules
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import numpy as np
import seaborn as sns

In [10]:
from tqdm import tqdm, notebook
# Register the tqdm progress helpers on pandas objects, using the notebook (widget) bar.
# `pandas` is a classmethod, so it is called on the class itself: instantiating
# `notebook.tqdm()` first only creates a stray, never-updated "0it" progress bar
# in the cell output.
notebook.tqdm.pandas()

0it [00:00, ?it/s]

In [11]:
# Small adjustments to the default style of plots, making sure it's readable and colorblind-friendly everywhere.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' (the old names were removed in 3.8),
# so use whichever spelling this installation provides; prefer the new name to avoid deprecation warnings.
_colorblind_style = next(
    (name for name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
     if name in plt.style.available),
    'default',  # neither spelling is installed: keep Matplotlib's default style
)
plt.style.use(_colorblind_style)
plt.rcParams.update({'font.size': 12.5,
                     'figure.figsize': (25, 7)})

Get the path to retrieve data from local folder

In [12]:
# Jules:
# Base directory of the local dataset (small sample).
# NOTE(review): machine-specific absolute path; it must be edited before running on another machine.
path = r'/Users/jules/kDrive/onedrive/Documents_Onedrive/EPFL/MA3/ADA/Project/Dataset/'

# Big sample (alternative location, currently disabled)
#path = r'C:\Users\jules\kDrive\onedrive\Documents_Onedrive\EPFL\MA3\ADA\Project\Dataset\BIG_SAMPLE'

#### Use the provided CSV file to map QIDs to meaningful names for the speakers metadata

In [13]:
# Load the bz2-compressed lookup table, indexed by its 'QID' column.
# NOTE(review): the file name is relative, so it is read from the current working
# directory rather than from `path` — confirm this is intended.
df_lookup = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2',compression = 'bz2',index_col='QID')

## Speakers_metadata

### Retrieve all American politicians

In [14]:
def get_polUS(df_speakers, occupation_qid='Q82955', nationality_qid='Q30'):
    """Return the speakers that are both politicians and US nationals.

    Parameters
    ----------
    df_speakers : pandas.DataFrame
        Speaker metadata with an 'occupation' and a 'nationality' column, each
        cell holding a list/array of QID strings (or a missing value).
    occupation_qid : str, default 'Q82955'
        QID that must appear in 'occupation' (the default is the one used for politicians).
    nationality_qid : str, default 'Q30'
        QID that must appear in 'nationality' (default: US; e.g. 'Q142' for French).

    Returns
    -------
    pandas.DataFrame
        A new frame (copy) with the matching rows; `df_speakers` is left untouched.
    """
    # Drop rows with a missing occupation or nationality. A new frame is created
    # instead of using inplace=True so the caller's data is not silently modified.
    df = df_speakers.dropna(axis=0, subset=['occupation', 'nationality'])
    assert df['occupation'].isna().sum() == 0
    assert df['nationality'].isna().sum() == 0

    def has_qid(values, qid):
        # Exact comparison: `item in qid` would be a *substring* test on the QID
        # string, so e.g. 'Q8' or 'Q3' would wrongly match 'Q82955' / 'Q30'.
        return any(item == qid for item in values)

    # astype(bool) keeps the masks boolean even when the frame is empty
    pol_cond = df['occupation'].apply(lambda values: has_qid(values, occupation_qid)).astype(bool)
    us_cond = df['nationality'].apply(lambda values: has_qid(values, nationality_qid)).astype(bool)

    # Return an independent copy so callers can reorder/insert columns safely
    df_polUS = df.loc[us_cond & pol_cond].copy()

    return df_polUS


### Create the age column

In [15]:
from datetime import date, datetime

def get_age(df_speakers, today=None, min_age=15, max_age=110):
    """Add an 'age' column (whole years) computed from 'date_of_birth'.

    Parameters
    ----------
    df_speakers : pandas.DataFrame
        Frame with a 'date_of_birth' column whose cells are lists/arrays of
        strings formatted like '+1950-06-15T00:00:00Z' (or a missing value).
    today : datetime.date, optional
        Reference day for the age computation. Defaults to the current day;
        pass a fixed date to obtain reproducible results.
    min_age, max_age : int, default 15 and 110
        Rows whose age is below `min_age` or above `max_age` are considered
        incoherent and removed. Rows without a computable age are kept (NaN).

    Returns
    -------
    pandas.DataFrame
        A copy of the input where 'date_of_birth' holds the parsed first date
        (or a missing value) and 'age' holds the age in years.
    """
    date_format = '+%Y-%m-%dT%H:%M:%SZ'
    reference_day = date.today() if today is None else today

    def first_entry(values):
        # Only the first listed date is used; None, scalar NaN or an empty
        # list/array all mean "no date available".
        try:
            return values[0]
        except (TypeError, IndexError, KeyError):
            return None

    def str2datetime(string):
        if string is None:
            return None
        try:
            # Unknown month/day are stored as 00; replace them by 01 so the date parses
            return datetime.strptime(string.replace('-00', '-01'), date_format)
        except (AttributeError, TypeError, ValueError):
            # Not a string, or a string that does not follow the expected format
            return None

    def calculate_age(born):
        # Covers both None and NaT (pandas may convert missing dates to NaT)
        if pd.isna(born):
            return float('nan')
        birthday_not_reached = (reference_day.month, reference_day.day) < (born.month, born.day)
        return reference_day.year - born.year - birthday_not_reached

    df = df_speakers.copy()

    # List of raw strings -> single parsed datetime (or missing)
    df['date_of_birth'] = df['date_of_birth'].apply(lambda raw: str2datetime(first_entry(raw)))

    # Age of each speaker on the reference day
    df['age'] = df['date_of_birth'].apply(calculate_age)

    # Remove incoherent ages with a boolean mask (row-exact even if index labels repeat);
    # comparisons with NaN are False, so rows without an age are kept.
    out_of_range = (df['age'] < min_age) | (df['age'] > max_age)

    return df.loc[~out_of_range].copy()

### Create a column containing either the "Democrat" or the "Republican" party

QID Republican Party: Q29468

QID Democratic Party: Q29552

In [16]:
def get_bi_party(df_speakers):
    """Add a 'bi_party' column telling whether a speaker is a Democrat or a Republican.

    Parameters
    ----------
    df_speakers : pandas.DataFrame
        Frame with a 'party' column whose cells are lists/arrays of party QIDs
        (or a missing value).

    Returns
    -------
    pandas.DataFrame
        A copy of the input with a 'bi_party' column set to 'Democrat',
        'Republican', or None when neither QID is listed. When both QIDs are
        listed, 'Republican' wins because it is assigned last.
    """
    democrat_qid = 'Q29552'
    republican_qid = 'Q29468'

    def belongs_to(parties, qid):
        # Exact comparison: `item in qid` would be a substring test, so a value
        # such as 'Q29' would wrongly match both parties.
        try:
            return any(item == qid for item in parties)
        except TypeError:
            # None / NaN: no party information
            return False

    df = df_speakers.copy()

    # astype(bool) keeps the masks boolean even when the frame is empty
    demo_cond = df['party'].apply(lambda parties: belongs_to(parties, democrat_qid)).astype(bool)
    repu_cond = df['party'].apply(lambda parties: belongs_to(parties, republican_qid)).astype(bool)

    df['bi_party'] = None

    df.loc[demo_cond, 'bi_party'] = 'Democrat'
    df.loc[repu_cond, 'bi_party'] = 'Republican'

    return df

##  Create the dataset for linear regression 

In [17]:
def speakers_metadata(df_speakers):
    """Build the politician table: filter with get_polUS, put 'label' first,
    then add the 'age' and 'bi_party' columns."""
    politicians = get_polUS(df_speakers)

    # Move the 'label' column to the first position
    label_column = politicians.pop('label')
    politicians.insert(0, 'label', label_column)

    # Enrich with the age column, then with the bi-party column
    with_age = get_age(politicians)
    return get_bi_party(with_age)

In [18]:
def ready_for_lr(file_path, df_speakers):
    """Load a classified quotes file and attach the speakers metadata to each quote.

    Parameters
    ----------
    file_path : str
        Path to a bz2-compressed JSON-lines file containing at least the
        columns 'qids', 'probas', 'numOccurrences' and 'phase'.
    df_speakers : pandas.DataFrame
        Speakers metadata with an 'id' column holding the speaker QID.

    Returns
    -------
    pandas.DataFrame
        Inner merge of the quotes with `df_speakers` on the quote's first QID
        ('qid_unique' == 'id'); quotes without any QID are discarded.
    """
    # Import quotes dataset (classified)
    df_quotes = pd.read_json(file_path, compression='bz2', lines=True)

    def first_qid(qids):
        # Choose the first QID the quote is associated with (to be improved);
        # an empty or missing list yields None instead of raising.
        try:
            return qids[0]
        except (TypeError, IndexError, KeyError):
            return None

    qid_unique = df_quotes['qids'].apply(first_qid)

    # Drop the columns that are not needed downstream
    col_useless = ['qids', 'probas', 'numOccurrences', 'phase']
    df_quotes = df_quotes.drop(columns=col_useless)

    # Place 'qid_unique' as the fourth column (or last, if fewer columns remain)
    df_quotes.insert(min(3, df_quotes.shape[1]), 'qid_unique', qid_unique)

    # Quotes without a QID cannot be linked to a speaker; remove them explicitly
    # because pandas matches null merge keys with each other.
    df_quotes = df_quotes[df_quotes['qid_unique'].notna()]

    # Merge the quotes with the speakers metadata
    df_quotes_merged = df_quotes.merge(df_speakers, how='inner', left_on='qid_unique', right_on='id')

    return df_quotes_merged

In [19]:
# Get the speakers metadata: read 'speaker_attributes.parquet' from the dataset folder.
# NOTE(review): `path` is re-assigned here with a machine-specific absolute path;
# it must be adapted before running on another machine.
path = r'/Users/jules/kDrive/onedrive/Documents_Onedrive/EPFL/MA3/ADA/Project/Dataset/'
df_speakers = pd.read_parquet(path+'speaker_attributes.parquet')


In [20]:
# Build the politician table (filtered speakers with 'label' first, plus 'age' and 'bi_party')
df_speakers_pol=speakers_metadata(df_speakers)

In [21]:
# Quick look at the computed ages (NaN where no age could be computed)
df_speakers_pol['age']

34163       99.0
37421       99.0
38988      108.0
40213      104.0
40918       51.0
           ...  
5655696      NaN
5655697      NaN
5655698      NaN
5655699      NaN
5655702      NaN
Name: age, Length: 18047, dtype: float64

In [None]:
# Process the sample: pick the classified quotes file, then merge it with the politician table

# Currently selected input: the 2015 sample in the BIG_SAMPLE folder
path = r'/Users/jules/kDrive/onedrive/Documents_Onedrive/EPFL/MA3/ADA/Project/Dataset/BIG_SAMPLE/'
file_path = path+'Sample_2015_classified.json.bz2'

# Alternative input (1 million quotes sample), currently disabled
#path = r'/Users/jules/kDrive/onedrive/Documents_Onedrive/EPFL/MA3/ADA/Project/Dataset/'
#file_path = path+'Sample_classified_1Mio_v1.json.bz2'

# Rebuild the politician table and join it with the quotes on the speaker QID
df_speakers_pol = speakers_metadata(df_speakers)
df = ready_for_lr(file_path, df_speakers_pol)

In [None]:
# List the columns of the merged quotes/speakers table
df.columns

In [None]:
# Save the merged table as bz2-compressed JSON lines (one record per line).
# NOTE(review): `path` already ends with '/', so the resulting file name contains '//';
# this also assumes the 'Results_LR' folder already exists — confirm.
df.to_json(path+'/Results_LR/df_LR_2015.json.bz2',compression='bz2',lines=True,orient="records")