# Data enriching

In [1]:
import pandas as pd
import numpy as np

from scripts.qid_to_label import *

In [2]:
# Folder holding the generated/intermediate data files.
# The trailing slash matters: every path in this notebook is built by plain string concatenation.
PATH_GENERATED_DATA = 'generated_data/'

# Parquet file read into `parquet_df` further down (speaker attributes, judging by its name).
PARQUET_FILE = f"{PATH_GENERATED_DATA}speaker_attributes.parquet"

In [3]:
%load_ext autoreload
# so there is no need to restart the kernel / re-import when the .py files are modified
%autoreload 2

## QIDs

**Description:** This section of the notebook adds speaker attributes to the selected quotes dataframe and then converts the QIDs to readable text. At the end of the notebook, the enriched dataframe is saved as a .pkl file so that it can be used for the other questions.

*TO DO:*
 - Find a better way to select which QIDs to keep when there are many
     - POLITICAL PARTY: find a dataset with corresponding dates and only keep the party with date == quotation date
 - Fix SettingWithCopyWarning
 - Fix the problems that appear when rows with no QID for the speaker are not removed (we currently remove them)

In [4]:
# Load the selected quotes: one pickled dataframe per name below (one per year, judging by
# the names), stacked into a single dataframe with a fresh 0..n-1 index.
dataframesNames = ('QOI_2015_DF','QOI_2016_DF','QOI_2017_DF','QOI_2018_DF','QOI_2019_DF','QOI_2020_DF')
pickle_paths = [f"{PATH_GENERATED_DATA}{name}.pkl" for name in dataframesNames]
df = pd.concat([pd.read_pickle(path) for path in pickle_paths], ignore_index=True)

In [5]:
# row count of the concatenated selected-quotes dataframe, before any merge
len(df)

87161

In [5]:
# read the whole parquet file at PARQUET_FILE into memory; %time reports how long the read takes
%time parquet_df = pd.read_parquet(PARQUET_FILE)

Wall time: 16.8 s


In [6]:
# merge_df is not defined in this notebook (presumably it comes from the star import of
# scripts.qid_to_label), so its join logic is not visible here.
# NOTE(review): this call emits the SettingWithCopyWarning shown in the output below;
# the fix most likely belongs inside merge_df — confirm in the script.
%time merged_df = merge_df(df,parquet_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Wall time: 7.68 s


In [7]:
qids_onlyquotebank_path = PATH_GENERATED_DATA + "wikidata_labels_descriptions_quotebank.csv.bz2"
column_names = ('nationality', 'gender', 'ethnic_group', 'occupation', 'party', 'academic_degree', 'religion')
%time merged_df = qid_to_label(merged_df, qids_onlyquotebank_path, column_names)

Wall time: 1.25 s


In [8]:
# preview the first rows after the qid_to_label call
merged_df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,aliases,...,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,2015-03-09-004706,Anything less than women winning 50 per cent o...,Katy Gallagher,Q463507,2015-03-09 12:30:00,1,"[[Katy Gallagher, 0.5872], [None, 0.4128]]",[http://www.smh.com.au/act-news/women-need-to-...,E,"[Katherine Gallagher, Katherine Ruth Gallagher]",...,,,politician,Australian Labor Party,,Q463507,Katy Gallagher,,item,
1,2017-05-11-081817,"more family-friendly and flexible workplaces, ...",Katy Gallagher,Q463507,2017-05-11 00:00:37,1,"[[Katy Gallagher, 0.5596], [None, 0.4404]]",[http://www.mediamaxnetwork.co.ke/news/325569/...,E,"[Katherine Gallagher, Katherine Ruth Gallagher]",...,,,politician,Australian Labor Party,,Q463507,Katy Gallagher,,item,
2,2017-06-23-130636,We need more women and parents in Parliament. ...,Katy Gallagher,Q463507,2017-06-23 03:20:00,1,"[[Katy Gallagher, 0.4996], [None, 0.4403], [La...",[http://www.harpersbazaar.com/culture/features...,E,"[Katherine Gallagher, Katherine Ruth Gallagher]",...,,,politician,Australian Labor Party,,Q463507,Katy Gallagher,,item,
3,2015-04-24-025718,I'd like to congratulate all the winners and f...,Helena Morrissey,Q23762081,2015-04-24 15:33:00,1,"[[Helena Morrissey, 0.8706], [None, 0.1294]]",[http://www.cipd.co.uk/PM/peoplemanagement/b/w...,E,,...,,,business executive,,,Q23762081,Helena Morrissey,,item,
4,2015-04-08-011609,Clearly we have got a long way to go before we...,Helena Morrissey,Q23762081,2015-04-08 18:28:01,1,"[[Helena Morrissey, 0.5805], [None, 0.2061], [...",[http://feeds.theguardian.com/c/34708/f/663879...,E,,...,,,business executive,,,Q23762081,Helena Morrissey,,item,


In [9]:
# If every row has phase 'E' there is no point in keeping the phase column:
# compare the number of 'E' rows with the total number of rows.
phase_e_count = merged_df['phase'].eq('E').sum()
phase_e_count == merged_df.shape[0]

True

In [10]:
# Columns not carried over to the enriched dataframe: the raw identifier columns
# ('qids', 'id') and 'phase', which the check above showed to be 'E' on every row.
redundant_columns = ['qids', 'id', 'phase']
df_enriched = merged_df.drop(columns=redundant_columns)

In [11]:
# preview the enriched dataframe after dropping the unused columns
df_enriched.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,probas,urls,aliases,date_of_birth,nationality,...,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
0,2015-03-09-004706,Anything less than women winning 50 per cent o...,Katy Gallagher,2015-03-09 12:30:00,1,"[[Katy Gallagher, 0.5872], [None, 0.4128]]",[http://www.smh.com.au/act-news/women-need-to-...,"[Katherine Gallagher, Katherine Ruth Gallagher]",[+1970-03-17T00:00:00Z],Australia,...,1349152158,,,politician,Australian Labor Party,,Katy Gallagher,,item,
1,2017-05-11-081817,"more family-friendly and flexible workplaces, ...",Katy Gallagher,2017-05-11 00:00:37,1,"[[Katy Gallagher, 0.5596], [None, 0.4404]]",[http://www.mediamaxnetwork.co.ke/news/325569/...,"[Katherine Gallagher, Katherine Ruth Gallagher]",[+1970-03-17T00:00:00Z],Australia,...,1349152158,,,politician,Australian Labor Party,,Katy Gallagher,,item,
2,2017-06-23-130636,We need more women and parents in Parliament. ...,Katy Gallagher,2017-06-23 03:20:00,1,"[[Katy Gallagher, 0.4996], [None, 0.4403], [La...",[http://www.harpersbazaar.com/culture/features...,"[Katherine Gallagher, Katherine Ruth Gallagher]",[+1970-03-17T00:00:00Z],Australia,...,1349152158,,,politician,Australian Labor Party,,Katy Gallagher,,item,
3,2015-04-24-025718,I'd like to congratulate all the winners and f...,Helena Morrissey,2015-04-24 15:33:00,1,"[[Helena Morrissey, 0.8706], [None, 0.1294]]",[http://www.cipd.co.uk/PM/peoplemanagement/b/w...,,[+1966-03-22T00:00:00Z],United Kingdom,...,1393002378,,,business executive,,,Helena Morrissey,,item,
4,2015-04-08-011609,Clearly we have got a long way to go before we...,Helena Morrissey,2015-04-08 18:28:01,1,"[[Helena Morrissey, 0.5805], [None, 0.2061], [...",[http://feeds.theguardian.com/c/34708/f/663879...,,[+1966-03-22T00:00:00Z],United Kingdom,...,1393002378,,,business executive,,,Helena Morrissey,,item,


In [14]:
# Persist the enriched dataframe as a pickle in the generated-data folder
enriched_pickle_path = f"{PATH_GENERATED_DATA}df_enriched.pkl"
df_enriched.to_pickle(enriched_pickle_path)

In [15]:
# row count of the enriched dataframe that was saved
len(df_enriched.index)

55328