## Data processing
This notebook filters the original datasets in order to keep only what is needed for the project. The second part is dedicated to the QIDs that refer to Wikidata items: it retrieves the label of each of them. These labels are useful to interpret the results at the end.

In [None]:
import bz2
import csv
import json
import os
import re
import sys
import urllib.request
import urllib.request as r
from collections import Counter
from datetime import datetime
from urllib.request import urlopen, Request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [None]:
# Create a spark context
# NOTE(review): `SparkSession` is not imported by the imports cell of this notebook, so on a
# fresh kernel this cell raises NameError unless the name is brought into scope elsewhere
# (presumably `from pyspark.sql import SparkSession` -- TODO confirm).
# NOTE(review): neither `spark` nor `sc` is used by the cells visible in this notebook section.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

### Part 1: Filtering the dataset 
This part filters the original datasets year by year, keeping only the date, the quoteID, the quotation and the QID of the speaker (the first one when several are listed). Quotes without any speaker QID are dropped.

#### 2020

In [None]:
# Source dump for 2020 and destination of the slimmed-down records.
path_to_file = 'data/quotes-2020.json.bz2'
path_to_out = 'data/sp_qids_quotes-2020-process.json.bz2'

# Stream the compressed dump line by line (one JSON record per line) so the
# whole file never has to be held in memory.
with bz2.open(path_to_file, 'rb') as source, bz2.open(path_to_out, 'wb') as sink:
    for raw_line in source:
        quote = json.loads(raw_line)
        # Drop the first 11 characters of the quote identifier.
        quote['quoteID'] = quote['quoteID'][11:]
        # Records without any speaker QID are not written out.
        if len(quote['qids']) == 0:
            continue
        # Only the first speaker QID is kept.
        quote['qids'] = quote['qids'][0]
        slim_quote = {
            field: value
            for field, value in quote.items()
            if field in ('date', 'quoteID', 'quotation', 'qids')
        }
        sink.write((json.dumps(slim_quote) + '\n').encode('utf-8'))

#### 2019

In [None]:
# Source dump for 2019 and destination of the slimmed-down records.
path_to_file = 'data/quotes-2019.json.bz2'
path_to_out = 'data/quotes-2019-process.json.bz2'

# Stream the compressed dump line by line (one JSON record per line) so the
# whole file never has to be held in memory.
with bz2.open(path_to_file, 'rb') as source, bz2.open(path_to_out, 'wb') as sink:
    for raw_line in source:
        quote = json.loads(raw_line)
        # Drop the first 11 characters of the quote identifier.
        quote['quoteID'] = quote['quoteID'][11:]
        # Records without any speaker QID are not written out.
        if len(quote['qids']) == 0:
            continue
        # Only the first speaker QID is kept.
        quote['qids'] = quote['qids'][0]
        slim_quote = {
            field: value
            for field, value in quote.items()
            if field in ('date', 'quoteID', 'quotation', 'qids')
        }
        sink.write((json.dumps(slim_quote) + '\n').encode('utf-8'))

#### 2018

In [None]:
# Source dump for 2018 and destination of the slimmed-down records.
path_to_file = 'data/quotes-2018.json.bz2'
path_to_out = 'data/quotes-2018-process.json.bz2'

# Stream the compressed dump line by line (one JSON record per line) so the
# whole file never has to be held in memory.
with bz2.open(path_to_file, 'rb') as source, bz2.open(path_to_out, 'wb') as sink:
    for raw_line in source:
        quote = json.loads(raw_line)
        # Drop the first 11 characters of the quote identifier.
        quote['quoteID'] = quote['quoteID'][11:]
        # Records without any speaker QID are not written out.
        if len(quote['qids']) == 0:
            continue
        # Only the first speaker QID is kept.
        quote['qids'] = quote['qids'][0]
        slim_quote = {
            field: value
            for field, value in quote.items()
            if field in ('date', 'quoteID', 'quotation', 'qids')
        }
        sink.write((json.dumps(slim_quote) + '\n').encode('utf-8'))

#### 2017

In [None]:
# Source dump for 2017 and destination of the slimmed-down records.
path_to_file = 'data/quotes-2017.json.bz2'
path_to_out = 'data/quotes-2017-process.json.bz2'

# Stream the compressed dump line by line (one JSON record per line) so the
# whole file never has to be held in memory.
with bz2.open(path_to_file, 'rb') as source, bz2.open(path_to_out, 'wb') as sink:
    for raw_line in source:
        quote = json.loads(raw_line)
        # Drop the first 11 characters of the quote identifier.
        quote['quoteID'] = quote['quoteID'][11:]
        # Records without any speaker QID are not written out.
        if len(quote['qids']) == 0:
            continue
        # Only the first speaker QID is kept.
        quote['qids'] = quote['qids'][0]
        slim_quote = {
            field: value
            for field, value in quote.items()
            if field in ('date', 'quoteID', 'quotation', 'qids')
        }
        sink.write((json.dumps(slim_quote) + '\n').encode('utf-8'))

#### 2016

In [None]:
# Source dump for 2016 and destination of the slimmed-down records.
path_to_file = 'data/quotes-2016.json.bz2'
path_to_out = 'data/quotes-2016-process.json.bz2'

# Stream the compressed dump line by line (one JSON record per line) so the
# whole file never has to be held in memory.
with bz2.open(path_to_file, 'rb') as source, bz2.open(path_to_out, 'wb') as sink:
    for raw_line in source:
        quote = json.loads(raw_line)
        # Drop the first 11 characters of the quote identifier.
        quote['quoteID'] = quote['quoteID'][11:]
        # Records without any speaker QID are not written out.
        if len(quote['qids']) == 0:
            continue
        # Only the first speaker QID is kept.
        quote['qids'] = quote['qids'][0]
        slim_quote = {
            field: value
            for field, value in quote.items()
            if field in ('date', 'quoteID', 'quotation', 'qids')
        }
        sink.write((json.dumps(slim_quote) + '\n').encode('utf-8'))

#### 2015

In [None]:
# Source dump for 2015 and destination of the slimmed-down records.
path_to_file = 'data/quotes-2015.json.bz2'
path_to_out = 'data/quotes-2015-process.json.bz2'

# Stream the compressed dump line by line (one JSON record per line) so the
# whole file never has to be held in memory.
with bz2.open(path_to_file, 'rb') as source, bz2.open(path_to_out, 'wb') as sink:
    for raw_line in source:
        quote = json.loads(raw_line)
        # Drop the first 11 characters of the quote identifier.
        quote['quoteID'] = quote['quoteID'][11:]
        # Records without any speaker QID are not written out.
        if len(quote['qids']) == 0:
            continue
        # Only the first speaker QID is kept.
        quote['qids'] = quote['qids'][0]
        slim_quote = {
            field: value
            for field, value in quote.items()
            if field in ('date', 'quoteID', 'quotation', 'qids')
        }
        sink.write((json.dumps(slim_quote) + '\n').encode('utf-8'))

### Part 2: Speaker attributes
This part collects all the QIDs that may be useful and finds their labels.

Load the speaker attributes and rename the columns to keep the naming coherent in the following processing.

In [None]:
# Original column name -> name used in the rest of the notebook.
# The key order also fixes the column order of `df_sp`.
speaker_columns = {
    'id': 'speaker_qids',
    'label': 'speaker_label',
    'nationality': 'nationality_qids',
    'occupation': 'occupation_qids',
    'ethnic_group': 'ethnic_qids',
    'party': 'party_qids',
    'religion': 'religion_qids',
}

# Read the speaker attributes, keep only the columns of interest and rename them.
df_sp = (
    pd.read_parquet('data/speaker_attributes.parquet')[list(speaker_columns)]
    .copy()
    .rename(columns=speaker_columns)
)

List of all the QIDs found in the attributes of the speakers. Each column of the speaker dataframe has its own list of QIDs, and the labels are retrieved column by column. A table is then created for each column with the QIDs and the corresponding labels. Some URLs did not work (e.g. error 404); the corresponding QIDs are mentioned in each section.
There were more than 14,000 occupations, so the list had to be split into several sub-lists in order to find the labels.

In [None]:
# Unique QIDs referenced by each attribute column of the speaker table.
# Every cell of these columns holds a list-like of QIDs (or a missing value), so the
# column is first flattened with `explode()` and then de-duplicated.
# `Series.explode` takes no column name (its only parameter is `ignore_index`), so it
# is called without arguments. All five results are plain arrays of values.
nationality_qids = df_sp['nationality_qids'].explode().drop_duplicates().values
occupation_qids = df_sp['occupation_qids'].explode().drop_duplicates().values
ethnic_qids = df_sp['ethnic_qids'].explode().drop_duplicates().values
party_qids = df_sp['party_qids'].explode().drop_duplicates().values
religion_qids = df_sp['religion_qids'].explode().drop_duplicates().values

In [None]:
def find_label(qids):
    """Return the label shown on the Wikidata page of a single QID.

    The page ``https://www.wikidata.org/wiki/<qids>`` is downloaded and parsed with
    Beautiful Soup; the label is the text of the ``span`` element whose class is
    ``wikibase-title-label``.

    Parameters
    ----------
    qids : str
        Wikidata identifier, appended as-is to the base URL.

    Returns
    -------
    str
        Text of the label element of the page.

    Raises
    ------
    urllib.error.HTTPError
        If the page cannot be retrieved (e.g. error 404).
    ValueError
        If the page contains no ``wikibase-title-label`` element.
    """
    url = 'https://www.wikidata.org/wiki/'
    request = r.Request(url + qids, headers={'Connection': 'close'})
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(request) as response:
        page = response.read()
    # Name the parser explicitly: without it Beautiful Soup picks whichever parser
    # happens to be installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(page, 'html.parser')
    label_tag = soup.find("span", {"class": "wikibase-title-label"})
    if label_tag is None:
        raise ValueError('No Wikidata label element found for {!r}'.format(qids))
    return label_tag.text

#### Religion QIDS - label

In [None]:
# Keep real QIDs only: `pd.notna` discards both `None` and the NaN placeholders that
# `explode()` produces for empty lists (a NaN would make `find_label` fail when it is
# concatenated to the URL).
religion_qids = [qid for qid in religion_qids if pd.notna(qid)]

# One row per religion QID, with the label scraped from its Wikidata page.
df_religion = pd.DataFrame({'qids': religion_qids})
df_religion['label'] = df_religion['qids'].apply(find_label)

In [None]:
# Save the religion QID/label table as JSON (no `orient` given, so pandas' default layout is used).
df_religion.to_json('data/religion.json')

#### Nationality QIDS - label

In [None]:
# Keep real QIDs only: `pd.notna` discards both `None` and the NaN placeholders that
# `explode()` produces for empty lists (a NaN would make `find_label` fail when it is
# concatenated to the URL).
nationality_qids = [qid for qid in nationality_qids if pd.notna(qid)]

# One row per nationality QID, with the label scraped from its Wikidata page.
df_nationality = pd.DataFrame({'qids': nationality_qids})
df_nationality['label'] = df_nationality['qids'].apply(find_label)

In [None]:
# Save the nationality QID/label table as JSON (no `orient` given, so pandas' default layout is used).
df_nationality.to_json('data/nationality.json')

#### Ethnic QIDS - label

In [None]:
# Keep real QIDs only: `pd.notna` discards both `None` and the NaN placeholders that
# `explode()` produces for empty lists (a NaN would make `find_label` fail when it is
# concatenated to the URL).
ethnic_qids = [qid for qid in ethnic_qids if pd.notna(qid)]

# One row per ethnic-group QID, with the label scraped from its Wikidata page.
df_ethnic = pd.DataFrame({'ethnic_qids': ethnic_qids})
df_ethnic['ethnic_label'] = df_ethnic['ethnic_qids'].apply(find_label)

In [None]:
# Save the ethnic-group QID/label table as JSON (no `orient` given, so pandas' default layout is used).
df_ethnic.to_json('data/ethnic.json')

#### Party QIDS - label 
- n° of party: 9'632
- Problem: Q99761286 - <Response [404]>

In [None]:
# Keep real QIDs only: `pd.notna` discards both `None` and the NaN placeholders that
# `explode()` produces for empty lists (a NaN would make `find_label` fail when it is
# concatenated to the URL).
party_qids = [qid for qid in party_qids if pd.notna(qid)]

df_party = pd.DataFrame({'party_qids': party_qids})
# Q99761286 is left out of the lookup. The filtered frame is copied so that the new
# column below is added to an independent frame rather than to a slice of the
# unfiltered one (avoids SettingWithCopyWarning); row labels are left unchanged.
df_party = df_party[df_party['party_qids'] != 'Q99761286'].copy()
df_party['party_label'] = df_party['party_qids'].apply(find_label)

In [None]:
# Save the party QID/label table as JSON (no `orient` given, so pandas' default layout is used).
df_party.to_json('data/party.json')

#### Occupation QIDS - label
- The list needs to be split into 5 sub-lists in order to be more efficient
- QIDs excluded from sub-lists 3 to 5: 'Q57557390', 'Q102046591', 'Q98384826', 'Q105645755', 'Q99753484','Q56411328', 'Q96143085', 'Q96144081', 'Q3738699', 'Q5568256'

In [None]:
# Keep real QIDs only: `pd.notna` discards both `None` and the NaN placeholders that
# `explode()` produces for empty lists.
occupation_qids = [qid for qid in occupation_qids if pd.notna(qid)]
l = len(occupation_qids)

# Boundaries of the first two sub-lists; the remaining ones are cut at 5000 and 10000.
l1 = 500
l2 = 1000
occupation_qids_1 = occupation_qids[:l1]
occupation_qids_2 = occupation_qids[l1:l2]
occupation_qids_3 = occupation_qids[l2:5000]
occupation_qids_4 = occupation_qids[5000:10000]
occupation_qids_5 = occupation_qids[10000:]

# QIDs left out of the label lookup. As in the original cell, they are removed from
# sub-lists 3 to 5 only.
# NOTE(review): presumably none of them falls in the first 1000 entries -- TODO confirm.
a = ['Q57557390', 'Q102046591', 'Q98384826', 'Q105645755', 'Q99753484','Q56411328','Q96143085', 'Q96144081','Q3738699','Q5568256']
excluded_qids = set(a)  # set membership instead of scanning the list for every QID
occupation_qids_3 = [qid for qid in occupation_qids_3 if qid not in excluded_qids]
occupation_qids_4 = [qid for qid in occupation_qids_4 if qid not in excluded_qids]
occupation_qids_5 = [qid for qid in occupation_qids_5 if qid not in excluded_qids]

In [None]:
# Scrape the labels of every occupation sub-list and store each one in its own file
# (data/occupation1.json ... data/occupation5.json), which are the files read back by
# the following cell. The original cell only handled sub-list 5 and had to be edited
# and re-run by hand for the other ones.
occupation_chunks = [
    occupation_qids_1,
    occupation_qids_2,
    occupation_qids_3,
    occupation_qids_4,
    occupation_qids_5,
]
for chunk_number, chunk_qids in enumerate(occupation_chunks, start=1):
    chunk_path = 'data/occupation{}.json'.format(chunk_number)
    if os.path.exists(chunk_path):
        # Already scraped in a previous run: reuse the file instead of downloading
        # thousands of pages again. Delete the file to force a refresh.
        df_occupation = pd.read_json(chunk_path)
        continue
    df_occupation = pd.DataFrame({'occupation_qids': chunk_qids})
    df_occupation['occupation_label'] = df_occupation['occupation_qids'].apply(find_label)
    df_occupation.to_json(chunk_path)

In [None]:
# Read back the five occupation sub-tables saved earlier.
df1 = pd.read_json('data/occupation1.json')
df2 = pd.read_json('data/occupation2.json')
df3 = pd.read_json('data/occupation3.json')
df4 = pd.read_json('data/occupation4.json')
df5 = pd.read_json('data/occupation5.json')

# Stack all five parts: the original concatenation stopped at df3, silently dropping
# the occupations stored in df4 and df5. Row labels are kept as loaded from each file.
df = pd.concat([df1, df2, df3, df4, df5])