In [1]:
import bz2
import json
import numpy as np
import pandas as pd
import spacy
import pickle
from pathlib import Path
from collections import defaultdict
from datetime import datetime

from IPython.display import display, HTML

from ressources import config

In [2]:
# Shortcuts for the raw and generated data folders declared in ressources.config
RAW_DATA_FOLDER = config.RAW_DATA_FOLDER
GENERATED_DATA_FOLDER = config.GENERATED_DATA_FOLDER

## WIKIDATA Speaker Dataset
We have two Wikidata datasets. This section deals with the first one, which contains information about all the speakers in Quotebank. From it, we want to extract the gender of each speaker based on their QID.

In [None]:
# Load the raw Wikidata speaker table (parquet) and keep only the columns used below
speaker_columns = ['id', 'aliases', 'label', 'date_of_birth', 'gender']
wiki = pd.read_parquet(RAW_DATA_FOLDER / 'WIKIDATA')[speaker_columns]
# Sanity check: every Wikidata id is expected to appear exactly once (no duplicated speaker)
print(wiki['id'].is_unique)
print(wiki.shape)
wiki.sample(2)

In [None]:
# Function to get rid of list in a column

def try_join(cell):
    """Collapse an iterable cell into one space-separated string.

    Every element is converted with ``str`` before joining. When ``cell`` cannot
    be iterated (e.g. ``None`` or a float NaN), ``np.nan`` is returned instead.
    """
    try:
        pieces = [str(item) for item in cell]
    except TypeError:
        # Not iterable: treat the cell as missing
        return np.nan
    return ' '.join(pieces)

In [6]:
# Flatten the list-valued gender column into one string per speaker
wiki['genderlabel'] = list(map(try_join, wiki['gender']))

We want to convert the gender id to a readable label (Q6581097 = male and Q6581072 = female). If the gender is not one of these two, we mark it as "other". We are aware that a person with two or more recorded genders is also labelled "other", but this concerned only a very small percentage of the Quotebank speakers when we applied it.

In [7]:
# Translate the Wikidata gender QIDs into readable labels; any key that is not
# listed falls back to 'other' through the defaultdict factory.
# NOTE: not idempotent - running this cell a second time maps the labels
# themselves ('male', 'female') to 'other'.
gender_dict = defaultdict(lambda: 'other', {"Q6581097": 'male', "Q6581072": 'female'})
wiki['genderlabel'] = wiki['genderlabel'].map(gender_dict)

In [8]:
# Sanity check: the label column should now contain only 'male', 'female' and 'other'
wiki['genderlabel'].value_counts()

male      5418464
other     1953347
female    1684170
Name: genderlabel, dtype: int64

The next step is to parse the date of birth. There are quite a lot of errors in the dataset, so we handle them as well as we can in the function `parse_date`.

In [9]:
# Flatten the list-valued date_of_birth column into one string per speaker
wiki['date_of_birth'] = list(map(try_join, wiki['date_of_birth']))

In [10]:
# Preview the raw date-of-birth strings before they are parsed
wiki['date_of_birth'].head(5)

0    +1732-02-22T00:00:00Z
1    +1952-03-11T00:00:00Z
2    +1868-08-23T00:00:00Z
3    +1946-07-06T00:00:00Z
4    +1599-06-06T00:00:00Z
Name: date_of_birth, dtype: object

In [11]:
def parse_date(date):
    """Convert a Wikidata timestamp string into a 'dd.mm.YYYY' date string.

    Parameters
    ----------
    date : object
        Expected to look like '+1732-02-22T00:00:00Z' (21 characters).

    Returns
    -------
    str or float
        The date formatted with '%d.%m.%Y', or ``np.nan`` when the value is not
        a string, is empty, has a length that is not a multiple of 21, or
        starts with '-'.

    Raises
    ------
    ValueError
        When the timestamp cannot be parsed for a reason other than an
        out-of-range day; the offending value is printed before re-raising.
    """
    try:
        if not isinstance(date, str):
            return np.nan
        # An empty string would pass the length test (0 % 21 == 0) and then crash
        # on date[0]; the joined string 'None' (length 4) is rejected here as well.
        elif len(date) == 0 or len(date) % 21 != 0:
            return np.nan
        # Timestamps with a leading '-' are not handled
        elif date[0] == '-':
            return np.nan
        # Fully unknown date
        elif '+0000-00-00T00:00:00Z' in date[0:21]:
            return '01.01.0001'
        # Only the year is known: month and day default to 01.01
        elif '-00-00T00:00:00' in date[0:21]:
            return datetime.strptime(date[0:21], '+%Y-00-00T00:00:00Z').strftime('%d.%m.%Y')
        # Only year and month are known: the day defaults to 01
        elif '-00T00:00:00' in date[0:21]:
            return datetime.strptime(date[0:21], '+%Y-%m-00T00:00:00Z').strftime('%d.%m.%Y')
        # Hard-coded correction for one invalid date (1939 has no 29 February)
        elif '+1939-02-29T00:00:00Z' in date:
            return '28.02.1939'
        else:
            return datetime.strptime(date[0:21], '+%Y-%m-%dT%H:%M:%SZ').strftime('%d.%m.%Y')
    except ValueError as e:
        # NOTE(review): this relies on the exact wording of the interpreter's error
        # message - confirm it still matches on the Python version in use.
        if str(e) == 'day is out of range for month':
            # Invalid day for that month: keep year and month, fall back to the 1st
            return datetime.strptime(date[0:8], '+%Y-%m').strftime('%d.%m.%Y')
        else:
            print(date)
            raise e

In [12]:
# Standardise every date of birth with parse_date (NaN when it cannot be used)
wiki['dob_std'] = wiki['date_of_birth'].map(parse_date)

In [13]:
# Quick look at the standardised dates
wiki['dob_std'].head()

0    22.02.1732
1    11.03.1952
2    23.08.1868
3    06.07.1946
4    06.06.1599
Name: dob_std, dtype: object

In [25]:
# Keep only the processed columns: the raw 'gender' and 'date_of_birth' columns are
# superseded by 'genderlabel' and 'dob_std'. Dropping both here makes a fresh
# Restart & Run All yield the column list shown in the recorded output.
wiki = wiki.drop(columns=['gender', 'date_of_birth'])
list(wiki)

['id', 'aliases', 'label', 'genderlabel', 'dob_std']

In [26]:
# Display the cleaned speaker table
wiki

Unnamed: 0,id,aliases,label,genderlabel,dob_std
0,Q23,"[Washington, President Washington, G. Washingt...",George Washington,male,22.02.1732
1,Q42,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",Douglas Adams,male,11.03.1952
2,Q1868,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",Paul Otlet,male,23.08.1868
3,Q207,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",George W. Bush,male,06.07.1946
4,Q297,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",Diego Velázquez,male,06.06.1599
...,...,...,...,...,...
9055976,Q106406560,[Barker Howard],Barker B. Howard,male,
9055977,Q106406571,[Charles Macomber],Charles H. Macomber,male,
9055978,Q106406588,,Dina David,female,01.04.1848
9055979,Q106406593,,Irma Dexinger,female,18.03.1899


In [35]:
# Save the cleaned speaker table as a pickle file
wiki.to_pickle('../generated/WIKIDATA/df_wikidata_speakers.pickle')

In [None]:
#with open('../generated/WIKIDATA/df_wikidata_final.pickle', 'wb') as f: 
    #pickle.dump(wiki, f)

In [5]:
# Reload the pickle file to make sure the table was saved properly
with open('../generated/WIKIDATA/df_wikidata_speakers.pickle', 'rb') as f:
    df_wikidata = pickle.load(f)

In [6]:
# Inspect the reloaded table
df_wikidata

Unnamed: 0,id,aliases,label,genderlabel,dob_std
0,Q23,"[Washington, President Washington, G. Washingt...",George Washington,male,22.02.1732
1,Q42,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",Douglas Adams,male,11.03.1952
2,Q1868,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",Paul Otlet,male,23.08.1868
3,Q207,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",George W. Bush,male,06.07.1946
4,Q297,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",Diego Velázquez,male,06.06.1599
...,...,...,...,...,...
9055976,Q106406560,[Barker Howard],Barker B. Howard,male,
9055977,Q106406571,[Charles Macomber],Charles H. Macomber,male,
9055978,Q106406588,,Dina David,female,01.04.1848
9055979,Q106406593,,Irma Dexinger,female,18.03.1899


In [72]:
# Drop the aliases and index the remaining columns by Wikidata id
df_wikidata_dico = df_wikidata.drop(columns='aliases').set_index('id')

In [73]:
# Build a nested dictionary {index value: {column: value}} from the table
dict_wikidata_people = df_wikidata_dico.to_dict(orient='index')

In [76]:
# Save the people dictionary as a pickle file, using the most recent pickle protocol
with open('../generated/WIKIDATA/dict_wikidata_people.pickle', 'wb') as g:
    pickle.dump(dict_wikidata_people, g, protocol=pickle.HIGHEST_PROTOCOL)

# Parsing WIKIDATA for release date of film
This section parses the Wikidata dump in order to get the release dates of films and the gender of actors/crew based on their IMDb id

In [None]:
import gzip
import pickle
import bz2
import json

path_to_file = '../data/wikidata-20211122-all.json.gz'
path_to_out_nconst = '../temp/QUOTEBANK/nconst_dict.bz2.json'
path_to_out_tconst = '../temp/QUOTEBANK/tconst_dict.bz2.json'

# NOTE: unpickling can execute arbitrary code; only load pickles generated by this project.
with open('../generated/IMDb/nconst_list.pickle', 'rb') as f:
    nconst_list = pickle.load(f)
with open('../generated/IMDb/tconst_list.pickle', 'rb') as f:
    tconst_list = pickle.load(f)

# Membership is tested once per dump entity, so use sets (constant-time lookup)
# instead of scanning the lists.
# Assumes the pickled IDs are hashable IMDb id strings - TODO confirm.
nconst_ids = set(nconst_list)
tconst_ids = set(tconst_list)


def _parse_dump_line(raw_line):
    """Decode one raw line of the dump into an entity dict, or None when it holds no entity.

    The dump is a JSON array with one entity per line; entity lines end with a
    ',' separator except the last one, so the separator is stripped only when
    it is actually present (slicing off two characters would truncate the last
    entity).
    """
    text = raw_line.decode('utf-8').strip().rstrip(',')
    if text in ('', '[', ']'):
        return None
    try:
        entity = json.loads(text)
    except json.JSONDecodeError:
        # Malformed line: skip it (best effort), without hiding unrelated errors
        return None
    return entity if isinstance(entity, dict) else None


def _claim_value(claim, field=None):
    """Return the main value of one claim (or one field of it), None when absent."""
    try:
        value = claim['mainsnak']['datavalue']['value']
        return value if field is None else value[field]
    except (KeyError, TypeError):
        return None


def _first_value(claims, prop, field=None):
    """Return the main value of the first claim of `prop` (or one field of it), None when absent."""
    try:
        first_claim = claims[prop][0]
    except (KeyError, IndexError, TypeError):
        return None
    return _claim_value(first_claim, field)


def _is_us_release(claim):
    """Tell whether a claim carries a P291 (place of publication) qualifier equal to Q30 (USA)."""
    for qualifier in (claim.get('qualifiers') or {}).get('P291', []):
        try:
            if qualifier['datavalue']['value']['id'] == 'Q30':
                return True
        except (KeyError, TypeError):
            continue
    return False


def _person_record(imdb_id, claims):
    """Build the output record of a person: IMDb id, gender (P21) and date of birth (P569)."""
    return {
        'nconst': imdb_id,
        'gender': _first_value(claims, 'P21', 'id'),
        'dob': _first_value(claims, 'P569', 'time'),
    }


def _release_record(imdb_id, claims):
    """Build the output record of a title: IMDb id, release place and release date (P577).

    A release date qualified with the USA as place of publication is preferred
    (the last one wins when there are several). Otherwise the first release
    date is used with place 'Unknown'. Both fields are None when no date exists.
    """
    place, date = None, None
    for claim in claims.get('P577', []):
        claim_date = _claim_value(claim, 'time')
        # A claim whose qualifiers lack P291 is simply not a US release; it must
        # not discard the dates carried by the other claims.
        if claim_date is not None and _is_us_release(claim):
            place, date = 'Q30', claim_date
    if date is None:
        date = _first_value(claims, 'P577', 'time')
        place = 'Unknown' if date is not None else None
    return {'tconst': imdb_id, 'publicy_place': place, 'publicy_date': date}


def _write_record(out_file, record):
    """Append one record to a binary output file as a UTF-8 encoded JSON line."""
    out_file.write((json.dumps(record) + '\n').encode('utf-8'))


print("Starting")

# n / m count the matched people (nconst) and titles (tconst)
n, m = 0, 0
# Do not enforce encoding here since the input encoding is correct
with bz2.open(path_to_out_nconst, 'wb') as nconst_file, \
        bz2.open(path_to_out_tconst, 'wb') as tconst_file, \
        gzip.open(path_to_file, 'rb') as s_file:
    for raw_line in s_file:
        entity = _parse_dump_line(raw_line)
        # Entities without a "labels" entry are skipped
        if entity is None or entity.get('labels') is None:
            continue
        claims = entity.get('claims')
        if not isinstance(claims, dict):
            continue

        # P345 is the IMDb ID (an external identifier, i.e. a plain string)
        imdb_id = _first_value(claims, 'P345')
        if not isinstance(imdb_id, str):
            continue

        if imdb_id in nconst_ids:
            n += 1
            _write_record(nconst_file, _person_record(imdb_id, claims))
        elif imdb_id in tconst_ids:
            m += 1
            _write_record(tconst_file, _release_record(imdb_id, claims))


print("Finish")