In [1]:
import bz2
import json
import numpy as np
import pandas as pd
import spacy
import pickle
from pathlib import Path
from collections import defaultdict
from datetime import datetime

from IPython.display import display, HTML

from ressources import config

In [2]:
# Data locations used throughout the notebook, taken from the project's
# `ressources.config` module.
RAW_DATA_FOLDER, GENERATED_DATA_FOLDER = (
    config.RAW_DATA_FOLDER,
    config.GENERATED_DATA_FOLDER,
)

## WIKIDATA Dataset
Exploration and filtering of the Wikidata dataset.

In [3]:
# Load the Wikidata parquet data. Only the columns used in this notebook are
# requested, so the remaining ones are never read into memory.
WIKI_COLUMNS = ['id', 'aliases', 'label', 'date_of_birth', 'gender']
wiki = pd.read_parquet(RAW_DATA_FOLDER / 'WIKIDATA', columns=WIKI_COLUMNS)
# `id` is expected to contain no duplicates; this prints True when that holds.
print(wiki['id'].is_unique)
print(wiki.shape)
wiki.sample(2)

True
(9055981, 5)


Unnamed: 0,id,aliases,label,date_of_birth,gender
2081222,Q95333820,,Andreas Loder,,[Q6581097]
5788828,Q4786824,,Archie Whyte,[+1919-07-17T00:00:00Z],[Q6581097]


In [4]:
# Preview two more randomly chosen rows.
wiki.sample(2)

Unnamed: 0,id,aliases,label,date_of_birth,gender
3166473,Q92947683,,Umanga De Silva,,
6673722,Q98613747,,I Cori Baill,,


In [5]:
# Function to get rid of list in a column

def try_join(cell):
    """Flatten a list-like cell into a single space-separated string.

    Parameters
    ----------
    cell : iterable, str, or any other object
        The value stored in one DataFrame cell.

    Returns
    -------
    str or float
        * a ``str`` input is returned unchanged;
        * any other iterable has its items converted with ``str`` and joined
          with single spaces (an empty iterable gives ``''``);
        * anything that cannot be iterated (``None``, a float NaN, ...) gives
          ``np.nan``.
    """
    if isinstance(cell, str):
        # A string is itself iterable: joining it would insert a space between
        # every character ('abc' -> 'a b c'). Returning it untouched keeps the
        # function idempotent, so re-running a conversion cell on an already
        # flattened column does not corrupt it.
        return cell
    try:
        return ' '.join(map(str, cell))
    except TypeError:
        # Raised by map() when `cell` is not iterable.
        return np.nan

In [6]:
# Flatten the list-valued `gender` column into a plain string column.
wiki['genderlabel'] = wiki['gender'].map(try_join)

In [7]:
# Translate the two known gender identifiers into readable labels; the
# defaultdict's factory supplies 'other' for every key it does not contain.
gender_dict = defaultdict(
    lambda: 'other',
    {"Q6581097": 'male', "Q6581072": 'female'},
)
wiki['genderlabel'] = wiki['genderlabel'].map(gender_dict)

In [8]:
# Sanity check: the only labels present should be 'male', 'female' and 'other'.
wiki['genderlabel'].value_counts()

male      5418464
other     1953347
female    1684170
Name: genderlabel, dtype: int64

In [9]:
# Flatten the list-valued `date_of_birth` column into plain strings.
wiki['date_of_birth'] = wiki['date_of_birth'].map(try_join)

In [10]:
# TODO: parse the timestamp to keep only the date, then compute the speaker's age.
# Peek at the first raw timestamp strings.
wiki['date_of_birth'].head(5)

0    +1732-02-22T00:00:00Z
1    +1952-03-11T00:00:00Z
2    +1868-08-23T00:00:00Z
3    +1946-07-06T00:00:00Z
4    +1599-06-06T00:00:00Z
Name: date_of_birth, dtype: object

In [11]:
def parse_date(date):
    """Convert a ``'+YYYY-MM-DDTHH:MM:SSZ'`` timestamp string to ``'DD.MM.YYYY'``.

    Only the first 21 characters (one timestamp) are parsed.

    Parameters
    ----------
    date : str or any other object
        Raw timestamp string; anything that is not a string counts as missing.

    Returns
    -------
    str or float
        The formatted date, or ``np.nan`` when the value is missing, empty,
        has an unexpected length, equals ``'None'`` or starts with ``'-'``.
        Special cases:

        * ``'+0000-00-00T00:00:00Z'`` gives ``'01.01.0001'``;
        * month and day ``00`` give 1 January of that year;
        * day ``00`` gives the first day of that month;
        * ``'+1939-02-29T00:00:00Z'`` gives ``'28.02.1939'``;
        * a day that does not exist in its month (e.g. 31 April) gives the
          first day of that month.

    Raises
    ------
    ValueError
        If the timestamp cannot be parsed for any other reason; the offending
        value is printed first.
    """
    stamp_format = '+%Y-%m-%dT%H:%M:%SZ'
    try:
        if not isinstance(date, str) or not date:
            # Non-strings are missing values. The empty string (what joining an
            # empty list yields) must also stop here: it would pass the length
            # check below and then fail on date[0] with an IndexError.
            return np.nan
        elif len(date) % 21 != 0:
            # NOTE(review): several timestamps joined with a space have length
            # 21*k + (k-1), so they are rejected here even though the branches
            # below only look at the first 21 characters. Confirm whether
            # multi-date entries are meant to be dropped.
            return np.nan
        elif date == 'None' or date[0] == '-':
            return np.nan
        elif '+0000-00-00T00:00:00Z' in date[0:21]:
            return '01.01.0001'
        elif '-00-00T00:00:00' in date[0:21]:
            return datetime.strptime(date[0:21], '+%Y-00-00T00:00:00Z').strftime('%d.%m.%Y')
        elif '-00T00:00:00' in date[0:21]:
            return datetime.strptime(date[0:21], '+%Y-%m-00T00:00:00Z').strftime('%d.%m.%Y')
        elif '+1939-02-29T00:00:00Z' in date:
            return '28.02.1939'
        else:
            return datetime.strptime(date[0:21], stamp_format).strftime('%d.%m.%Y')
    except ValueError:
        first = date[0:21]
        # Detect "day does not exist in this month" structurally instead of
        # comparing the exception message, whose wording is not a stable API.
        # Only days 29-31 can be syntactically valid yet out of range; if the
        # timestamp parses once the day is replaced by 01, the day was the only
        # problem and we fall back to the first day of the month.
        if first[9:11] in ('29', '30', '31'):
            try:
                datetime.strptime(first[0:9] + '01' + first[11:], stamp_format)
            except ValueError:
                pass
            else:
                return datetime.strptime(date[0:8], '+%Y-%m').strftime('%d.%m.%Y')
        print(date)
        raise

In [12]:
# Standardise every raw birth timestamp into a 'DD.MM.YYYY' string (NaN when unusable).
wiki['dob_std'] = wiki['date_of_birth'].apply(parse_date)

In [13]:
# Check the first standardised dates.
wiki['dob_std'].head()

0    22.02.1732
1    11.03.1952
2    23.08.1868
3    06.07.1946
4    06.06.1599
Name: dob_std, dtype: object

In [14]:
# Number of rows for which no standardised birth date is available.
wiki['dob_std'].isna().sum()

4149336

In [16]:
#wiki['dob_std_auto'] = pd.to_datetime(wiki['date_of_birth'], errors = 'coerce', infer_datetime_format = True, exact = False)

In [17]:
#wiki['dob_std_auto'].isna().sum()

9055981

In [19]:
# `dob_std_auto` is only created by the pd.to_datetime experiment, which is
# currently commented out. Guard the lookup so a fresh Restart & Run All does
# not stop here with a KeyError.
if 'dob_std_auto' in wiki.columns:
    display(wiki['dob_std_auto'].value_counts())

Series([], Name: dob_std_auto, dtype: int64)

In [25]:
# Keep only the cleaned columns: the raw list-valued `gender` and the raw
# `date_of_birth` strings are superseded by `genderlabel` and `dob_std`
# (the column list recorded for this cell contains neither raw column).
# errors='ignore' keeps the cell re-runnable once the columns are already gone,
# where an in-place drop would raise KeyError on the second run.
wiki = wiki.drop(columns=['gender', 'date_of_birth'], errors='ignore')
list(wiki)

['id', 'aliases', 'label', 'genderlabel', 'dob_std']

In [26]:
# Display the cleaned frame (pandas truncates the rendering of long frames).
wiki

Unnamed: 0,id,aliases,label,genderlabel,dob_std
0,Q23,"[Washington, President Washington, G. Washingt...",George Washington,male,22.02.1732
1,Q42,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",Douglas Adams,male,11.03.1952
2,Q1868,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",Paul Otlet,male,23.08.1868
3,Q207,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",George W. Bush,male,06.07.1946
4,Q297,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",Diego Velázquez,male,06.06.1599
...,...,...,...,...,...
9055976,Q106406560,[Barker Howard],Barker B. Howard,male,
9055977,Q106406571,[Charles Macomber],Charles H. Macomber,male,
9055978,Q106406588,,Dina David,female,01.04.1848
9055979,Q106406593,,Irma Dexinger,female,18.03.1899


In [35]:
# Persist the cleaned frame so later steps can reload it without redoing the
# cleaning above.
wikidata_pickle_path = '../generated/WIKIDATA/df_wikidata_final.pickle'
wiki.to_pickle(wikidata_pickle_path)

In [None]:
#with open('../generated/WIKIDATA/df_wikidata_final.pickle', 'wb') as f: 
    #pickle.dump(wiki, f)

In [36]:
# Reload the frame written by `to_pickle`. pd.read_pickle deserialises straight
# from the file, whereas pickle.loads(f.read()) first holds the entire byte
# string in memory next to the resulting frame.
# NOTE: unpickling executes arbitrary code; only load pickle files that were
# produced by this project.
df_wikidata = pd.read_pickle('../generated/WIKIDATA/df_wikidata_final.pickle')

In [37]:
# Display the reloaded frame to confirm the pickle round-trip.
df_wikidata

Unnamed: 0,id,aliases,label,genderlabel,dob_std
0,Q23,"[Washington, President Washington, G. Washingt...",George Washington,male,22.02.1732
1,Q42,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",Douglas Adams,male,11.03.1952
2,Q1868,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",Paul Otlet,male,23.08.1868
3,Q207,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",George W. Bush,male,06.07.1946
4,Q297,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",Diego Velázquez,male,06.06.1599
...,...,...,...,...,...
9055976,Q106406560,[Barker Howard],Barker B. Howard,male,
9055977,Q106406571,[Charles Macomber],Charles H. Macomber,male,
9055978,Q106406588,,Dina David,female,01.04.1848
9055979,Q106406593,,Irma Dexinger,female,18.03.1899
