In [43]:
import sys

sys.path.append('../')

import pandas as pd

from dotenv import load_dotenv

load_dotenv()
import os

import sqlite3
import re
import seaborn as sns
from data_model import Individual
from sys_utils import load_model

In [44]:
# Data locations are read from the process environment (load_dotenv() is called
# in the import cell). Any variable that is not set comes back as None.
WIKIDATA_RAW_DATA, MANUAL_DATA, CHECKPOINT_PATH, DB_ENV = (
    os.getenv(env_key)
    for env_key in ("WIKIDATA_RAW_DATA", "MANUAL_DATA", "CHECKPOINT_PATH", "DB_PATH")
)

In [45]:
# Load the Individual records from the checkpoint directory.
individuals_checkpoint = CHECKPOINT_PATH + "/individuals.jsonl"
individuals = load_model(Individual, name=individuals_checkpoint)

In [46]:
# Open the SQLite database that the later cells write to through `conn`.
# NOTE(review): the path is hard-coded while DB_ENV (read from DB_PATH) is not
# used in the cells shown here - confirm which location is intended.
conn = sqlite3.connect('/cultura.db')
cursor = conn.cursor()

# List user tables only: SQLite's internal `sqlite_*` tables (for example
# sqlite_sequence) cannot be dropped and would abort the loop below.
cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
)
tables = cursor.fetchall()

# Start from an empty database. Table names cannot be bound as SQL parameters,
# so quote each one as an identifier (doubling any embedded double quote).
for (table_name,) in tables:
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"DROP TABLE IF EXISTS {quoted_name}")

# Persist the drops. The connection is deliberately left open: the following
# cells keep writing through `conn`.
conn.commit()

#### Move Individuals

In [47]:
# Display one loaded record to inspect the fields available on an Individual.
individuals[1]

Individual(id=RawIndividual(wikidata_id='Q6121397', name='Santos Domínguez y Benguria', birthyear=1841, gender=['male'], raw_nationalities=[RawNationality(wikidata_id='Q29', name='Spain', location='Point(-3.5 40.2)')], raw_birthcities=None, occupations=[Occupation(wikidata_id='Q42973', name='architect', category=['artist'])]), impact_years=(1850, 1900), cultural_score=0.10024749047065873, country=Country(name='Spain', code='es'), wikipedia_pages=[WikipediaPage(url='https://es.wikipedia.org/wiki/Santos_Dom%C3%ADnguez_y_Benguria', language='es', links_ext_count=5, links_out_count=25, links_in_count=9, author='Doctor seisdedos', author_editcount=2986, editors=16, minor_edits=9, revisions=26, pageviews=23, characters=1187, references=0, unique_references=0, words=199, created_at='2006-11-19')], regions=['re_western_europe', 're_spain'], identifiers=[ExternalID(wikidata_id='P213', name='ISNI'), ExternalID(wikidata_id='P214', name='VIAF ID'), ExternalID(wikidata_id='P244', name='Library of C

In [48]:
def _attr_list(items, attr):
    """Return `attr` of every element of `items`, or [] when `items` is None."""
    return [getattr(item, attr) for item in items] if items is not None else []


# Flatten every Individual into one record. List-valued columns stay as lists
# here; the per-table cells explode them later.
df_fil = [
    {
        "individual_wikidata_id": x.id.wikidata_id,
        "individual_name": x.id.name,
        "wikipedia_cultural_score": x.cultural_score,
        "gender": x.id.gender,
        "birthyear": x.id.birthyear,
        "individual_impact_years": x.impact_years,
        "nationality_wikidata_id": _attr_list(x.id.raw_nationalities, "wikidata_id"),
        "nationality_name": _attr_list(x.id.raw_nationalities, "name"),
        "nationality_location": _attr_list(x.id.raw_nationalities, "location"),
        "occupations_wikidata_id": _attr_list(x.id.occupations, "wikidata_id"),
        "birthcity_wikidata_id": _attr_list(x.id.raw_birthcities, "wikidata_id"),
        "birthcity_name": _attr_list(x.id.raw_birthcities, "name"),
        "birthcity_country_wikidata_id": _attr_list(x.id.raw_birthcities, "country_wikidata_id"),
        "birthcity_country_name": _attr_list(x.id.raw_birthcities, "country_name"),
        "birthcity_country_location": _attr_list(x.id.raw_birthcities, "country_location"),
        "country_code": x.country.code if x.country is not None else None,
        "country_name": x.country.name if x.country is not None else None,
        "region_code": list(x.regions) if x.regions is not None else [],
        "identifiers_wikidata_id": _attr_list(x.identifiers, "wikidata_id"),
        "wikipedia_page_url": _attr_list(x.wikipedia_pages, "url"),
    }
    for x in individuals
]
df_fil = pd.DataFrame(df_fil)

# Drop individuals whose name starts with "Q<digit>" (presumably entities whose
# label fell back to the raw Wikidata id - TODO confirm).
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
pattern = re.compile(r'^Q\d')
df_fil = df_fil[~df_fil['individual_name'].apply(lambda x: bool(pattern.match(x)))]

In [49]:
# Gender table: one row per distinct (individual, gender) pair.
gender_rows = (
    df_fil[['individual_wikidata_id', 'individual_name', 'gender']]
    .explode('gender')
    .dropna()
)
# Rows whose gender value starts with 'http' are discarded.
gender_is_url = gender_rows['gender'].str.startswith('http')
df_insert = gender_rows[~gender_is_url].drop_duplicates().reset_index(drop=True)
df_insert.to_sql(name='individual_gender', con=conn, if_exists='replace', index=False)

218085

In [50]:
# Nationality table: one row per (individual, nationality) pair, with the
# coordinates split out of the "Point(<first> <second>)" location string.
nationality_cols = ['nationality_wikidata_id', 'nationality_name', 'nationality_location']
df_insert = (
    df_fil[['individual_wikidata_id', 'individual_name'] + nationality_cols]
    .explode(nationality_cols)
    .dropna(subset=['nationality_wikidata_id'])
)

# The literal string 'nan' marks a missing location. Assign through .loc:
# chained indexing (df[col][mask] = ...) writes to a temporary object and is a
# silent no-op once pandas Copy-on-Write is active.
location_missing = df_insert['nationality_location'] == 'nan'
df_insert.loc[location_missing, 'nationality_location'] = None

# First number -> longitude, second -> latitude (same order as before).
# str.extract yields NaN for missing or unparseable locations instead of raising.
nationality_coords = (
    df_insert['nationality_location']
    .str.extract(r'\(([^ ]+) ([^)]*)\)')
    .astype(float)
)
df_insert['nationality_longitude'] = nationality_coords[0].to_numpy()
df_insert['nationality_latitude'] = nationality_coords[1].to_numpy()
df_insert.to_sql('individual_nationality', conn, if_exists='replace', index=False)

148543

In [51]:
# move birthcity

In [52]:
# Link table between individuals and their birth cities (one row per pair).
birthcity_link_cols = ['birthcity_wikidata_id', 'birthcity_name']
df_ind_birthcity = (
    df_fil[['individual_wikidata_id', 'individual_name'] + birthcity_link_cols]
    .explode(birthcity_link_cols)
    .dropna()
)
df_ind_birthcity.to_sql(name='individual_birthcity', con=conn, if_exists='replace', index=False)

147828

In [53]:
# add birthcity meta_data

In [54]:
# Birth-city metadata table: de-duplicated birth cities with their country and
# the country's coordinates.
cols = ['birthcity_wikidata_id', 'birthcity_name', 'birthcity_country_wikidata_id', 'birthcity_country_name', 'birthcity_country_location']
df_birthcity = (
    df_fil[cols]
    .explode(cols)
    .drop_duplicates()
    .dropna(subset=['birthcity_wikidata_id'])
)

# The literal string 'nan' marks a missing location. Assign through .loc:
# chained indexing (df[col][mask] = ...) is a silent no-op under Copy-on-Write.
country_location_missing = df_birthcity['birthcity_country_location'] == 'nan'
df_birthcity.loc[country_location_missing, 'birthcity_country_location'] = None

# First number -> longitude, second -> latitude (same order as before).
# str.extract yields NaN for missing or unparseable locations instead of raising.
country_coords = (
    df_birthcity['birthcity_country_location']
    .str.extract(r'\(([^ ]+) ([^)]*)\)')
    .astype(float)
)
df_birthcity['country_longitude'] = country_coords[0].to_numpy()
df_birthcity['country_latitude'] = country_coords[1].to_numpy()
df_birthcity.to_sql('birthcity', conn, if_exists='replace', index=False)

34072

In [55]:
# add identifiers

In [56]:
# One row per (individual, identifier property); individuals without
# identifiers are dropped.
df_insert = (
    df_fil[["individual_wikidata_id", 'individual_name', 'identifiers_wikidata_id']]
    .explode('identifiers_wikidata_id')
    .dropna()
    .reset_index(drop=True)
)

In [57]:
# Identifier metadata: one row per identifier property.
df_id = pd.read_csv(WIKIDATA_RAW_DATA + '/identifiers.csv')

# Keep only the id that follows 'entity/' in the property / country values.
df_id['property'] = df_id['property'].apply(lambda x: x.split('entity/')[1])
df_id['country'] = df_id['country'].apply(
    lambda x: x.split('entity/')[1] if (x is not None) and (not isinstance(x, float)) else None
)

# Values containing 'http' are not counts: blank them before the float cast.
# .astype(str) keeps the mask valid when the column holds non-string values, and
# .loc replaces the chained assignment that is a no-op under Copy-on-Write.
count_is_url = df_id['count_records'].astype(str).str.contains('http', regex=False)
df_id.loc[count_is_url, 'count_records'] = None
df_id['count_records'] = df_id['count_records'].astype(float)

df_id = df_id.rename(columns={
    'property': 'identifiers_wikidata_id',
    'country': 'country_wikidata_id',
    'source_url': 'identifier_url',
    'countryLabel': 'country_name',
    'propertyLabel': 'identifier_name',
})
df_id = df_id.drop_duplicates('identifiers_wikidata_id', keep='first')

# Display name: "<identifier> (<country>)" when a country is known, else "<identifier>".
df_id['identifier_name_country'] = df_id.fillna("").apply(
    lambda row: row['identifier_name'] + ' (' + row['country_name'] + ')'
    if row['country_name'] != '' else row['identifier_name'],
    axis=1,
)
df_id.to_sql('identifiers', conn, if_exists='replace', index=False)



8022

In [58]:
# add the rest of identifiers

In [59]:
import json

# Read the extra external-identifier dump. The encoding is explicit because
# JSON text is UTF-8, while open() otherwise uses the platform default encoding.
with open(WIKIDATA_RAW_DATA + '/external_identifiers_missing.json', 'r', encoding='utf-8') as f:
    data_rest = json.load(f)

In [60]:
# Keep only usable entries: skip both None and empty lists.
data_fil = [entry for entry in data_rest if entry is not None and entry != []]

In [61]:
# In each entry, element 0 is read as the individual and the remaining
# elements as that individual's identifier properties.
individual_wikidata_id = [x[0]['p']['value'] for x in data_fil]
individual_name = [x[0]['pLabel']['value'] for x in data_fil]

identifiers_wikidata_id = [[y['p']['value'] for y in x[1:]] for x in data_fil]
identifier_name = [[y['pLabel']['value'] for y in x[1:]] for x in data_fil]

df_rest = pd.DataFrame({
    'individual_wikidata_id': individual_wikidata_id,
    'individual_name': individual_name,
    'identifiers_wikidata_id': identifiers_wikidata_id,
    'identifier_name': identifier_name,
})
df_rest = df_rest.explode(['identifiers_wikidata_id', 'identifier_name'])
df_rest['individual_wikidata_id'] = df_rest['individual_wikidata_id'].apply(lambda x: x.split('/entity/')[1])
# Entries with no identifiers explode to NaN and are removed here.
df_rest = df_rest.dropna()
# Check the type before the substring test: the previous order
# ('entity' in x and x is not None) would raise before the None check could run.
df_rest['identifiers_wikidata_id'] = df_rest['identifiers_wikidata_id'].apply(
    lambda x: x.split('/entity/')[1] if isinstance(x, str) and '/entity/' in x else None
)
# Discard rows whose subject id starts with 'P' rather than 'Q'.
df_rest = df_rest[~df_rest['individual_wikidata_id'].str.startswith('P')]
df_rest = df_rest.drop(['identifier_name'], axis=1)

# Identifiers from the checkpoint plus the ones recovered from the extra dump.
final_insert = pd.concat([df_insert, df_rest])

In [62]:
# Attach the identifier name (default inner join, so identifiers missing from
# df_id are dropped) and store the de-duplicated link table.
identifier_names = df_id[['identifiers_wikidata_id', 'identifier_name']]
df_final_id = final_insert.merge(identifier_names, on='identifiers_wikidata_id').drop_duplicates()
df_final_id.to_sql(name='individual_identifiers', con=conn, if_exists='replace', index=False)

3003272

In [63]:
# identifier_score = number of identifier rows recorded for each individual.
df_count_id = (
    df_final_id
    .groupby(['individual_wikidata_id', 'individual_name'])['identifiers_wikidata_id']
    .count()
    .reset_index(name='identifier_score')
)

#### Add Wikipedia Page

In [64]:
# Output column -> WikipediaPage attribute. Insertion order is the column order
# of the resulting frame.
wikipedia_page_fields = {
    "wikipedia_page_url": "url",
    "language": "language",
    "links_ext_count": "links_ext_count",
    "links_out_count": "links_out_count",
    "links_in_count": "links_in_count",
    "author": "author",
    "author_editcount": "author_editcount",
    "editors": "editors",
    "minor_edits": "minor_edits",
    "revisions": "revisions",
    "pageviews": "pageviews",
    "characters": "characters",
    "references": "references",
    "unique_references": "unique_references",
    "words": "words",
    "created_at": "created_at",
}

# One record per individual; every page attribute becomes a list with one
# element per Wikipedia page (an empty list when the individual has no pages).
wikipedia_records = []
for x in individuals:
    pages = x.wikipedia_pages if x.wikipedia_pages is not None else []
    record = {
        "individual_wikidata_id": x.id.wikidata_id,
        "individual_name": x.id.name,
    }
    for column, attr in wikipedia_page_fields.items():
        record[column] = [getattr(page, attr) for page in pages]
    wikipedia_records.append(record)

df_wikipedia = pd.DataFrame(wikipedia_records)

# Drop individuals whose name starts with "Q<digit>".
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
pattern = re.compile(r'^Q\d')
df_wikipedia = df_wikipedia[~df_wikipedia['individual_name'].apply(lambda x: bool(pattern.match(x)))]

In [65]:
# One row per (individual, Wikipedia page): explode all per-page columns together.
wikipedia_page_columns = [
    'wikipedia_page_url',
    'language', 'links_ext_count', 'links_out_count', 'links_in_count',
    'author', 'author_editcount', 'editors', 'minor_edits', 'revisions',
    'pageviews', 'characters', 'references', 'unique_references', 'words',
    'created_at',
]
df_wikipedia = df_wikipedia.explode(wikipedia_page_columns)

df_wikipedia = df_wikipedia.dropna(subset=['wikipedia_page_url'])
# regex=False: match the literal text 'wikipedia.org'. With the default regex
# matching, '.' would accept any character.
df_wikipedia = df_wikipedia[df_wikipedia['wikipedia_page_url'].str.contains('wikipedia.org', regex=False)]
df_wikipedia = df_wikipedia.rename(columns={'language': 'language_code'})

# Join the manually maintained language table on the language code (default
# inner join: pages whose code is not in the table are dropped).
df_wiki_language = pd.read_csv(MANUAL_DATA + '/Wikipedia_language - code.csv')
df_wiki_language = df_wiki_language.rename(columns={'code': 'language_code'})

df_wikipedia = pd.merge(df_wikipedia, df_wiki_language, on='language_code')

df_wikipedia = df_wikipedia.sort_values('individual_wikidata_id')
df_wikipedia.to_sql('individual_wikipedia', conn, if_exists='replace', index=False)

699019

#### Occupation

In [66]:
# Display the first record to see how occupations are nested under Individual.id.
individuals[0]

Individual(id=RawIndividual(wikidata_id='Q77297362', name='Jean Jacques Morgan', birthyear=1756, gender=['male'], raw_nationalities=[RawNationality(wikidata_id='Q142', name='France', location='Point(2.0 47.0)')], raw_birthcities=None, occupations=[Occupation(wikidata_id='Q1281618', name='sculptor', category=['artist'])]), impact_years=(1770, 1820), cultural_score=None, country=Country(name='France', code='fr'), wikipedia_pages=None, regions=['re_northwestern_europe', 're_western_europe', 're_france'], identifiers=None)

In [67]:
# One record per individual, with parallel lists describing their occupations.
df_occupations = pd.DataFrame([
    {
        "individual_wikidata_id": x.id.wikidata_id,
        "individual_name": x.id.name,
        "occupations_wikidata_id": [y.wikidata_id for y in x.id.occupations] if x.id.occupations is not None else [],
        "occupations_name": [y.name for y in x.id.occupations] if x.id.occupations is not None else [],
        "occupations_category": [y.category for y in x.id.occupations] if x.id.occupations is not None else [],
    }
    for x in individuals
])

# Raw string: '\d' in a plain string literal is an invalid escape sequence.
pattern = re.compile(r'^Q\d')
df_occupations = df_occupations[~df_occupations['individual_name'].apply(lambda x: bool(pattern.match(x)))]
df_occupations = df_occupations.explode(['occupations_wikidata_id', 'occupations_name', 'occupations_category'])

# An individual with no occupations explodes to a row of NaN, which would make
# the join and the regex match below raise. Drop those rows first.
df_occupations = df_occupations.dropna(subset=['occupations_wikidata_id'])

# Each occupation's category is a list; store it as one "|"-separated string.
# A category that is not a list (presumably missing - TODO confirm) becomes None.
df_occupations['occupations_category'] = df_occupations['occupations_category'].apply(
    lambda x: "|".join(x) if isinstance(x, (list, tuple)) else None
)
# Drop occupations whose name starts with "Q<digit>".
df_occupations = df_occupations[~df_occupations['occupations_name'].apply(lambda x: bool(pattern.match(x)))]
df_occupations = df_occupations.drop_duplicates().reset_index(drop=True)
df_occupations.to_sql('individual_occupations', conn, if_exists='replace', index=False)

319563

#### Main Individuals information

In [68]:
# Scalar per-individual attributes, plus the identifier score computed earlier.
main_info_columns = [
    'individual_wikidata_id', 'individual_name', 'birthyear',
    'individual_impact_years', 'country_code', 'country_name',
    'wikipedia_cultural_score',
]
df_ind = df_fil[main_info_columns].copy()
# Store the (start, end) impact-years pair as the text "<start>-<end>".
df_ind['individual_impact_years'] = df_ind['individual_impact_years'].apply(
    lambda years: str(years[0]) + '-' + str(years[1])
)
# Outer join: individuals present on only one side are kept.
df_ind = df_ind.merge(df_count_id, on=['individual_wikidata_id', 'individual_name'], how='outer')

In [69]:
# Write the main per-individual table, replacing any previous version.
df_ind.to_sql(name='individuals_main_information', con=conn, if_exists='replace', index=False)

220770

In [70]:
# Subset of individuals that have an identifier score.
has_identifier_score = df_ind['identifier_score'].notna()
test = df_ind[has_identifier_score]

In [71]:
# Show 10 random rows as a sanity check. The earlier sort_values() was removed:
# sample() draws rows at random, so sorting the whole frame first had no effect
# on which rows are shown.
# NOTE(review): if the top scorers were intended, use
# df_ind.nlargest(10, 'identifier_score') instead.
df_ind.sample(10)

Unnamed: 0,individual_wikidata_id,individual_name,birthyear,individual_impact_years,country_code,country_name,wikipedia_cultural_score,identifier_score
43914,Q11668738,Kōzan Takai,1806.0,1820-1870,jp,Japan,0.100411,11.0
10728,Q32227,Alfred Malherbe,1804.0,1810-1860,mu,Mauritius,0.104475,21.0
99309,Q17352254,Luc-Vincent Thiéry,1734.0,1740-1790,fr,France,0.100114,15.0
168739,Q55281046,Alessandro Romani,1800.0,1810-1860,it,Italy,0.100245,10.0
220331,Q15821313,Johann Ketzmann,1487.0,1500-1550,de,Germany,0.100143,7.0
17265,Q96364395,Urs Füeg,1671.0,1680-1730,ch,Switzerland,,8.0
115248,Q50842915,Ignác Klimkovič,1800.0,1810-1860,sk,Slovakia,0.100262,6.0
125957,Q102234,Balthasar Bebel,1632.0,1640-1690,fr,France,0.100526,24.0
37654,Q2073203,Annibale Stabile,1535.0,1550-1600,it,Italy,0.103332,23.0
20666,Q28790249,Alexis-Hervé-Jacques de Rougé,1841.0,1850-1900,,,,9.0


#### Add Individual region_code

In [72]:
from data_model_region import Region

In [74]:
# Load the Region records and keep a code -> name lookup frame.
regions = load_model(Region, name=CHECKPOINT_PATH + "/regions.jsonl")

df_regions = pd.DataFrame(
    [{"region_code": region.code, "region_name": region.name} for region in regions]
)

In [75]:
# One row per (individual, region), enriched with the region name. The default
# inner join drops region codes that are absent from df_regions.
df_ind_region = (
    df_fil[['individual_wikidata_id', 'individual_name', 'region_code']]
    .copy()
    .explode('region_code')
    .dropna()
    .merge(df_regions, on='region_code')
)

In [76]:
# Write the individual <-> region link table, replacing any previous version.
df_ind_region.to_sql(name='individuals_regions', con=conn, if_exists='replace', index=False)

571305

#### Add notable work

In [77]:
import json

# Read the notable-work dump. The encoding is explicit because JSON text is
# UTF-8, while open() otherwise uses the platform default encoding.
with open(WIKIDATA_RAW_DATA + '/notable_work.json', 'r', encoding='utf-8') as f:
    data_notable = json.load(f)

# Empty results are skipped here; None entries are handled by the next cell.
data_fil = [x for x in data_notable if x != []]

In [78]:
# Treat a None entry as "no rows" once, instead of guarding every comprehension.
notable_rows = [rows if rows is not None else [] for rows in data_fil]

# Required fields are indexed directly; optional ones fall back to None.
wikidata_indi = [[row['subject']['value'] for row in rows] for rows in notable_rows]
wikidata_indi_label = [[row['subjectLabel']['value'] for row in rows] for rows in notable_rows]
wikidata_work = [[row['work']['value'] for row in rows] for rows in notable_rows]
wikidata_work_label = [[row['workLabel']['value'] for row in rows] for rows in notable_rows]
instance_label = [[row.get('instanceLabel', {}).get('value') for row in rows] for rows in notable_rows]
inception = [[row.get('inception', {}).get('value') for row in rows] for rows in notable_rows]

In [79]:
# One row per (individual, notable work): every column holds parallel lists, so
# each column is exploded on its own.
df = pd.DataFrame({
    'individual_wikidata_id': wikidata_indi,
    'individual_name': wikidata_indi_label,
    'notable_work_wikidata_id': wikidata_work,
    'notable_work_name': wikidata_work_label,
    'instance_label': instance_label,
    'inception': inception,
})

df = df.apply(lambda col: col.explode())
df = df.dropna(subset=['individual_wikidata_id'])
# Keep only the id that follows '/entity/'.
df['individual_wikidata_id'] = df['individual_wikidata_id'].apply(lambda x: x.split('/entity/')[1])
df['notable_work_wikidata_id'] = df['notable_work_wikidata_id'].apply(lambda x: x.split('/entity/')[1] if 'entity' in x else None)

# Drop rows whose individual or work name starts with "Q<digit>".
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
pattern = re.compile(r'^Q\d')
df = df[~df['individual_name'].apply(lambda x: bool(pattern.match(x)))]
df = df[~df['notable_work_name'].apply(lambda x: bool(pattern.match(x)))]

In [80]:
def clean_date(raw_date):
    """Extract the year from the start of a date string.

    Args:
        raw_date: A date string beginning with a 4-digit year, optionally
            preceded by "-" (for example "1850-01-01T00:00:00Z" or
            "-0500-01-01T00:00:00Z"). Any other value is accepted and treated
            as missing.

    Returns:
        The year as an int (negative when the string starts with "-"), or None
        when `raw_date` is not a string or does not start with a number.
    """
    try:
        # A leading "-" takes one extra character in front of the 4-digit year.
        year_width = 5 if raw_date.startswith("-") else 4
        return int(raw_date[:year_width])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str (None, NaN, ...);
        # ValueError: the prefix is not a number.
        return None

In [81]:
# Work-level table: de-duplicated works with the year parsed from `inception`.
notable_work_columns = ['notable_work_wikidata_id', 'notable_work_name', 'instance_label', 'inception']
df_notable_work = df[notable_work_columns].drop_duplicates()
df_notable_work['year'] = df_notable_work['inception'].apply(clean_date)
df_notable_work.to_sql(name='notable_work', con=conn, if_exists='replace', index=False)

20238

#### Add authors and Creators

In [82]:
import json

# Read the creator/author dump. The encoding is explicit because JSON text is
# UTF-8, while open() otherwise uses the platform default encoding.
with open(WIKIDATA_RAW_DATA + '/work_as_creator_or_author.json', 'r', encoding='utf-8') as f:
    data_creator = json.load(f)

# Skip None and empty results up front, so the comprehensions below need no
# further None guards.
data_fil = [x for x in data_creator if x is not None and x != []]

# Required fields are indexed directly; optional ones fall back to None.
wikidata_indi = [[y['subject']['value'] for y in x] for x in data_fil]
wikidata_indi_label = [[y['subjectLabel']['value'] for y in x] for x in data_fil]
wikidata_object = [[y['object']['value'] for y in x] for x in data_fil]
wikidata_objectlabel = [[y['objectLabel']['value'] for y in x] for x in data_fil]
instance_label = [[y.get('instanceLabel', {}).get('value') for y in x] for x in data_fil]
instance = [[y.get('instance', {}).get('value') for y in x] for x in data_fil]
inception = [[y.get('inception', {}).get('value') for y in x] for x in data_fil]
publication_date = [[y.get('publication_date', {}).get('value') for y in x] for x in data_fil]

df_object = pd.DataFrame({
    'individual_wikidata_id': wikidata_indi,
    'individual_name': wikidata_indi_label,
    'work_wikidata_id': wikidata_object,
    'work_name': wikidata_objectlabel,
    'instance_label': instance_label,
    'instance_wikidata_id': instance,
    'inception': inception,
    'publication_date': publication_date,
})

# Every column holds parallel lists: explode each one to get one row per
# (individual, work) result.
df_object = df_object.apply(lambda col: col.explode())


In [83]:


# Reduce the entity values to bare ids and derive a creation year per work.
df_object = df_object.dropna(subset=['individual_wikidata_id'])
df_object = df_object.assign(
    individual_wikidata_id=df_object['individual_wikidata_id'].apply(
        lambda uri: uri.split('/entity/')[1]
    ),
    work_wikidata_id=df_object['work_wikidata_id'].apply(
        lambda uri: uri.split('/entity/')[1] if 'entity' in uri else None
    ),
    instance_wikidata_id=df_object['instance_wikidata_id'].apply(
        lambda uri: uri.split('/entity/')[1] if uri is not None else None
    ),
)
# Prefer the publication date; fall back to inception when it is None. The
# chosen raw date is then reduced to its year.
df_object['creation_year'] = df_object.apply(
    lambda row: row['inception'] if row['publication_date'] is None else row['publication_date'],
    axis=1,
)
df_object['creation_year'] = df_object['creation_year'].apply(clean_date)

In [84]:
# Work-level view: drop the individual columns and de-duplicate.
df_object_insert = df_object.drop(columns=['individual_wikidata_id', 'individual_name']).drop_duplicates()

In [85]:
# Link view: which individual created which work.
created_work_link_columns = ['individual_wikidata_id', 'individual_name', 'work_wikidata_id', 'work_name']
df_indi_object = df_object.loc[:, created_work_link_columns].drop_duplicates()

In [86]:
# Write the individual <-> created-work link table, replacing any previous version.
df_indi_object.to_sql(name='individual_created_work', con=conn, if_exists='replace', index=False)

778826

In [87]:
# Read the "instance of" dump for work instances. The encoding is explicit
# because JSON text is UTF-8, while open() otherwise uses the platform default.
with open(WIKIDATA_RAW_DATA + "/instance_of_work_instance.json", 'r', encoding='utf-8') as f:
    data_category = json.load(f)

# Skip None and empty results.
data_category = [x for x in data_category if x is not None and x != []]

In [88]:
# Treat a None entry as "no rows" once, instead of guarding every comprehension.
category_rows = [rows if rows is not None else [] for rows in data_category]

# Required fields are indexed directly; optional ones fall back to None.
work_wikidata_id = [[row['subject']['value'] for row in rows] for rows in category_rows]
work_label = [[row['subjectLabel']['value'] for row in rows] for rows in category_rows]
instance_label = [[row.get('instanceLabel', {}).get('value') for row in rows] for rows in category_rows]
instance = [[row.get('instance', {}).get('value') for row in rows] for rows in category_rows]

In [89]:

# Each subject of the dump is stored as an "instance", and its own "instance of"
# value as the "super instance".
df_category = pd.DataFrame({
    'instance_wikidata_id': work_wikidata_id,
    'instance_label': work_label,
    'super_instance_label': instance_label,
    'super_instance_wikidata_id': instance,
})

# Every column holds parallel lists: explode each one to get one row per result.
df_category = df_category.apply(lambda col: col.explode())
# The super instance is read with .get() upstream and may therefore be None, so
# guard the split instead of raising on a missing value.
df_category['super_instance_wikidata_id'] = df_category['super_instance_wikidata_id'].apply(
    lambda x: x.split('/entity/')[1] if isinstance(x, str) else None
)
df_category['instance_wikidata_id'] = df_category['instance_wikidata_id'].apply(lambda x: x.split('/entity/')[1] if 'entity' in x else None)

In [90]:
# Attach the super-instance of each work's instance. Outer join: rows present
# on only one side are kept.
df_object_insert = df_object_insert.merge(
    df_category,
    on=['instance_wikidata_id', 'instance_label'],
    how='outer',
)

In [91]:
# Instance label -> work category mapping: keep only rows that have a category.
df_instance_category = pd.read_csv(WIKIDATA_RAW_DATA + '/ENS - Clean Instance work - data.csv')
categorised = df_instance_category['work_category'].notna()
df_instance_category = (
    df_instance_category.loc[categorised, ['instance_label', 'work_category']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [92]:
# Attach the work category by instance label (outer join keeps unmatched rows).
df_object_insert_category = df_object_insert.merge(
    df_instance_category,
    on='instance_label',
    how='outer',
)

In [93]:
# Attach the aggregated year per work, then keep only rows that have a work name.
df_work_object_creation_year = pd.read_csv(
    WIKIDATA_RAW_DATA + '/work_aggregated_year.csv',
    index_col=[0],
)
df_object_insert_category = df_object_insert_category.merge(
    df_work_object_creation_year,
    on=['work_wikidata_id', 'work_name'],
    how='outer',
).dropna(subset=['work_name'])


In [94]:
# Write the work-level table, replacing any previous version.
df_object_insert_category.to_sql(name='created_work', con=conn, if_exists='replace', index=False)

980876

In [95]:
import glob
from tqdm import tqdm

In [96]:
# glob returns files in arbitrary, filesystem-dependent order. Sort them so the
# rows are loaded and written in the same order on every run.
paths = sorted(glob.glob(WIKIDATA_RAW_DATA + '/object_identifiers/*'))

In [97]:
# Load every identifier dump, keeping one cleaned list of results per file.
data_fil = []

for path in tqdm(paths):
    # The encoding is explicit because JSON text is UTF-8, while open()
    # otherwise uses the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        res = json.load(f)
    # Skip None and empty results.
    res = [x for x in res if x is not None and x != []]
    data_fil.append(res)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [01:01<00:00,  7.63s/it]


In [98]:
# Flatten the per-file lists and keep only entries with more than one element
# (the next cell reads element 0 as the subject and the rest as identifiers).
data_work_id = [
    entry
    for file_entries in data_fil
    for entry in file_entries
    if len(entry) > 1
]

In [99]:
# Split each entry into its first element (read as the subject) and the
# remaining elements (read as that subject's identifier properties).
subject_rows = [entry[0] for entry in data_work_id]
identifier_rows = [entry[1:] for entry in data_work_id]

individual_wikidata_id = [subject['p']['value'] for subject in subject_rows]
individual_name = [subject['pLabel']['value'] for subject in subject_rows]

identifiers_wikidata_id = [[prop['p']['value'] for prop in props] for props in identifier_rows]
identifier_name = [[prop['pLabel']['value'] for prop in props] for props in identifier_rows]

df_rest = pd.DataFrame(dict(
    individual_wikidata_id=individual_wikidata_id,
    individual_name=individual_name,
    identifiers_wikidata_id=identifiers_wikidata_id,
    identifier_name=identifier_name,
))

In [100]:
# One row per (subject, identifier property), reduced to bare ids.
df_rest = df_rest.explode(['identifiers_wikidata_id', 'identifier_name'])
df_rest['individual_wikidata_id'] = df_rest['individual_wikidata_id'].apply(lambda x: x.split('/entity/')[1])
df_rest = df_rest.dropna()
# Check the type before the substring test: the previous order
# ('entity' in x and x is not None) would raise before the None check could run.
df_rest['identifiers_wikidata_id'] = df_rest['identifiers_wikidata_id'].apply(
    lambda x: x.split('/entity/')[1] if isinstance(x, str) and '/entity/' in x else None
)
# Discard rows whose subject id starts with 'P' rather than 'Q'.
df_rest = df_rest[~df_rest['individual_wikidata_id'].str.startswith('P')]
df_rest = df_rest.drop(['identifier_name'], axis=1)

In [101]:
# Attach the display name of each identifier (default inner join, so
# identifiers missing from df_id are dropped).
identifier_display_names = df_id[['identifiers_wikidata_id', 'identifier_name_country']].drop_duplicates()
df_rest = df_rest.merge(identifier_display_names, on=['identifiers_wikidata_id'])

In [102]:
# Write the work <-> identifier table, replacing any previous version.
df_rest.to_sql(name='created_work_identifiers', con=conn, if_exists='replace', index=False)

604688

In [103]:
#df_rest.identifier_name_country.value_counts().head(20)

#### Add VIAF Id

In [104]:
import json

# Read the VIAF id dump. The encoding is explicit because JSON text is UTF-8,
# while open() otherwise uses the platform default encoding.
with open(WIKIDATA_RAW_DATA + '/notable_viaf_id.json', 'r', encoding='utf-8') as f:
    data_viaf = json.load(f)

# Empty results are skipped; None entries become empty lists below.
data_fil = [x for x in data_viaf if x != []]

wikidata_indi = [[y['subject']['value'] for y in x] if x is not None else [] for x in data_fil]
wikidata_indi_label = [[y['subjectLabel']['value'] for y in x] if x is not None else [] for x in data_fil]
viaf_id = [[y['viaf_id']['value'] for y in x] if x is not None else [] for x in data_fil]

df = pd.DataFrame({
    'individual_wikidata_id': wikidata_indi,
    'individual_name': wikidata_indi_label,
    'viaf_id': viaf_id,
})

# Every column holds parallel lists: explode each one to get one row per
# (individual, VIAF id) result.
df = df.apply(lambda col: col.explode())
df = df.dropna(subset=['individual_wikidata_id'])
df['individual_wikidata_id'] = df['individual_wikidata_id'].apply(lambda x: x.split('/entity/')[1])
df.to_sql('individual_viaf_id', conn, if_exists='replace', index=False)

178669

In [105]:
# add countries and continent information

In [106]:
# Country -> continent lookup: reduce both entity columns to the id that
# follows '/entity/', then give the label columns descriptive names.
df_country = pd.read_csv(WIKIDATA_RAW_DATA + '/countries_continent.csv')
for entity_column in ('country', 'continent'):
    df_country[entity_column] = df_country[entity_column].apply(
        lambda uri: uri.split('/entity/')[1]
    )
df_country = df_country.rename(
    columns={'countryLabel': 'country_name', 'continentLabel': 'continent_name'}
)

In [107]:
# Write the country/continent lookup table, replacing any previous version.
df_country.to_sql(name='country_continent', con=conn, if_exists='replace', index=False)

212