In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Display configuration: show every dataframe column and never truncate cell values.
# https://stackoverflow.com/questions/47022070/display-all-dataframe-columns-in-a-jupyter-python-notebook
# https://stackoverflow.com/a/51540918
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.width', 1000)

# None removes the limit on the number of columns shown
pd.options.display.max_columns = None
# keep wide frames on a single block instead of wrapping them across several
pd.options.display.expand_frame_repr = False
# None removes the limit on the characters shown per cell
pd.options.display.max_colwidth = None

## Read Data

In [3]:
# list the files available in the data directory (nothing is read into memory here)
!ls ../data/notable_people_cross-verified/

citation.ris                        wiki_notable_people_cleaned.csv
cross-verified-database.csv         wiki_notable_people_cleaned_iso.csv
cross-verified-database.csv.gz      wiki_notable_people_reduced.csv


In [4]:
# load the cleaned notable-people CSV into the `wiki` dataframe
wiki = pd.read_csv(
    filepath_or_buffer="../data/notable_people_cross-verified/wiki_notable_people_cleaned.csv"
)

In [5]:
# preview the first five rows of the loaded data
wiki.head(n=5)

Unnamed: 0,wikidata_code,birth,death,gender,occup_l1,name,un_subregion,bigperiod_birth,bigperiod_death,curid,occup_l2,occup_l3,avg_no_readers_2015_2018,non_missing_score,total_count_words,no_wiki_editions,no_external_links,notability_index_sum,notability_index_ranking,citizenship,un_region,group_wikipedia_editions,birth_place_lon,death_place_lon,birth_place_lat,death_place_lat,occup_l3_all
0,Q1000002,1932.0,1990.0,Male,Culture,Claus Hammel,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2949539,Culture-core,playwright,1669,3,1777,1,11,18.083672,1058542.0,Germany,Europe,grB,11.833333,12.42,53.416668,54.38139,D:_playwright_journalist_writer_screenwriter_P:_ drama_dramatiker_German
1,Q1000005,1860.0,1927.0,Male,Culture,Karel Matěj Čapek-Chod,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,4217319,Culture-core,writer,25008,3,6491,9,15,23.98061,131428.0,Czech Republic,Europe,grA,12.929798,14.421389,49.440605,50.087502,D:_writer_journalist_P:_naturalist_writer_journalist_English_ scrittore_Italian_ schriftsteller_journalist_vertreter_German_ författare_författare_Swedish
2,Q1000006,1971.0,,Male,Culture,Florian Eichinger,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,5050967,Culture-core,film,27285,3,1573,1,10,20.666656,775768.0,Germany,Europe,grB,9.191944,,48.897499,,D:_film_screenwriter_film_P:_regisseur_autor_film_German
3,Q1000015,1983.0,,Male,Culture,Florian Jahr,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2588583,Culture-core,actor,37331,3,1931,1,10,21.18504,691735.0,Germany,Europe,grB,13.383333,,52.516666,,D:_actor_P:_schauspiel_German
4,Q1000023,1912.0,1977.0,Female,Leadership,Wiltraut Rupp-von Brünneck,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,922120,Administration/Law,judge,2955,3,1578,1,6,17.99621,1103282.0,Germany,Europe,grB,13.35,8.4,52.4333,49.016666,D:_judge_jurist_P:_ richter_verfassung_German


## Add ISO codes for Countries of Citizenship using Country Converter Package

> [Link](https://pypi.org/project/country-converter/) to package

In [6]:
# import country converter package

import country_converter as coco

In [7]:
# smoke test: convert a single country name to its ISO3 code
coco.convert(names=["Germany"], to="ISO3")

'DEU'

In [8]:
# start with an empty list intended to hold the values of the new dataframe series
citizenship_alpha_3 = list()

In [9]:
# inspect the citizenship column that is the input for the ISO3 conversion
wiki.loc[:, "citizenship"]

0                 Germany
1          Czech Republic
2                 Germany
3                 Germany
4                 Germany
                ...      
2291812          Slovenia
2291813    Czech Republic
2291814            France
2291815         Argentina
2291816       Switzerland
Name: citizenship, Length: 2291817, dtype: object

In [10]:
# convert several country names in one call and print the resulting ISO3 codes
c = ["Czech Republic", "Germany"]
print(
    coco.convert(to="ISO3", names=c)
)

['CZE', 'DEU']


In [7]:
# testing functionality of package with smaller sample

# iso3 = [coco.convert(names=wiki["citizenship"].sample(50), to="ISO3")]

In [13]:
#iso3

### Creating new Column "citizenship_iso3" and adding ISO3 Code per Row

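> Very slow when the converter is run on every one of the ~2.3 million rows. A more performant approach is to convert only the distinct citizenship values and map the resulting codes back onto the column.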

In [6]:
# Add the "citizenship_iso3" column that the following cells display.
# Each distinct citizenship is converted once and the codes are mapped back
# onto the rows, rather than running the converter on every row.
# previous (slow) version:
# wiki["citizenship_iso3"] = coco.convert(names=wiki["citizenship"], to="ISO3", not_found=None)
unique_citizenships = wiki["citizenship"].dropna().unique().tolist()

iso3_by_citizenship = {}
if unique_citizenships:
    # not_found=None keeps the input name where no ISO3 code can be matched
    unique_iso3 = coco.convert(names=unique_citizenships, to="ISO3", not_found=None)
    if len(unique_citizenships) == 1:
        # coco.convert unwraps the result when it is given exactly one name
        unique_iso3 = [unique_iso3]
    iso3_by_citizenship = dict(zip(unique_citizenships, unique_iso3))

# rows with a missing citizenship stay missing in the new column
wiki["citizenship_iso3"] = wiki["citizenship"].map(iso3_by_citizenship)

In [15]:
# preview the first five rows again to check the dataframe after the ISO3 step
wiki.head(n=5)

Unnamed: 0,wikidata_code,birth,death,gender,occup_l1,name,un_subregion,bigperiod_birth,bigperiod_death,curid,occup_l2,occup_l3,avg_no_readers_2015_2018,non_missing_score,total_count_words,no_wiki_editions,no_external_links,notability_index_sum,notability_index_ranking,citizenship,un_region,group_wikipedia_editions,birth_place_lon,death_place_lon,birth_place_lat,death_place_lat,occup_l3_all,citizenship_iso3
0,Q1000002,1932.0,1990.0,Male,Culture,Claus Hammel,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2949539,Culture-core,playwright,1669,3,1777,1,11,18.083672,1058542.0,Germany,Europe,grB,11.833333,12.42,53.416668,54.38139,D:_playwright_journalist_writer_screenwriter_P:_ drama_dramatiker_German,DEU
1,Q1000005,1860.0,1927.0,Male,Culture,Karel Matěj Čapek-Chod,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,4217319,Culture-core,writer,25008,3,6491,9,15,23.98061,131428.0,Czech Republic,Europe,grA,12.929798,14.421389,49.440605,50.087502,D:_writer_journalist_P:_naturalist_writer_journalist_English_ scrittore_Italian_ schriftsteller_journalist_vertreter_German_ författare_författare_Swedish,CZE
2,Q1000006,1971.0,,Male,Culture,Florian Eichinger,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,5050967,Culture-core,film,27285,3,1573,1,10,20.666656,775768.0,Germany,Europe,grB,9.191944,,48.897499,,D:_film_screenwriter_film_P:_regisseur_autor_film_German,DEU
3,Q1000015,1983.0,,Male,Culture,Florian Jahr,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2588583,Culture-core,actor,37331,3,1931,1,10,21.18504,691735.0,Germany,Europe,grB,13.383333,,52.516666,,D:_actor_P:_schauspiel_German,DEU
4,Q1000023,1912.0,1977.0,Female,Leadership,Wiltraut Rupp-von Brünneck,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,922120,Administration/Law,judge,2955,3,1578,1,6,17.99621,1103282.0,Germany,Europe,grB,13.35,8.4,52.4333,49.016666,D:_judge_jurist_P:_ richter_verfassung_German,DEU


In [16]:
# Write new file including ISO3 codes
# NOTE(review): the export is left commented out, presumably so a re-run does not
# overwrite the existing *_iso.csv file — confirm before enabling.
# NOTE(review): DataFrame.to_csv writes the row index as an extra leading column
# unless index=False is passed.

# wiki.to_csv("../data/notable_people_cross-verified/wiki_notable_people_cleaned_iso.csv")