# Analyzing Wikipedia - Gender-Bias?

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Display settings: show every DataFrame column and the complete content of each
# cell (no truncated values).
# https://stackoverflow.com/questions/47022070/display-all-dataframe-columns-in-a-jupyter-python-notebook
pd.set_option("display.max_columns", None)

# https://stackoverflow.com/a/51540918
# Optional settings, currently disabled:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.width', 1000)
pd.set_option("display.expand_frame_repr", False)
# Use the full option key rather than the "max_colwidth" shorthand: pandas resolves
# shorthands by pattern matching, which can break if a similarly named option is added.
pd.set_option("display.max_colwidth", None)

## Data Source

- **Project**: [A cross-verified database of notable people, 3500BC-2018AD](https://www.nature.com/articles/s41597-022-01369-4)
    - Paper: [Shareable Link](https://rdcu.be/c6YvW)
- **Download**: [A Brief History of Human Time - Cross-verified Dataset](https://data.sciencespo.fr/dataset.xhtml?persistentId=doi:10.21410/7E4/RDAG3O)

Includes data until 2018.

## Read reduced Data

In [3]:
# list the contents of the dataset directory (IPython shell escape)
!ls ../data/notable_people_cross-verified/

citation.ris                        wiki_notable_people_cleaned.csv
cross-verified-database.csv         wiki_notable_people_cleaned_iso.csv
cross-verified-database.csv.gz      wiki_notable_people_reduced.csv


In [5]:
# Load the reduced notable-people dataset; the first CSV column is used as the index.
# UTF-8 is requested explicitly so that characters such as "ä", "ö" or "ß" are read correctly.
wiki = pd.read_csv(
    "../data/notable_people_cross-verified/wiki_notable_people_reduced.csv",
    index_col=0,
    encoding="utf-8",
)

In [6]:
# preview the first rows of the loaded frame
wiki.head()

Unnamed: 0_level_0,birth,death,gender,level1_main_occ,name,un_subregion,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,level3_main_occ,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,citizenship_1_b,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Q1000002,1932.0,1990.0,Male,Culture,Claus_Hammel,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2949539,Culture-core,playwright,1669,3,1777,1,11,18.083672,1058542.0,Germany,Europe,grB,11.833333,12.42,53.416668,54.38139,D:_playwright_journalist_writer_screenwriter_P:_ drama_dramatiker_German
Q1000005,1860.0,1927.0,Male,Culture,Karel_Matěj_Čapek-Chod,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,4217319,Culture-core,writer,25008,3,6491,9,15,23.98061,131428.0,Czech_Republic,Europe,grA,12.929798,14.421389,49.440605,50.087502,D:_writer_journalist_P:_naturalist_writer_journalist_English_ scrittore_Italian_ schriftsteller_journalist_vertreter_German_ författare_författare_Swedish
Q1000006,1971.0,,Male,Culture,Florian_Eichinger,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,5050967,Culture-core,film,27285,3,1573,1,10,20.666656,775768.0,Germany,Europe,grB,9.191944,,48.897499,,D:_film_screenwriter_film_P:_regisseur_autor_film_German
Q1000015,1983.0,,Male,Culture,Florian_Jahr,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2588583,Culture-core,actor,37331,3,1931,1,10,21.18504,691735.0,Germany,Europe,grB,13.383333,,52.516666,,D:_actor_P:_schauspiel_German
Q1000023,1912.0,1977.0,Female,Leadership,Wiltraut_Rupp-von_Brünneck,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,922120,Administration/Law,judge,2955,3,1578,1,6,17.99621,1103282.0,Germany,Europe,grB,13.35,8.4,52.4333,49.016666,D:_judge_jurist_P:_ richter_verfassung_German


In [7]:
# column names and dtypes as loaded (before any conversion or renaming)
wiki.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2291817 entries, Q1000002 to Q999999
Data columns (total 26 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   birth                     float64
 1   death                     float64
 2   gender                    object 
 3   level1_main_occ           object 
 4   name                      object 
 5   un_subregion              object 
 6   bigperiod_birth_graph_b   object 
 7   bigperiod_death_graph_b   object 
 8   curid                     int64  
 9   level2_main_occ           object 
 10  level3_main_occ           object 
 11  wiki_readers_2015_2018    int64  
 12  non_missing_score         int64  
 13  total_count_words_b       int64  
 14  number_wiki_editions      int64  
 15  total_noccur_links_b      int64  
 16  sum_visib_ln_5criteria    float64
 17  ranking_visib_5criteria   float64
 18  citizenship_1_b           object 
 19  un_region                 object 
 20  group_wikipedia_editio

In [19]:
# wiki.isnull().sum()

In [8]:
# Birth years are loaded as float64; store them in pandas' nullable integer dtype
# so missing years stay missing (<NA>) while the rest become whole numbers.
wiki["birth"] = wiki["birth"].astype(dtype="Int64")

In [9]:
# Same conversion for death years: float64 -> nullable integer ("Int64"),
# which keeps missing death years as <NA>.
wiki["death"] = wiki["death"].astype(dtype="Int64")

## First Data Checks based on Notability Index

In [27]:
# Ten rows with the smallest notability rank (rank 1.0 comes first).
# nsmallest returns the same rows, in the same order, as
# sort_values(by=...).head(10), but avoids sorting the whole ~2.3M-row frame.
wiki.nsmallest(10, "ranking_visib_5criteria")

Unnamed: 0_level_0,birth,death,gender,level1_main_occ,name,un_subregion,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,level3_main_occ,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,citizenship_1_b,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Q76,1961,,Male,Leadership,Barack_Obama,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,534366,Politics,politician,75101432,3,301113,229,117,40.958359,1.0,US,America,grA,-157.833466,,21.299843,,D:_politician_lawyer_writer_statesperson_P:_attorney_politician_senator_English_ audio_politico_Italian_politiker_German_político_senad_senador_Spanish_advogado_polític_ciência_Portuguese_advokat_politiker_politisk_Swedish
Q22686,1946,,Male,Leadership,Donald_Trump,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,4848272,Corporate/Executive/Business (large),magnate,218433920,3,326128,198,96,41.765057,2.0,US,America,grA,-73.816002,,40.7005,,D:_magnate_investor_restaurateur_writer_entrepreneur_politician_business_developer_game_entrepreneur_producer_film_writer_actor_actor_chief_executive_chief_executive_P:_politics_business_television_English_affaires_animateur_French_imprenditore_politico_Italian_entertainer_unternehmer_marketing_German_política_apolític_empresario_Spanish_empresário_polític_republican_Portuguese_republikan_politiker_affärsman_Swedish
Q762,1452,1519.0,Male,Discovery/Science,Leonardo_da_Vinci,Southern Europe,2.Post-Classical History 501-1500AD,3.Early Modern Period 1501-1750AD,18079,Academia,polymath,50310696,3,190553,212,116,40.014885,3.0,Italy,Europe,grA,10.938133,0.9825,43.799168,47.411388,D:_painter_engineer_astronomer_philosopher_anatomist_mathematician_sculptor_polymath_architect_engineer_diplomat_inventor_composer_poet_music_physicist_physicist_P:_polymath_invention_painting_English_peintre_artiste_artiste_French_ingegnere_pittore_scienziato_Italian_bildhauer_architekt_anatom_German_pintor_anatomía_anatomista_Spanish_polímata_ciência_cientista_Portuguese_konstnär_arkitekt_ingenjör_Swedish
Q352,1889,1945.0,Male,Leadership,Adolf_Hitler,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,2731583,Politics,politician,97209664,3,322673,205,91,40.926445,4.0,Austria,Europe,grA,13.033334,13.383333,48.258335,52.516666,D:_soldier_painter_writer_revolutionary_statesperson_P:_politician_revolutionary_nazi_English_nazi_impérial_antisémite_French_politico_dittatore_nazista_Italian_diktator_German_ nazi_político_militar_Spanish_polític_líder_nazista_Portuguese_politiker_ordförande_arbetare_Swedish
Q937,1879,1955.0,Male,Discovery/Science,Albert_Einstein,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,736,Academia,physicist,73712240,3,170244,202,123,40.294163,5.0,Germany,Europe,grA,9.99155,-74.656944,48.398411,40.352222,D:_physicist_philosopher_inventor_writer_education_academic_professor_physicist_writer_philosopher_writer_teacher_scientist_mathematician_author_P:_physicist_physics_quantum_English_physicien_théoricien_études_French_fisico_filosofo_Italian_bürgerrecht_German_físic_científic_Spanish_físic_teórico_física_Portuguese_ fysik_fysiker_kreativ_Swedish
Q2831,1958,2009.0,Male,Culture,Michael_Jackson,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,14995351,Culture-core,dancer,68214696,3,237174,235,91,40.400345,6.0,US,America,grA,-87.345558,-118.243683,41.580833,34.052231,D:_dancer_singer_business_philanthropist_film_screenwriter_poet_biographer_producer_choreographer_actor_boxing_actor_entrepreneur_music_P:_singer_songwriter_dancer_English_auteur_compositeur_interprète_French_cantante_compositore_ballerino_Italian_sänger_tänzer_songwriter_German_cantante_compositor_productor_Spanish_cantor_compositor_dança_Portuguese_sångare_dansare_låtskrivare_Swedish
Q8016,1874,1965.0,Male,Leadership,Winston_Churchill,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,33265,Politics,politician,49218684,3,279860,147,111,39.969543,7.0,United_Kingdom,Europe,grA,-1.361389,-0.182722,51.841946,51.500401,D:_politician_journalist_painter_historian_biographer_screenwriter_biographer_P:_politician_statesman_army_English_royaume_nazi_ministre_French_politico_storico_giornalista_Italian_autor_jiddisch_lizenz_German_político_estadista_escritor_Spanish_polític_conservador_estadista_Portuguese_konservativ_liberal_handel_Swedish
Q692,1564,1616.0,Male,Culture,William_Shakespeare,Western Europe,3.Early Modern Period 1501-1750AD,3.Early Modern Period 1501-1750AD,32897,Culture-core,playwright,46740508,3,161092,192,133,39.81039,8.0,United_Kingdom,Europe,grA,-1.706389,-1.706389,52.19278,52.19278,D:_playwright_poet_actor_writer_actor_drama_author_P:_ poet_playwright_actor_English_ poète_dramaturge_écrivain_French_ dramma_drammaturgo_poeta_Italian_ drama_dramatiker_lyriker_German_ drama_dramaturg_poeta_Spanish_ poeta_drama_dramaturgo_Portuguese_ drama_dramatiker_poet_Swedish
Q517,1769,1821.0,Male,Leadership,Napoleon,Western Europe,4.Mid Modern Period 1751-1900AD,4.Mid Modern Period 1751-1900AD,69880,Politics,politician,49868980,3,210001,189,90,39.737667,9.0,France,Europe,grA,8.736389,-5.683022,41.925556,-15.950222,D:_politician_statesperson_officer_militar_P:_statesman_militar_french revolution_English_empereur_enfant_militaire_French_politico_generale_Italian_revolutionär_diktator_kaiser_German_militar_gobernante_general_Spanish_líder_polític_revolução francesa_Portuguese_ huset_kejsare_Swedish
Q303,1935,1977.0,Male,Culture,Elvis_Presley,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,9288,Culture-core,actor,46304640,3,203282,152,97,39.488514,10.0,US,America,grA,-88.703392,-89.971107,34.25761,35.1175,D:_actor_singer_screenwriter_guitar_soldier_pianist_P:_singer_actor_rock_English_chanteur_acteur_roi_French_cantante_attore_musicista_Italian_sänger_musik_musiker_German_cantante_bajo_rey_Spanish_cantor_compositor_ator_Portuguese_ rock_sångare_musik_Swedish


In [28]:
# Ten rows with the highest log-sum notability score.
# nlargest returns the same rows, in the same order, as
# sort_values(by=..., ascending=False).head(10), but avoids sorting the whole ~2.3M-row frame.
wiki.nlargest(10, "sum_visib_ln_5criteria")

Unnamed: 0_level_0,birth,death,gender,level1_main_occ,name,un_subregion,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,level3_main_occ,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,citizenship_1_b,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Q22686,1946,,Male,Leadership,Donald_Trump,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,4848272,Corporate/Executive/Business (large),magnate,218433920,3,326128,198,96,41.765057,2.0,US,America,grA,-73.816002,,40.7005,,D:_magnate_investor_restaurateur_writer_entrepreneur_politician_business_developer_game_entrepreneur_producer_film_writer_actor_actor_chief_executive_chief_executive_P:_politics_business_television_English_affaires_animateur_French_imprenditore_politico_Italian_entertainer_unternehmer_marketing_German_política_apolític_empresario_Spanish_empresário_polític_republican_Portuguese_republikan_politiker_affärsman_Swedish
Q76,1961,,Male,Leadership,Barack_Obama,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,534366,Politics,politician,75101432,3,301113,229,117,40.958359,1.0,US,America,grA,-157.833466,,21.299843,,D:_politician_lawyer_writer_statesperson_P:_attorney_politician_senator_English_ audio_politico_Italian_politiker_German_político_senad_senador_Spanish_advogado_polític_ciência_Portuguese_advokat_politiker_politisk_Swedish
Q352,1889,1945.0,Male,Leadership,Adolf_Hitler,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,2731583,Politics,politician,97209664,3,322673,205,91,40.926445,4.0,Austria,Europe,grA,13.033334,13.383333,48.258335,52.516666,D:_soldier_painter_writer_revolutionary_statesperson_P:_politician_revolutionary_nazi_English_nazi_impérial_antisémite_French_politico_dittatore_nazista_Italian_diktator_German_ nazi_político_militar_Spanish_polític_líder_nazista_Portuguese_politiker_ordförande_arbetare_Swedish
Q2831,1958,2009.0,Male,Culture,Michael_Jackson,Northern America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,14995351,Culture-core,dancer,68214696,3,237174,235,91,40.400345,6.0,US,America,grA,-87.345558,-118.243683,41.580833,34.052231,D:_dancer_singer_business_philanthropist_film_screenwriter_poet_biographer_producer_choreographer_actor_boxing_actor_entrepreneur_music_P:_singer_songwriter_dancer_English_auteur_compositeur_interprète_French_cantante_compositore_ballerino_Italian_sänger_tänzer_songwriter_German_cantante_compositor_productor_Spanish_cantor_compositor_dança_Portuguese_sångare_dansare_låtskrivare_Swedish
Q937,1879,1955.0,Male,Discovery/Science,Albert_Einstein,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,736,Academia,physicist,73712240,3,170244,202,123,40.294163,5.0,Germany,Europe,grA,9.99155,-74.656944,48.398411,40.352222,D:_physicist_philosopher_inventor_writer_education_academic_professor_physicist_writer_philosopher_writer_teacher_scientist_mathematician_author_P:_physicist_physics_quantum_English_physicien_théoricien_études_French_fisico_filosofo_Italian_bürgerrecht_German_físic_científic_Spanish_físic_teórico_física_Portuguese_ fysik_fysiker_kreativ_Swedish
Q615,1987,,Male,Sports/Games,Lionel_Messi,South America,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2150841,Sports/Games,football,100513392,3,312419,134,59,40.077526,88.0,Argentina,America,grA,-60.639446,,-32.9575,,D:_football_P:_football_forward_player_English_football_French_calciatore_cittadina_centrocampista_Italian_fußball_German_futbolista_club_fútbol_Spanish_futebol_atacante_Portuguese_fotbollsspelare_Swedish
Q762,1452,1519.0,Male,Discovery/Science,Leonardo_da_Vinci,Southern Europe,2.Post-Classical History 501-1500AD,3.Early Modern Period 1501-1750AD,18079,Academia,polymath,50310696,3,190553,212,116,40.014885,3.0,Italy,Europe,grA,10.938133,0.9825,43.799168,47.411388,D:_painter_engineer_astronomer_philosopher_anatomist_mathematician_sculptor_polymath_architect_engineer_diplomat_inventor_composer_poet_music_physicist_physicist_P:_polymath_invention_painting_English_peintre_artiste_artiste_French_ingegnere_pittore_scienziato_Italian_bildhauer_architekt_anatom_German_pintor_anatomía_anatomista_Spanish_polímata_ciência_cientista_Portuguese_konstnär_arkitekt_ingenjör_Swedish
Q11571,1985,,Male,Sports/Games,Cristiano_Ronaldo,Southern Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,623737,Sports/Games,football,133777256,3,279612,124,50,40.012989,182.0,Portugal,Europe,grA,-16.916666,,32.650002,,D:_football_entrepreneur_model_P:_football_forward_club_English_football_French_calciatore_attaccante_football_Italian_fußball_German_infante_fútbol_futbolista_Spanish_futebol_remo_Portuguese_fotboll_fotbollsspelare_sport_Swedish
Q8016,1874,1965.0,Male,Leadership,Winston_Churchill,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,33265,Politics,politician,49218684,3,279860,147,111,39.969543,7.0,United_Kingdom,Europe,grA,-1.361389,-0.182722,51.841946,51.500401,D:_politician_journalist_painter_historian_biographer_screenwriter_biographer_P:_politician_statesman_army_English_royaume_nazi_ministre_French_politico_storico_giornalista_Italian_autor_jiddisch_lizenz_German_político_estadista_escritor_Spanish_polític_conservador_estadista_Portuguese_konservativ_liberal_handel_Swedish
Q692,1564,1616.0,Male,Culture,William_Shakespeare,Western Europe,3.Early Modern Period 1501-1750AD,3.Early Modern Period 1501-1750AD,32897,Culture-core,playwright,46740508,3,161092,192,133,39.81039,8.0,United_Kingdom,Europe,grA,-1.706389,-1.706389,52.19278,52.19278,D:_playwright_poet_actor_writer_actor_drama_author_P:_ poet_playwright_actor_English_ poète_dramaturge_écrivain_French_ dramma_drammaturgo_poeta_Italian_ drama_dramatiker_lyriker_German_ drama_dramaturg_poeta_Spanish_ poeta_drama_dramaturgo_Portuguese_ drama_dramatiker_poet_Swedish


In [14]:
# Missing values per column: flag every null cell, then add the flags up column by column.
wiki.isnull().sum(axis=0)

birth                        195919
death                       1244507
gender                         1398
level1_main_occ                   0
name                              0
un_subregion                  53567
bigperiod_birth_graph_b      112528
bigperiod_death_graph_b      112528
curid                             0
level2_main_occ                   0
level3_main_occ               15959
wiki_readers_2015_2018            0
non_missing_score                 0
total_count_words_b               0
number_wiki_editions              0
total_noccur_links_b              0
sum_visib_ln_5criteria            0
ranking_visib_5criteria           0
citizenship_1_b               53499
un_region                     53567
group_wikipedia_editions          0
bplo1                        587627
dplo1                       1614372
bpla1                        587627
dpla1                       1614372
level3_all_occ                    0
dtype: int64

In [8]:
#wiki.info()

## Rename Columns

### Notability Index
Two alternative ways to rank, computed from the same 5 variables below
- ranking_visib_5criteria --> **notability_index_ranking**
- sum_visib_ln_5criteria --> **notability_index_sum**
    - An alternative ranking based on the sum of the log of these variables plus one
    - Yields a slightly different ranking than **notability_index_ranking**

5 variables used to compute notability (with renaming in bold):
- wiki_readers_2015_2018 --> **avg_no_readers_2015_2018**: 
    - the average number of biography views (hits) for each individual between 2015 and 2018 in all available language editions, using an API available [here](https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews) 
    - or zero in the absence of a Wikipedia biography
- **non_missing_score** --> _name stays same_: the number of non-missing items retrieved from Wikipedia or Wikidata for birth date, gender and domain of influence
    - The intuition here is that the more notable the individual, the more documented his/her biographies will be
- total_count_words_b --> **total_count_words**: the length, i.e. the total number of words found in all available biographies
    - It is equal to zero for individuals with just one Wikidata entry and no biography in Wikipedia
- number_wiki_editions --> **no_wiki_editions**: the number of Wikipedia editions of each individual
- total_noccur_links_b --> **no_external_links**: the total number of external links (sources, references, etc.) from Wikidata

Source: https://www.nature.com/articles/s41597-022-01369-4#Sec2 (section "Measuring notability")

In [30]:
# Give the notability index and the variables it is computed from descriptive names.
# "non_missing_score" is intentionally absent from the mapping: it keeps its name.
wiki.rename(
    {
        "sum_visib_ln_5criteria": "notability_index_sum",
        "ranking_visib_5criteria": "notability_index_ranking",
        "wiki_readers_2015_2018": "avg_no_readers_2015_2018",
        "total_count_words_b": "total_count_words",
        "number_wiki_editions": "no_wiki_editions",
        "total_noccur_links_b": "no_external_links",
    },
    axis="columns",
    inplace=True,
)

### Occupation Columns

Categories:
- **Discovery/Science**:
    - Academia (Research, Historian, Physician, Scientist, Academic, etc.)
    - Explorer (Engineer, Explorer, Inventor, Sailor, Pioneer, etc.)
- **Culture**:
    - Core (Actor, Writer, Painter, Singer, Music, etc.)
    - Periphery (Journalist, Architect, Model, Designer, Presenter, etc.)
- **Leadership**:
    - Politics (Politician, Activist, Revolutionary, Trade unionist, Minister, etc.)
    - Military (Military, Officer, Commander, Soldier, Army, etc.)
    - Law (Lawyer, Diplomat, Judge, Jurist, Civil service)
    - Nobility (Aristocrat, Noble, King, Sovereign, Monarch, etc.)
    - Religious (Priest, Prelate, Rabbi, Missionary, Bishop, etc.)
    - Corporate Leadership (Business, Entrepreneur, Bank, Merchant, Manager, etc.)
- **Sports/Games**: 
    - Football, Player, Sport, Baseball, Basket, etc.
- **Other**:
    - Worker (Farmer, Librarian, Musher, Bookseller, Printer, etc.)
    - Family (Son, Daughter, Child, Wife of, Father, etc.)
    - Misc. (Esperantist, Criminal, Convict, Killer, Philanthropist, etc.)

Source: https://www.nature.com/articles/s41597-022-01369-4#Sec2 (section "Domains of influence and occupations")

Renaming:
- level1_main_occ --> **occup_l1**
- level2_main_occ --> **occup_l2**
- level3_main_occ --> **occup_l3**
- level3_all_occ  --> **occup_l3_all**

In [34]:
# Shorten the occupation column names: level 1-3 main occupation plus the
# column listing all level-3 occupations.
wiki.rename(
    {
        "level1_main_occ": "occup_l1",
        "level2_main_occ": "occup_l2",
        "level3_main_occ": "occup_l3",
        "level3_all_occ": "occup_l3_all",
    },
    axis="columns",
    inplace=True,
)

In [37]:
# wiki.info()

### Other Columns

Geolocalisation: _using longitudes and latitudes of birth and death places_
- bplo1 --> **birth_place_lon**
- dplo1 --> **death_place_lon**
- bpla1 --> **birth_place_lat**
- dpla1 --> **death_place_lat**

Periods of human history:
- bigperiod_birth_graph_b --> **bigperiod_birth**
- bigperiod_death_graph_b --> **bigperiod_death**

Citizenship:
- citizenship_1_b --> **citizenship**

Source: https://www.nature.com/articles/s41597-022-01369-4#Sec2 

In [36]:
# Rename the remaining cryptic columns: birth/death place coordinates,
# historical periods of birth/death, and citizenship.
wiki.rename(
    {
        "bplo1": "birth_place_lon",
        "dplo1": "death_place_lon",
        "bpla1": "birth_place_lat",
        "dpla1": "death_place_lat",
        "bigperiod_birth_graph_b": "bigperiod_birth",
        "bigperiod_death_graph_b": "bigperiod_death",
        "citizenship_1_b": "citizenship",
    },
    axis="columns",
    inplace=True,
)

In [38]:
# verify the renamed columns and the Int64 dtype of birth/death
wiki.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2291817 entries, Q1000002 to Q999999
Data columns (total 26 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   birth                     Int64  
 1   death                     Int64  
 2   gender                    object 
 3   occup_l1                  object 
 4   name                      object 
 5   un_subregion              object 
 6   bigperiod_birth           object 
 7   bigperiod_death           object 
 8   curid                     int64  
 9   occup_l2                  object 
 10  occup_l3                  object 
 11  avg_no_readers_2015_2018  int64  
 12  non_missing_score         int64  
 13  total_count_words         int64  
 14  no_wiki_editions          int64  
 15  no_external_links         int64  
 16  notability_index_sum      float64
 17  notability_index_ranking  float64
 18  citizenship               object 
 19  un_region                 object 
 20  group_wikipedia_editio

In [40]:
#wiki.head(10)

## Clean "name" Column

In [43]:
# Names use "_" as the word separator; turn each underscore into a space.
# regex=False: the pattern is a literal character, not a regular expression.
wiki["name"] = wiki["name"].str.replace("_", " ", regex=False)

In [57]:
# Remove a parenthesised part that follows the actual name,
# e.g. "John Smith (politician)" -> "John Smith".
# The pattern is a regular expression, so regex=True is passed explicitly:
# since pandas 2.0 Series.str.replace defaults to regex=False and would search for
# the literal text instead (removing nothing); older versions emit a FutureWarning.
# Note: ".*" is greedy, so everything from the first " (" to the last ")" is removed.
wiki["name"] = wiki["name"].str.replace(r" \(.*\)", "", regex=True)

  wiki["name"] = wiki["name"].str.replace(r" \(.*\)", "")


## Clean "citizenship" Column

In [61]:
# distinct citizenship values before cleaning
wiki["citizenship"].unique()

array(['Germany', 'Czech_Republic', 'US', 'Austria', 'Poland', 'Russia',
       'Ukraine', 'Belgium', 'Italy', 'France', 'Norway', 'Philippines',
       'Brazil', 'Ireland', 'Cameroon', 'Iran', 'Hungary', 'China',
       'United_Kingdom', 'Canada', nan, 'Japan', 'Austria-Hungary',
       'Switzerland', 'Venezuela', 'Ecuador', 'Portugal', 'Netherlands',
       'Greece', 'India', 'Bulgaria', 'Egypt', 'Argentina', 'Romania',
       'Colombia', 'Tuvalu', 'Serbia', 'Chile', 'South_Korea',
       'Indonesia', 'Croatia', 'Finland', 'Vietnam', 'Roman_Empire',
       'Kuwait', 'Singapore', 'Spain', 'Congo', 'Denmark', 'Malaysia',
       'Dominica', 'Nigeria', 'Tunisia', 'Turkey', 'Laos', 'Mexico',
       'Australia', 'Paraguay', 'Sweden', 'Zimbabwe', 'Honduras',
       'Grenada', 'Slovenia', 'Pakistan', 'Georgia', 'Kazakhstan',
       'Vanuatu', 'Ghana', 'New_Zealand', 'Syria', 'Holy_Roman_Empire',
       'Serbia_and_Montenegro', 'Jamaica', 'Belarus', 'Montenegro',
       'Hong_Kong', 'Taiwan',

In [63]:
# Citizenship values use "_" between words (e.g. "Czech_Republic"); turn each underscore into a space.
# regex=False: the pattern is a literal character, not a regular expression.
wiki["citizenship"] = wiki["citizenship"].str.replace("_", " ", regex=False)

In [67]:
#wiki.sample(50)

## Check unique Values and Counts after Data Cleaning

In [68]:
# distinct citizenship values after replacing underscores with spaces
wiki["citizenship"].unique()

array(['Germany', 'Czech Republic', 'US', 'Austria', 'Poland', 'Russia',
       'Ukraine', 'Belgium', 'Italy', 'France', 'Norway', 'Philippines',
       'Brazil', 'Ireland', 'Cameroon', 'Iran', 'Hungary', 'China',
       'United Kingdom', 'Canada', nan, 'Japan', 'Austria-Hungary',
       'Switzerland', 'Venezuela', 'Ecuador', 'Portugal', 'Netherlands',
       'Greece', 'India', 'Bulgaria', 'Egypt', 'Argentina', 'Romania',
       'Colombia', 'Tuvalu', 'Serbia', 'Chile', 'South Korea',
       'Indonesia', 'Croatia', 'Finland', 'Vietnam', 'Roman Empire',
       'Kuwait', 'Singapore', 'Spain', 'Congo', 'Denmark', 'Malaysia',
       'Dominica', 'Nigeria', 'Tunisia', 'Turkey', 'Laos', 'Mexico',
       'Australia', 'Paraguay', 'Sweden', 'Zimbabwe', 'Honduras',
       'Grenada', 'Slovenia', 'Pakistan', 'Georgia', 'Kazakhstan',
       'Vanuatu', 'Ghana', 'New Zealand', 'Syria', 'Holy Roman Empire',
       'Serbia and Montenegro', 'Jamaica', 'Belarus', 'Montenegro',
       'Hong Kong', 'Taiwan',

In [69]:
# distinct birth years (negative values presumably denote years BC -- TODO confirm against the dataset docs)
wiki["birth"].unique()

<IntegerArray>
[ 1932,  1860,  1971,  1983,  1912,  1928,  1818,  1907,  1915,  1891,
 ...
  -293,  -864,  -398,  -289,  -294,  -599,  -274, -1440, -1648,  -586]
Length: 2639, dtype: Int64

In [70]:
# distinct death years; <NA> marks rows without a death year
wiki["death"].unique()

<IntegerArray>
[ 1990,  1927,  <NA>,  1977,  2016,  1894,  1980,  1987,  1962,  1951,
 ...
  -244,  -996,  -173, -1540,  -527,  -452,  -540, -1415,  -593,  -532]
Length: 2941, dtype: Int64

In [71]:
# distinct gender labels (including missing values)
wiki["gender"].unique()

array(['Male', 'Female', 'Other', nan], dtype=object)

In [72]:
# distinct level-1 occupation categories
wiki["occup_l1"].unique()

array(['Culture', 'Leadership', 'Discovery/Science', 'Sports/Games',
       'Other', 'Missing'], dtype=object)

In [73]:
# number of rows per level-1 occupation category
wiki["occup_l1"].value_counts()

Culture              702330
Sports/Games         633450
Leadership           619146
Discovery/Science    273229
Other                 48245
Missing               15417
Name: occup_l1, dtype: int64

In [74]:
# distinct level-2 occupation categories
wiki["occup_l2"].unique()

array(['Culture-core', 'Administration/Law', 'Academia',
       'Culture-periphery', 'Sports/Games', 'Politics', 'Religious',
       'Other', 'Worker/Business (small)', 'Military', 'Nobility',
       'Corporate/Executive/Business (large)', 'Family', 'Missing',
       'Explorer/Inventor/Developer'], dtype=object)

In [75]:
# number of rows per level-2 occupation category
wiki["occup_l2"].value_counts()

Sports/Games                            634945
Culture-core                            604250
Politics                                314558
Academia                                249298
Culture-periphery                        97702
Administration/Law                       76884
Religious                                72379
Military                                 71582
Corporate/Executive/Business (large)     46266
Nobility                                 37477
Explorer/Inventor/Developer              22814
Worker/Business (small)                  22796
Missing                                  15417
Other                                    14579
Family                                   10870
Name: occup_l2, dtype: int64

In [76]:
# number of rows per level-3 occupation (several thousand distinct values, so the display is abbreviated)
wiki["occup_l3"].value_counts()

politician      270513
football        250667
actor           121790
writer           74716
painter          60189
                 ...  
homme_d_état         1
kulstötare           1
eremit               1
armurier             1
_diener              1
Name: occup_l3, Length: 4985, dtype: int64

In [77]:
# distinct UN subregions (including missing values)
wiki["un_subregion"].unique()

array(['Western Europe', 'Northern America', 'Eastern Europe',
       'Southern Europe', 'Northern Europe', 'SouthEast Asia',
       'South America', 'Central Africa',
       'Western Asia (Middle East Caucasus)', 'Eastern Asia', nan,
       'South Asia incl. Indian Peninsula', 'North Africa',
       'Oceania not Aus Nze', 'Caribbean', 'West Africa',
       'Central America', 'Oceania Western World', 'East Africa',
       'Central Asia', 'Southern Africa'], dtype=object)

In [78]:
# distinct UN regions (including missing values)
wiki["un_region"].unique()

array(['Europe', 'America', 'Asia', 'Africa', nan, 'Oceania'],
      dtype=object)

In [80]:
# number of rows per historical period of birth
wiki["bigperiod_birth"].value_counts()

5.Contemporary period 1901-2020AD      1518405
4.Mid Modern Period 1751-1900AD         487305
3.Early Modern Period 1501-1750AD       103722
2.Post-Classical History 501-1500AD      55297
1.Ancient History Before 500AD           14560
Name: bigperiod_birth, dtype: int64

In [81]:
# number of rows per historical period of death
wiki["bigperiod_death"].value_counts()

5.Contemporary period 1901-2020AD      1862123
4.Mid Modern Period 1751-1900AD         182459
3.Early Modern Period 1501-1750AD        76031
2.Post-Classical History 501-1500AD      45037
1.Ancient History Before 500AD           13639
Name: bigperiod_death, dtype: int64

In [82]:
# distinct values of the non-missing score
wiki["non_missing_score"].unique()

array([3, 2, 1, 0])

In [83]:
# distinct notability rank values (stored as floats)
wiki["notability_index_ranking"].unique()

array([1058542.,  131428.,  775768., ...,  154890.,  453953.,  344726.])

## Write cleaned Data into new File

In [84]:
# (rows, columns) of the cleaned frame; cleaning only renamed/reformatted values, so no rows or columns were dropped
wiki.shape

(2291817, 26)

In [85]:
#wiki.to_csv("../data/notable_people_cross-verified/wiki_notable_people_cleaned.csv")