# Analyzing Wikipedia - Gender-Bias?

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
# Notebook-wide pandas display settings: show every column of a frame and never
# truncate the contents of a cell.
# Sources:
#   https://stackoverflow.com/questions/47022070/display-all-dataframe-columns-in-a-jupyter-python-notebook
#   https://stackoverflow.com/a/51540918
# Further options from the second answer that are deliberately left disabled:
#   'display.max_rows' -> 500, 'display.width' -> 1000
display_settings = {
    'display.max_columns': None,         # no limit on the number of columns shown
    'display.expand_frame_repr': False,  # do not wrap wide frames over several blocks
    'max_colwidth': None,                # do not truncate long cell values
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

## Data Source

- **Project**: [A cross-verified database of notable people, 3500BC-2018AD](https://www.nature.com/articles/s41597-022-01369-4)
    - Paper: [Shareable Link](https://rdcu.be/c6YvW)
- **Download**: [A Brief History of Human Time - Cross-verified Dataset](https://data.sciencespo.fr/dataset.xhtml?persistentId=doi:10.21410/7E4/RDAG3O)

Includes data until 2018.

## Read Data

In [7]:
#!ls ../data/notable_people_cross-verified/

In [4]:
# Load the cross-verified database. The first CSV column (the Wikidata code)
# becomes the index. UTF-8 is requested explicitly so that values containing
# "ä", "ö", ... , "ß" etc. are decoded correctly.
csv_path = "../data/notable_people_cross-verified/cross-verified-database.csv"
notable = pd.read_csv(csv_path, index_col=0, encoding='utf-8')

In [5]:
#result: all 2.29 million individuals imported

In [6]:
# Overview of the imported frame: number of records, column names and dtypes.
# For this frame the printed summary lists dtypes only, without non-null counts.

notable.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2291817 entries, Q1000002 to Q999999
Data columns (total 48 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   birth                     float64
 1   death                     float64
 2   updated_death_date        float64
 3   approx_birth              object 
 4   approx_death              object 
 5   birth_min                 float64
 6   birth_max                 float64
 7   death_min                 float64
 8   death_max                 float64
 9   gender                    object 
 10  level1_main_occ           object 
 11  name                      object 
 12  un_subregion              object 
 13  birth_estimation          float64
 14  death_estimation          float64
 15  bigperiod_birth_graph_b   object 
 16  bigperiod_death_graph_b   object 
 17  curid                     int64  
 18  level2_main_occ           object 
 19  freq_main_occ             float64
 20  freq_second_occ       

In [6]:
# Preview five random records. The seed is fixed so that "Restart & Run All"
# always shows the same rows and the preview is reproducible.
notable.sample(5, random_state=42)

Unnamed: 0_level_0,birth,death,updated_death_date,approx_birth,approx_death,birth_min,birth_max,death_min,death_max,gender,level1_main_occ,name,un_subregion,birth_estimation,death_estimation,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,freq_main_occ,freq_second_occ,level2_second_occ,level3_main_occ,bigperiod_birth,bigperiod_death,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,all_geography_groups,string_citizenship_raw_d,citizenship_1_b,citizenship_2_b,list_areas_of_rattach,area1_of_rattachment,area2_of_rattachment,list_wikipedia_editions,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,pantheon_1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
Q50562232,1646.0,1727.0,,,,1646.0,1646.0,1727.0,1727.0,Male,Missing,Johann_Philipp_Fuchs_von_Dornheim,Western Europe,1646.0,1727.0,3.Early Modern Period 1501-1750AD,3.Early Modern Period 1501-1750AD,10265650,Missing,,,Missing,,3.Early Modern Period 1501-1750AD,3.Early Modern Period 1501-1750AD,237,2,1388,1,0,13.126601,2828352.0,,,Germany,,P:_'Germany'_can't_verifyB1,Old_(before_year_1990_AD)_Germany,Missing,dewiki,Europe,grB,,9.929444,,49.794445,0,D:_P:
Q27921768,1765.0,1850.0,,,,1765.0,1765.0,1850.0,1850.0,Male,Leadership,August_Erich_Johann_von_Berger,Western Europe,1765.0,1850.0,4.Mid Modern Period 1751-1900AD,4.Mid Modern Period 1751-1900AD,9675869,Military,0.666667,0.333333,Nobility,leutnant,4.Mid Modern Period 1751-1900AD,4.Mid Modern Period 1751-1900AD,425,3,1707,1,0,14.197682,2607876.0,,,Germany,,P:_'Germany'_can't_verifyB1,Old_(before_year_1990_AD)_Germany,Missing,dewiki,Europe,grB,10.0825,9.738611,52.625557,52.374443,0,D:_P:_königlich_leutnant_infanterie_German
Q3426550,1900.0,1981.0,,,,1900.0,1900.0,1981.0,1981.0,Male,Culture,René_Levrel,Western Europe,1900.0,1981.0,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,3241986,Culture-core,1.0,,Missing,painter,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,1648,3,5054,1,9,18.933607,845170.0,France,'France',France,,D:_'France'_matchB1_P:_'France',France,Missing,frwiki,Europe,grB,-1.553889,0.429444,47.217224,47.837776,0,D:_painter_P:_peintre_arts_French
Q10382197,1952.0,,,,,1952.0,1952.0,,,Male,Discovery/Science,Thomas_Synger,South America,1952.0,2037.5359,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2466095,Academia,1.0,,Missing,archaeologist,5.Contemporary period 1901-2020AD,Missing,417,3,1688,1,0,14.167672,2618629.0,Brazil,'Brazil',Brazil,,D:_'Brazil'_matchB1_P:_'Brazil',Brazil,Missing,ptwiki,America,grB,-43.196388,,-22.908333,,0,D:_archaeologist_P:_professor_historiador_arqueólog_Portuguese
Q582988,1977.0,,,,,1977.0,1977.0,,,Male,Sports/Games,Matt_Duke,Western Europe,1977.0,2054.1047,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,5675402,Sports/Games,1.0,,Missing,football,5.Contemporary period 1901-2020AD,Missing,33110,3,8337,7,2,22.614344,369871.5,United_Kingdom_of_Great_Britain_and_Northern_Ireland,'United_Kingdom',United_Kingdom,,D:_'United_Kingdom'_matchB1_P:_'England',United_Kingdom,Missing,huwiki|arwiki|dewiki|enwiki|itwiki|plwiki|fawiki,Europe,grA,-1.4659,,53.382969,,0,D:_football_P:_football_goalkeeper_coach_English_calciatore_portiere_Italian_fußball_torwart_German


In [8]:
# Distinct period labels in `bigperiod_birth_graph_b`; records without a
# period show up as NaN in this column.
pd.unique(notable["bigperiod_birth_graph_b"])

array(['5.Contemporary period 1901-2020AD',
       '4.Mid Modern Period 1751-1900AD',
       '2.Post-Classical History 501-1500AD',
       '3.Early Modern Period 1501-1750AD',
       '1.Ancient History Before 500AD', nan], dtype=object)

In [9]:
# Distinct period labels in `bigperiod_death_graph_b`; records without a
# period show up as NaN in this column.
pd.unique(notable["bigperiod_death_graph_b"])

array(['5.Contemporary period 1901-2020AD',
       '4.Mid Modern Period 1751-1900AD',
       '2.Post-Classical History 501-1500AD',
       '3.Early Modern Period 1501-1750AD',
       '1.Ancient History Before 500AD', nan], dtype=object)

In [10]:
# Distinct period labels in `bigperiod_birth`; this column uses the string
# 'Missing' (not NaN) for records without a period.
pd.unique(notable["bigperiod_birth"])

array(['5.Contemporary period 1901-2020AD',
       '4.Mid Modern Period 1751-1900AD', 'Missing',
       '2.Post-Classical History 501-1500AD',
       '3.Early Modern Period 1501-1750AD',
       '1.Ancient History Before 500AD'], dtype=object)

In [11]:
# Distinct period labels in `bigperiod_death`; this column uses the string
# 'Missing' (not NaN) for records without a period.
pd.unique(notable["bigperiod_death"])

array(['5.Contemporary period 1901-2020AD', 'Missing',
       '4.Mid Modern Period 1751-1900AD',
       '2.Post-Classical History 501-1500AD',
       '3.Early Modern Period 1501-1750AD',
       '1.Ancient History Before 500AD'], dtype=object)

In [12]:
# Distinct values of the `approx_birth` qualifier column (NaN where none is set).
pd.unique(notable["approx_birth"])

array([nan, 'century', 'circa', 'millenium'], dtype=object)

In [13]:
# Distinct values of the `approx_death` qualifier column (NaN where none is set).
pd.unique(notable["approx_death"])

array([nan, 'circa', 'century', 'millenium'], dtype=object)

In [14]:
# Distinct values of `pantheon_1`; only the two flag values 0 and 1 occur.
pd.unique(notable["pantheon_1"])

array([0, 1])

##### Check that special characters have been imported correctly

In [15]:
# Spot check: a value containing "ä" and "ö" must be matched exactly if the
# file was decoded correctly.
notable[notable["level3_all_occ"].eq("D:_economist_P:_pädagoge_reformer_ökonom_German")]

Unnamed: 0_level_0,birth,death,updated_death_date,approx_birth,approx_death,birth_min,birth_max,death_min,death_max,gender,level1_main_occ,name,un_subregion,birth_estimation,death_estimation,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,freq_main_occ,freq_second_occ,level2_second_occ,level3_main_occ,bigperiod_birth,bigperiod_death,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,all_geography_groups,string_citizenship_raw_d,citizenship_1_b,citizenship_2_b,list_areas_of_rattach,area1_of_rattachment,area2_of_rattachment,list_wikipedia_editions,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,pantheon_1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
Q100557,1702.0,1786.0,,,,1702.0,1702.0,1786.0,1786.0,Male,Discovery/Science,Philipp_Ernst_Lüders,Northern Europe,1702.0,1786.0,3.Early Modern Period 1501-1750AD,4.Mid Modern Period 1751-1900AD,748021,Academia,0.8,0.2,Politics,economist,3.Early Modern Period 1501-1750AD,4.Mid Modern Period 1751-1900AD,2165,3,1581,2,6,18.092989,828761.0,Denmark,'Denmark',Denmark,,D:_'Denmark'_can't_verifyB1,Denmark,Missing,dewiki|dawiki,Europe,grB,9.6375,,54.798889,,0,D:_economist_P:_pädagoge_reformer_ökonom_German


In [16]:
# Spot check: a name containing "ß" must be matched exactly; show only the
# name column of the matching record(s).
notable.loc[notable["name"].eq("Walter_Schultheiß"), ["name"]]

Unnamed: 0_level_0,name
wikidata_code,Unnamed: 1_level_1
Q100562,Walter_Schultheiß


In [17]:
# Spot check: the multilingual occupation string of record Q10068 (contains
# accented characters from several languages).
notable.loc[["Q10068"], ["level3_all_occ"]]

Unnamed: 0_level_0,level3_all_occ
wikidata_code,Unnamed: 1_level_1
Q10068,D:_skier_P:_world cup_ski_racer_English_champion_olympique_coupe du monde_French_sciatrice_olimpico_coppa del mondo_Italian_ ski_läufer_athlet_German_ esquí_campeon_copa del mundo_Spanish_esquiador_copa do mundo_esqui_Portuguese_skidåkare_världscupen_Swedish


## Check for NULL values

In [18]:
# Number of missing (NULL/NaN) entries in each column.

notable.isna().sum()

birth                        195919
death                       1244507
updated_death_date          2275077
approx_birth                2201780
approx_death                2259756
birth_min                    142965
birth_max                    142965
death_min                   1228132
death_max                   1228132
gender                         1398
level1_main_occ                   0
name                              0
un_subregion                  53567
birth_estimation             112528
death_estimation             112528
bigperiod_birth_graph_b      112528
bigperiod_death_graph_b      112528
curid                             0
level2_main_occ                   0
freq_main_occ                 15417
freq_second_occ             1443910
level2_second_occ                 0
level3_main_occ               15959
bigperiod_birth                   0
bigperiod_death                   0
wiki_readers_2015_2018            0
non_missing_score                 0
total_count_words_b         

In [19]:
#notable.loc[notable["gender"].isnull() == True]

In [20]:
#notable.loc[notable["gender"].isna() == True]

## Drop Columns not necessary for planned Analysis

In [21]:
#notable["approx_birth"].value_counts()

In [22]:
# Columns that are not needed for the planned analysis.
columns_to_drop = [
    "updated_death_date",
    "approx_birth",
    "approx_death",
    "birth_min",
    "birth_max",
    "death_min",
    "death_max",
    "birth_estimation",
    "death_estimation",
    "bigperiod_birth",
    "bigperiod_death",
    "level2_second_occ",
    "freq_main_occ",
    "freq_second_occ",
    "all_geography_groups",
    "string_citizenship_raw_d",
    "citizenship_2_b",
    "list_areas_of_rattach",
    "area1_of_rattachment",
    "area2_of_rattachment",
    "pantheon_1",
    "list_wikipedia_editions",
]

# Reassign instead of mutating with `inplace=True`, and ignore labels that are
# already gone, so that re-running this cell on the reduced frame is a no-op
# instead of raising a KeyError.
# Trade-off: a misspelled column name is silently skipped as well.
notable = notable.drop(columns=columns_to_drop, errors="ignore")

## Check Gender Data

In [23]:
# Distinct values recorded in the gender column (NaN included).

pd.unique(notable["gender"])

array(['Male', 'Female', 'Other', nan], dtype=object)

In [24]:
# Number of records per gender value; records with a missing gender are
# excluded from the counts.

notable["gender"].value_counts(dropna=True)

Male      1901904
Female     387906
Other         609
Name: gender, dtype: int64

In [25]:
# Subset of records whose gender is recorded as "Other".
is_gender_other = notable["gender"].eq("Other")
notable_gender_other = notable[is_gender_other]

In [26]:
# Earliest and latest birth year among the records with gender "Other".
other_births = notable_gender_other["birth"]
(other_births.min(), other_births.max())

(85.0, 2007.0)

In [27]:
# Show the earliest-born record(s) with gender "Other".
# The birth year is taken from the data instead of being hard-coded (it was
# 85.0), so the cell stays correct if the underlying data changes.
earliest_other_birth = notable_gender_other["birth"].min()
notable_gender_other.loc[notable_gender_other["birth"] == earliest_other_birth]

Unnamed: 0_level_0,birth,death,gender,level1_main_occ,name,un_subregion,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,level3_main_occ,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,citizenship_1_b,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Q554387,85.0,160.0,Other,Discovery/Science,Favorinus,Southern Europe,1.Ancient History Before 500AD,1.Ancient History Before 500AD,80268,Academia,philosopher,37719,3,12768,16,22,25.961508,64407.0,Roman_Empire,Europe,grA,4.628611,12.482778,43.676945,41.893055,D:_philosopher_P:_philosopher_English_philosophe_académie_French_filosofo_oratore_Italian_philosoph_akademisch_German_filosof_filosofía_filósof_Spanish


In [28]:
# Records whose birth year is stored as 0.
# NOTE(review): the result contains entries such as a lawyer/activist from
# Tunisia, so 0 may partly be a placeholder for an unknown year - verify
# before using these rows in time-based analyses.
notable[notable["birth"].eq(0)]

Unnamed: 0_level_0,birth,death,gender,level1_main_occ,name,un_subregion,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,level3_main_occ,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,citizenship_1_b,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Q3427125,0.0,,Female,Culture,Renée_Chevalier,Northern America,1.Ancient History Before 500AD,1.Ancient History Before 500AD,2010745,Culture-core,artist,568,3,2121,1,2,15.801013,1628528.0,Canada,America,grB,-66.683296,,49.099998,,D:_artist_P:_ artiste_artiste_peintre_French
Q40662,0.0,30.0,Male,Leadership,John_the_Baptist,Western Asia (Middle East Caucasus),1.Ancient History Before 500AD,1.Ancient History Before 500AD,16125,Religious,hermit,9751570,3,109542,98,25,35.550228,2705.0,Israel,Asia,grA,35.161995,35.63361,31.766727,31.56694,D:_hermit_prophet_P:_preacher_religious_christianity_English_prédicateur_jésus_French_prediger_judentum_schreiber_German_profeta_corán_predicador_Spanish_pregador_escritor_historiador_Portuguese_profet_son_löpare_Swedish
Q4788796,0.0,,Male,Culture,Arellius,Southern Europe,1.Ancient History Before 500AD,1.Ancient History Before 500AD,27566030,Culture-core,painter,8318,3,2679,2,1,18.71199,877268.0,Italy,Europe,grA,,,,,D:_painter_P:_painter_English_pintor_Spanish
Q53025434,0.0,,Female,Leadership,Yosra_Frawes,North Africa,1.Ancient History Before 500AD,1.Ancient History Before 500AD,11647455,Politics,activist,445,3,2595,1,3,16.048191,1451794.0,Tunisia,Africa,grB,9.936082,,36.850609,,D:_lawyer_activist_feminist_P:_avocat_affaires_militant_French
Q928844,0.0,,Male,Culture,Semimaru,Eastern Asia,1.Ancient History Before 500AD,1.Ancient History Before 500AD,5922372,Culture-core,poet,166912,3,6517,8,5,24.796553,183611.0,Japan,Asia,grA,,,,,D:_poet_P:_player_emperor_English_ poète_musicien_fils_French_músico_poeta_poeta_Spanish
Q939773,0.0,45.0,Female,Leadership,Drusilla_of_Mauretania_the_Elder,West Africa,1.Ancient History Before 500AD,1.Ancient History Before 500AD,74704,Nobility,prince,30257,2,16995,8,2,23.066504,340847.0,Mauritania,Africa,grA,,,,,D:_P:_prince_fille_roi_French_principessa_Italian_princesa_Portuguese


In [29]:
# Distinct birth years; at this point they are still floats, and negative
# years occur as well.
pd.unique(notable["birth"])

array([ 1932.,  1860.,  1971., ..., -1440., -1648.,  -586.])

In [30]:
# Store birth years as pandas' nullable integer type: whole numbers instead of
# floats, while missing years stay missing.
notable["birth"] = notable["birth"].astype(pd.Int64Dtype())

In [31]:
# Store death years as pandas' nullable integer type: whole numbers instead of
# floats, while missing years stay missing.
notable["death"] = notable["death"].astype(pd.Int64Dtype())

In [32]:
#notable.info()

In [33]:
# Data quirk: women of princely rank carry the occupation "prince" (there is
# no separate "princess" label in these rows). Show a few examples.

is_prince = notable["level3_main_occ"].eq("prince")
is_female = notable["gender"].eq("Female")
notable[is_prince & is_female].head()

Unnamed: 0_level_0,birth,death,gender,level1_main_occ,name,un_subregion,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,level3_main_occ,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,citizenship_1_b,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,level3_all_occ
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Q100754,1980,,Female,Leadership,Princess_Maria_Theresia_of_Thurn_and_Taxis_(b._1980),Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,25634816,Nobility,prince,134470,3,14071,3,3,24.133657,294030.0,Germany,Europe,grA,12.083333,,49.016666,,D:_P:_publishing_fiancée_prince_English_principe_irmã_editor_Portuguese
Q1030337,1925,1961.0,Female,Leadership,Shigeko_Higashikuni,Eastern Asia,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,20406378,Nobility,prince,662676,3,13299,10,9,27.600048,69831.5,Japan,Asia,grA,139.728607,139.692215,35.68,35.689724,D:_P:_wife of_prince_daughter_English_ fille_empereur_impératrice_French_figlia_imperatore_imperatrice_Italian_ filha_imperador_imperatriz_Portuguese
Q1035346,-251,-228.0,Female,Leadership,Queen_Dowager_Zhao,Eastern Asia,1.Ancient History Before 500AD,1.Ancient History Before 500AD,37062814,Nobility,prince,531328,3,3378,9,1,24.304209,534011.0,China,Asia,grA,,,,,D:_P:_merchant_prince_son_English_drottning_gift med_kung_Swedish
Q105963,1900,1974.0,Female,Leadership,Hélène_Elisabeth_von_Isenburg,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,10538869,Nobility,prince,33959,3,4222,3,2,21.266235,491984.0,Germany,Europe,grB,8.65,6.971,49.866665,51.3265,D:_P:_prince_noble_French
Q1091368,1191,1237.0,Female,Leadership,"Joan,_Lady_of_Wales",Western Europe,2.Post-Classical History 501-1500AD,2.Post-Classical History 501-1500AD,3560163,Nobility,prince,113007,3,16622,14,5,25.853592,133262.0,United_Kingdom,Europe,grA,2.0,-3.633333,47.0,52.349998,D:_prince_P:_wife of_prince_ruler_English_ fille_roi_reine_French_sposato con_Italian_tochter_fürsten_ermittler_German_princesa_señor_esposa_Spanish_ filha_rei_senhor_Portuguese_gift med_furste_äktenskap_Swedish


> Next Steps: 
- look into which columns can be dropped further, e.g. bigperiod_birth/_death columns, areas_of_rattach, pantheon, citizenship, level3_all_occ, ...
- save (then smaller) file
- clean up name column

In [34]:
# Shape of the table after dropping columns: the number of rows is unchanged
# and the number of columns was reduced from 48 to 26.
notable.shape

(2291817, 26)

In [35]:
# write new file with reduced no. of columns

# notable.to_csv("../data/notable_people_cross-verified/wiki_notable_people_reduced.csv")