# Gender Bias Analysis of Wikipedia

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

In [2]:
# Display settings: show every DataFrame column and never truncate cell values.
# https://stackoverflow.com/questions/47022070/display-all-dataframe-columns-in-a-jupyter-python-notebook
pd.set_option('display.max_columns', None)

# Keep wide frames on one line instead of wrapping them across several blocks.
# https://stackoverflow.com/a/51540918
pd.set_option('display.expand_frame_repr', False)

# Use the full option name: pandas resolves abbreviated names by pattern
# matching, which can break if a future option with a similar name is added.
pd.set_option('display.max_colwidth', None)

## Data Source

- **Project**: [A cross-verified database of notable people, 3500BC-2018AD](https://www.nature.com/articles/s41597-022-01369-4)
    - Paper: [Shareable Link](https://rdcu.be/c6YvW)
- **Download**: [A Brief History of Human Time - Cross-verified Dataset](https://data.sciencespo.fr/dataset.xhtml?persistentId=doi:10.21410/7E4/RDAG3O)

Includes data until 2018.

## Read Data

In [1]:
#!ls ../data/notable_people_cross-verified/

In [4]:
# Load the cleaned cross-verified dataset; the first CSV column becomes the index.
notable = pd.read_csv(
    "../data/notable_people_cross-verified/wiki_notable_people_cleaned_iso.csv",
    index_col=0,
    encoding="utf-8",
)

In [5]:
# Preview the first three rows of the freshly loaded table
notable.head(3)

Unnamed: 0_level_0,birth,death,gender,occup_l1,name,un_subregion,bigperiod_birth,bigperiod_death,curid,occup_l2,occup_l3,avg_no_readers_2015_2018,non_missing_score,total_count_words,no_wiki_editions,no_external_links,notability_index_sum,notability_index_ranking,citizenship,un_region,group_wikipedia_editions,birth_place_lon,death_place_lon,birth_place_lat,death_place_lat,occup_l3_all,map_iso3
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Q1000002,1932.0,1990.0,Male,Culture,Claus Hammel,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2949539,Culture-core,playwright,1669,3,1777,1,11,18.083672,1058542.0,Germany,Europe,grB,11.833333,12.42,53.416668,54.38139,D:_playwright_journalist_writer_screenwriter_P:_ drama_dramatiker_German,DEU
Q1000005,1860.0,1927.0,Male,Culture,Karel Matěj Čapek-Chod,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,4217319,Culture-core,writer,25008,3,6491,9,15,23.98061,131428.0,Czech Republic,Europe,grA,12.929798,14.421389,49.440605,50.087502,D:_writer_journalist_P:_naturalist_writer_journalist_English_ scrittore_Italian_ schriftsteller_journalist_vertreter_German_ författare_författare_Swedish,CZE
Q1000006,1971.0,,Male,Culture,Florian Eichinger,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,5050967,Culture-core,film,27285,3,1573,1,10,20.666656,775768.0,Germany,Europe,grB,9.191944,,48.897499,,D:_film_screenwriter_film_P:_regisseur_autor_film_German,DEU


In [6]:
#notable.info()

In [7]:
#notable.isnull().sum()

In [8]:
# Store birth years as nullable integers so missing years stay missing
notable["birth"] = notable["birth"].astype(pd.Int64Dtype())

In [9]:
# Store death years as nullable integers so missing years stay missing
notable["death"] = notable["death"].astype(pd.Int64Dtype())

In [10]:
# Show the column dtypes; birth and death were cast to nullable Int64 above
notable.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2291817 entries, Q1000002 to Q999999
Data columns (total 27 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   birth                     Int64  
 1   death                     Int64  
 2   gender                    object 
 3   occup_l1                  object 
 4   name                      object 
 5   un_subregion              object 
 6   bigperiod_birth           object 
 7   bigperiod_death           object 
 8   curid                     int64  
 9   occup_l2                  object 
 10  occup_l3                  object 
 11  avg_no_readers_2015_2018  int64  
 12  non_missing_score         int64  
 13  total_count_words         int64  
 14  no_wiki_editions          int64  
 15  no_external_links         int64  
 16  notability_index_sum      float64
 17  notability_index_ranking  float64
 18  citizenship               object 
 19  un_region                 object 
 20  group_wikipedia_editio

In [11]:
#notable.isnull().sum()

In [12]:
# Preview the first three rows again after the year columns were cast to Int64
notable.head(3)

Unnamed: 0_level_0,birth,death,gender,occup_l1,name,un_subregion,bigperiod_birth,bigperiod_death,curid,occup_l2,occup_l3,avg_no_readers_2015_2018,non_missing_score,total_count_words,no_wiki_editions,no_external_links,notability_index_sum,notability_index_ranking,citizenship,un_region,group_wikipedia_editions,birth_place_lon,death_place_lon,birth_place_lat,death_place_lat,occup_l3_all,map_iso3
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Q1000002,1932,1990.0,Male,Culture,Claus Hammel,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,2949539,Culture-core,playwright,1669,3,1777,1,11,18.083672,1058542.0,Germany,Europe,grB,11.833333,12.42,53.416668,54.38139,D:_playwright_journalist_writer_screenwriter_P:_ drama_dramatiker_German,DEU
Q1000005,1860,1927.0,Male,Culture,Karel Matěj Čapek-Chod,Western Europe,4.Mid Modern Period 1751-1900AD,5.Contemporary period 1901-2020AD,4217319,Culture-core,writer,25008,3,6491,9,15,23.98061,131428.0,Czech Republic,Europe,grA,12.929798,14.421389,49.440605,50.087502,D:_writer_journalist_P:_naturalist_writer_journalist_English_ scrittore_Italian_ schriftsteller_journalist_vertreter_German_ författare_författare_Swedish,CZE
Q1000006,1971,,Male,Culture,Florian Eichinger,Western Europe,5.Contemporary period 1901-2020AD,5.Contemporary period 1901-2020AD,5050967,Culture-core,film,27285,3,1573,1,10,20.666656,775768.0,Germany,Europe,grB,9.191944,,48.897499,,D:_film_screenwriter_film_P:_regisseur_autor_film_German,DEU


In [13]:
# List the distinct birth-period labels present in bigperiod_birth
notable["bigperiod_birth"].unique()

array(['5.Contemporary period 1901-2020AD',
       '4.Mid Modern Period 1751-1900AD',
       '2.Post-Classical History 501-1500AD',
       '3.Early Modern Period 1501-1750AD',
       '1.Ancient History Before 500AD', nan], dtype=object)

In [14]:
# Which Wikipedia-notable people born in the ancient period are recorded with the gender "Other"?
notable[
    notable["gender"].eq("Other")
    & notable["bigperiod_birth"].eq("1.Ancient History Before 500AD")
]

Unnamed: 0_level_0,birth,death,gender,occup_l1,name,un_subregion,bigperiod_birth,bigperiod_death,curid,occup_l2,occup_l3,avg_no_readers_2015_2018,non_missing_score,total_count_words,no_wiki_editions,no_external_links,notability_index_sum,notability_index_ranking,citizenship,un_region,group_wikipedia_editions,birth_place_lon,death_place_lon,birth_place_lat,death_place_lat,occup_l3_all,map_iso3
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Q484609,,,Other,Culture,Analatos Painter,Southern Europe,1.Ancient History Before 500AD,1.Ancient History Before 500AD,17771787,Culture-core,painter,14543,3,9106,7,5,22.573139,254434.0,Greece,Europe,grA,,,,,D:_painter_painter_P:_painter_potter_English_archeologico_Italian_archäologisch_gang_maler_German_pintor_alfarer_farero_Spanish,GRC
Q554387,85.0,160.0,Other,Discovery/Science,Favorinus,Southern Europe,1.Ancient History Before 500AD,1.Ancient History Before 500AD,80268,Academia,philosopher,37719,3,12768,16,22,25.961508,64407.0,Roman Empire,Europe,grA,4.628611,12.482778,43.676945,41.893055,D:_philosopher_P:_philosopher_English_philosophe_académie_French_filosofo_oratore_Italian_philosoph_akademisch_German_filosof_filosofía_filósof_Spanish,ITA


## Top 10 Wikipedia Notable People

In [13]:
# Slim ranking table, best-ranked (lowest ranking number) people first
occupation = notable.loc[
    :,
    ["notability_index_ranking", "gender", "name", "occup_l3", "occup_l1", "birth", "citizenship"],
].sort_values("notability_index_ranking")
occupation.head()

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q76,1.0,Male,Barack Obama,politician,Leadership,1961,US
Q22686,2.0,Male,Donald Trump,magnate,Leadership,1946,US
Q762,3.0,Male,Leonardo da Vinci,polymath,Discovery/Science,1452,Italy
Q352,4.0,Male,Adolf Hitler,politician,Leadership,1889,Austria
Q937,5.0,Male,Albert Einstein,physicist,Discovery/Science,1879,Germany


In [14]:
# Store rankings as whole numbers so they display without a trailing ".0"
occupation = occupation.assign(
    notability_index_ranking=lambda df: df["notability_index_ranking"].round().astype(int)
)

### – by Gender

In [15]:
# List the distinct values of the gender column (NaN appears when entries are missing)

notable["gender"].unique()

array(['Male', 'Female', 'Other', nan], dtype=object)

In [16]:
# Ten best-ranked people whose gender is recorded as "Male"
occupation[occupation["gender"].eq("Male")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q76,1,Male,Barack Obama,politician,Leadership,1961,US
Q22686,2,Male,Donald Trump,magnate,Leadership,1946,US
Q762,3,Male,Leonardo da Vinci,polymath,Discovery/Science,1452,Italy
Q352,4,Male,Adolf Hitler,politician,Leadership,1889,Austria
Q937,5,Male,Albert Einstein,physicist,Discovery/Science,1879,Germany
Q2831,6,Male,Michael Jackson,dancer,Culture,1958,US
Q8016,7,Male,Winston Churchill,politician,Leadership,1874,United Kingdom
Q692,8,Male,William Shakespeare,playwright,Culture,1564,United Kingdom
Q517,9,Male,Napoleon,politician,Leadership,1769,France
Q303,10,Male,Elvis Presley,actor,Culture,1935,US


In [17]:
# Ten best-ranked people whose gender is recorded as "Female"
occupation[occupation["gender"].eq("Female")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q6294,25,Female,Hillary Clinton,politician,Leadership,1947,US
Q1744,32,Female,Madonna,singer,Culture,1958,US
Q4616,34,Female,Marilyn Monroe,actor,Culture,1926,US
Q19848,58,Female,Lady Gaga,singer,Culture,1986,US
Q11975,59,Female,Britney Spears,singer,Culture,1981,US
Q7186,66,Female,Marie Curie,physicist,Discovery/Science,1867,Poland
Q13909,68,Female,Angelina Jolie,actor,Culture,1975,US
Q7207,71,Female,Elizabeth I of England,queen,Leadership,1533,United Kingdom
Q9439,72,Female,Queen Victoria,queen,Leadership,1819,United Kingdom
Q36844,76,Female,Rihanna,singer,Culture,1988,Barbados


In [18]:
# Ten best-ranked people whose gender is recorded as "Other"
occupation[occupation["gender"].eq("Other")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q365144,1923,Other,Caitlyn Jenner,olympic,Sports/Games,1949,US
Q194045,2145,Other,Steven Tyler,singer,Culture,1948,US
Q15123969,3001,Other,Sam Smith,singer,Culture,1992,United Kingdom
Q1140735,8650,Other,Lili Elbe,painter,Culture,1882,Denmark
Q201204,8721,Other,Dana International,singer,Culture,1969,Israel
Q254431,9293,Other,Alexis Arquette,actor,Culture,1969,US
Q44641,9528,Other,Wendy Carlos,composer,Culture,1939,US
Q464357,9873,Other,Jake Zyrus,singer,Culture,1992,Philippines
Q982182,10863,Other,Richard O'Brien,actor,Culture,1942,United Kingdom
Q195691,11452,Other,Amandla Stenberg,actor,Culture,1998,US


### – by Occupation

In [19]:
# List the distinct level-1 occupation categories in occup_l1

notable["occup_l1"].unique()

array(['Culture', 'Leadership', 'Discovery/Science', 'Sports/Games',
       'Other', 'Missing'], dtype=object)

In [20]:
# Ten best-ranked people in the "Culture" occupation category
occupation[occupation["occup_l1"].eq("Culture")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q2831,6,Male,Michael Jackson,dancer,Culture,1958,US
Q692,8,Male,William Shakespeare,playwright,Culture,1564,United Kingdom
Q303,10,Male,Elvis Presley,actor,Culture,1935,US
Q5582,13,Male,Vincent van Gogh,painter,Culture,1853,Netherlands
Q254,14,Male,Wolfgang Amadeus Mozart,composer,Culture,1756,Holy Roman Empire
Q255,17,Male,Ludwig van Beethoven,composer,Culture,1770,Germany
Q5593,18,Male,Pablo Picasso,painter,Culture,1881,Spain
Q9960,20,Male,Ronald Reagan,actor,Culture,1911,US
Q882,23,Male,Charlie Chaplin,film,Culture,1889,United Kingdom
Q1203,27,Male,John Lennon,producer,Culture,1940,United Kingdom


In [21]:
# Ten best-ranked people in the "Leadership" occupation category
occupation[occupation["occup_l1"].eq("Leadership")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q76,1,Male,Barack Obama,politician,Leadership,1961,US
Q22686,2,Male,Donald Trump,magnate,Leadership,1946,US
Q352,4,Male,Adolf Hitler,politician,Leadership,1889,Austria
Q8016,7,Male,Winston Churchill,politician,Leadership,1874,United Kingdom
Q517,9,Male,Napoleon,politician,Leadership,1769,France
Q1001,11,Male,Mahatma Gandhi,politician,Leadership,1869,India
Q8023,15,Male,Nelson Mandela,politician,Leadership,1918,South Africa
Q91,18,Male,Abraham Lincoln,politician,Leadership,1809,US
Q5809,24,Male,Che Guevara,politician,Leadership,1928,Argentina
Q6294,25,Female,Hillary Clinton,politician,Leadership,1947,US


In [22]:
# Ten best-ranked people in the "Discovery/Science" occupation category
occupation[occupation["occup_l1"].eq("Discovery/Science")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q762,3,Male,Leonardo da Vinci,polymath,Discovery/Science,1452,Italy
Q937,5,Male,Albert Einstein,physicist,Discovery/Science,1879,Germany
Q1035,12,Male,Charles Darwin,biologist,Discovery/Science,1809,United Kingdom
Q9061,16,Male,Karl Marx,economist,Discovery/Science,1818,Germany
Q9215,21,Male,Sigmund Freud,psychoanalyst,Discovery/Science,1856,Austria
Q9554,22,Male,Martin Luther,linguist,Discovery/Science,1483,Holy Roman Empire
Q868,26,Male,Aristotle,biologist,Discovery/Science,-383,Greece
Q307,28,Male,Galileo Galilei,astronomer,Discovery/Science,1564,Italy
Q859,36,Male,Plato,philosopher,Discovery/Science,-427,Greece
Q9358,37,Male,Friedrich Nietzsche,philosopher,Discovery/Science,1844,Germany


In [23]:
# Ten best-ranked people in the "Sports/Games" occupation category
occupation[occupation["occup_l1"].eq("Sports/Games")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q36107,61,Male,Muhammad Ali,boxer,Sports/Games,1942,US
Q615,88,Male,Lionel Messi,football,Sports/Games,1987,Argentina
Q10520,136,Male,David Beckham,football,Sports/Games,1975,United Kingdom
Q41421,165,Male,Michael Jordan,basket,Sports/Games,1963,US
Q11571,182,Male,Cristiano Ronaldo,football,Sports/Games,1985,Portugal
Q12897,200,Male,Pelé,football,Sports/Games,1940,Brazil
Q17515,223,Male,Diego Maradona,football,Sports/Games,1960,Argentina
Q46896,256,Male,Zlatan Ibrahimović,football,Sports/Games,1981,Sweden
Q1835,256,Male,Zinedine Zidane,football,Sports/Games,1972,France
Q142794,277,Male,Neymar,football,Sports/Games,1992,Brazil


In [24]:
# Ten best-ranked people in the "Other" occupation category
occupation[occupation["occup_l1"].eq("Other")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q1317,436,Male,Osama bin Laden,terrorista,Other,1957,Saudi Arabia
Q13424289,537,Male,Edward Snowden,employee,Other,1983,US
Q191103,1333,Female,Lucrezia Borgia,daughter,Other,1480,Italy
Q80048,1515,Male,Al Capone,gangster,Other,1899,US
Q49086,1793,Male,Simon Wiesenthal,survivor,Other,1908,Austria
Q485508,1804,Male,Charles Manson,criminal,Other,1934,US
Q48745,2450,Male,Lee Harvey Oswald,killer,Other,1939,US
Q191039,2956,Male,Germanicus,son,Other,-15,Roman Empire
Q206191,3074,Female,Abigail Adams,mother_of,Other,1744,United Kingdom
Q44200,3134,Male,Billy the Kid,outlaw,Other,1859,US


In [25]:
# Ten best-ranked people whose occupation category is labelled "Missing"
occupation[occupation["occup_l1"].eq("Missing")].head(10)

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q442467,65516,Female,María Sabina,,Missing,1894,Mexico
Q708522,124229,Male,Liu Zhiyuan,,Missing,895,China
Q1008018,127972,Male,Dahui Zonggao,,Missing,1089,China
Q1639665,175622,Male,Jassa Singh Ahluwalia,,Missing,1718,
Q3917678,201966,Male,Jeremiah Brandreth,,Missing,1790,United Kingdom
Q642993,205964,Male,Shuja-ud-Daula,,Missing,1732,India
Q998953,209974,Male,Marco Feingold,,Missing,1913,Austria
Q737058,214232,Male,Tāmati Wāka Nene,,Missing,1785,New Zealand
Q4920344,214579,Male,Vajirananavarorasa,,Missing,1860,Thailand
Q453714,226172,Male,Thomas Hobson,,Missing,1544,United Kingdom


In [26]:
# Look up Angela Merkel's row in the ranking table
occupation[occupation["name"].eq("Angela Merkel")]

Unnamed: 0_level_0,notability_index_ranking,gender,name,occup_l3,occup_l1,birth,citizenship
wikidata_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q567,113,Female,Angela Merkel,politician,Leadership,1954,Germany
