In [101]:
import numpy as np
import pandas as pd

In [102]:
# Load the speaker attributes table from the parquet file next to this notebook.
speaker_attributes_path = "speaker_attributes.parquet"
df = pd.read_parquet(speaker_attributes_path)

In [103]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


### Here is a quick example of how to search/filter for a specific politician, keeping in mind that most cells contain lists, not plain strings.

In [104]:
# Look up people by name: set the text to search for below.
# The match is a case-sensitive, literal substring match on the "label" column only
# (the "aliases" column holds lists, so it is not searched by this string filter).
name_fragment = "Bush"

# Maximum number of matching rows to display.
max_results = 10

mask = df["label"].str.contains(name_fragment, regex=False, na=False)
df.loc[mask].head(max_results)

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
11351,[Vanevar Bush],[+1890-03-11T00:00:00Z],[Q30],[Q6581097],1392187290,,,"[Q82594, Q205375, Q82955, Q1622272, Q81096]",,,Q299595,Vannevar Bush,,item,[Q55004488]
12327,[Prescott Sheldon Bush],[+1895-05-15T00:00:00Z],[Q30],[Q6581097],1393428512,[Q7435494],B001167,"[Q82955, Q806798, Q4416090]",[Q29468],,Q324742,Prescott Bush,,item,[Q682443]
16876,,[+1955-12-24T00:00:00Z],[Q30],[Q6581097],1388311017,[Q49085],,"[Q10800557, Q10798782]",,,Q452552,Grand L. Bush,,item,
17154,,[+1984-06-25T00:00:00Z],[Q30],[Q6581072],1392040371,,,"[Q4610556, Q33999, Q3501317]",,,Q456169,Lauren Bush,,item,
20670,,[+1970-09-20T00:00:00Z],[Q30],[Q6581072],1391193823,[Q49085],,"[Q33999, Q10800557, Q10798782]",,,Q536025,N'Bushe Wright,,item,
44750,"[Marvin Bush, Marvin Pierce Bush]",[+1956-10-22T00:00:00Z],[Q30],[Q6581097],1311310452,,,[Q43845],,,Q1375345,Marvin P. Bush,,item,
44772,[Samuel Prescott Bush],[+1863-10-04T00:00:00Z],[Q30],[Q6581097],1373387625,,,"[Q43845, Q131524]",,,Q1376227,Samuel P. Bush,,item,
45548,,[+1869-01-20T00:00:00Z],[Q30],[Q6581097],1323621663,,,"[Q1622272, Q2504617]",,,Q1394511,Albert Bushnell Johnson,,item,
52576,,[+1921-12-16T00:00:00Z],[Q183],[Q6581097],1329921163,,,"[Q9385011, Q1622272]",,,Q1592275,Karl-August Bushe,,item,


## Summary : available fields and infos 
- Each person is identified by an id (column `id`) that should be unique (TODO: check for duplicates); it is the person's Wikidata QID.
- Label contains one specific label for this person, as it appears in Wikipedia's URL. 
- Aliases contain the most used names to refer to this person. The field can be empty if there is no name other than the one given in the label. 
- The other fields are either empty or contain references to QID items. Mostly, these references are created when a link to a referenced Wikipedia element is added to a person's Wikipedia page. 
- Occupation QIDs can be relevant for us, e.g. Q82955: Politician, Q189290: Military officer, Q39631: Physician (medicine), Q30461: President... 
- The political party is also given, if any. For congresspersons, their corresponding ID is also given. 
- No time distinction is available: e.g. if a person changed their political party (or any other attribute available here) during their life, both political parties will be listed. 

## Issues : missing data
- Some data are clearly missing above. George W. Bush may not be the smartest person alive, but he still holds a couple of academic degrees. For some people the degree is simply given as text (and not as a hyperlink) on the Wikipedia page, but that is not the case for all. George's biography clearly shows a Bachelor of Arts as a hyperlink, yet we cannot find it in the data set (maybe a timing issue?). 

In [105]:
# We don't really care about people born before 1900, so we discard them.
# Or do we? Can dead people influence elections?
# In a real project we may want to convert the field to a datetime and filter further to keep only living people.

# Convert the array of date strings to an integer year. We keep only the year, for those born in 1900 or later.
def filter_date_1900(x, min_year=1900):
    """Extract the birth year from a raw ``date_of_birth`` cell, or return None.

    Parameters
    ----------
    x : sequence of str, int, float or None
        Raw cell value. It is expected to be a list/array of date strings such
        as ``"+1952-03-11T00:00:00Z"``; only the first entry is used. A plain
        number is treated as an already extracted year, so re-applying the
        function to a converted column does not raise.
    min_year : int, default 1900
        Earliest birth year to keep (inclusive).

    Returns
    -------
    int or None
        The year when it is a positive ("+" prefixed) date with
        ``year >= min_year``; None for missing or empty input, for dates that
        start with "-" and for years before ``min_year``.
    """
    if x is None:
        return None

    # Already-converted value (e.g. the cell was run twice): keep the same filter.
    if isinstance(x, (int, float)):
        if x != x or x < min_year:  # x != x is True only for NaN
            return None
        return int(x)

    # An empty list/array carries no date at all.
    if len(x) == 0:
        return None

    first = x[0]
    # We don't care about missing entries or dates before year 0 ("-" as first character).
    if not first or first[0] != "+":
        return None

    # The year is everything between the sign and the first "-" separator,
    # so years that are not exactly four digits long are parsed correctly too.
    year = int(first[1:].split("-", 1)[0])
    return year if year >= min_year else None

# Replace every raw date list by its birth year (None when the person is filtered out).
birth_years = df["date_of_birth"].apply(filter_date_1900)
df["date_of_birth"] = birth_years

# Drop the rows left without a year. Note that this also discards the few people
# whose date of birth is simply missing, even though they may have been born recently.
df = df.dropna(subset=["date_of_birth"])
df.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",1952.0,[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",1946.0,[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
5,"[Augusto Pinochet Ugarte, Augusto José Ramón P...",1915.0,[Q298],[Q6581097],1392242213,,,"[Q189290, Q82955]",[Q327591],,Q368,Augusto Pinochet,,item,[Q1841]
8,"[Neil Percival Young, Shakey, Godfather of Gru...",1945.0,"[Q16, Q30]",[Q6581097],1395459626,,,"[Q177220, Q488205, Q2526255, Q639669, Q1881462...",,,Q633,Neil Young,,item,
9,,1969.0,[Q183],[Q6581097],1340253739,,,"[Q33231, Q41546637]",,,Q640,Harald Krichel,,item,


In [106]:
# Proportion of people with no academic degree recorded (absent or missing).
# Count rows explicitly: DataFrame.size is rows x columns, so it is a number of
# cells, not a number of people.
no_academic_found = int(df["academic_degree"].isnull().sum())
print("The proportion of people with no academic degree found is", no_academic_found/len(df))

The proportion of people with no academic degree found is 0.981334501646023


In [107]:
# Proportion of people with no occupation recorded.
# Count rows explicitly: DataFrame.size is rows x columns, so it is a number of
# cells, not a number of people.
no_occupation_found = int(df["occupation"].isnull().sum())
print("The proportion of people with no occupation found is", no_occupation_found/len(df))

The proportion of people with no occupation found is 0.16614569278172453


There is clearly an issue with the academic degrees, so we may not want to use this field. In terms of occupation, "only" about 17% are missing. Here is a small example of such cases. 

In [108]:
# Show a few people for whom no occupation is recorded.
occupation_missing = df["occupation"].isnull()
df.loc[occupation_missing].head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
72,,1919.0,,[Q6581097],1390158852,,,,,,Q4291,Ante Bilobrk,,item,
545,,1958.0,,[Q6581097],1309589591,[Q127885],,,,,Q29830,Vukašin Šoškoćanin,,item,
604,"[Greg DePalma, Gregory J. DePalma]",1932.0,,[Q6581097],1309589720,,,,,,Q33371,Gregory DePalma,,item,
1030,,1924.0,,[Q6581097],1390159368,,,,,,Q53595,Pranas Brazinskas,,item,
2289,,1922.0,[Q15180],[Q6581097],1327065855,,,,[Q79854],,Q73926,Oleg Goncharenko,,item,
