I used the parquet file with the QIDs (loaded below) to extract all rows for American politicians. This notebook therefore creates a dataframe of about 22,000 politicians with information such as gender, ethnicity, party, academic degree, etc. I used the label column of this dataframe, together with the new keywords Naomie added, to build a dataframe for each year from the quotes files. I put them in a Google Drive because some of them are too big for GitHub (they can probably be filtered a lot more). I figured that this way we could do more general analyses with a lot more quotes. I also created politicians2.csv, which contains some more politicians labeled by their state. For the state analyses, we can merge it with df15, df16, etc.

The Google Drive contains:
* american_politicians.csv : American politicians with their QID information (the dataframe in this notebook)

* df15.csv, df16.csv, df17.csv, df18.csv, df19.csv, df20.csv: dataframes with relevant quotes from all American politicians in the parquet file

* politicians2.csv: an updated politicians file with politicians that I could find who were labeled by state


In [2]:
import numpy as np
import pandas as pd

In [1]:
# Load the speaker attributes table from the local parquet file and preview it.
speaker_attributes_path = '../Downloads/speaker_attributes.parquet'
parquetdf = pd.read_parquet(speaker_attributes_path)
parquetdf

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9055976,[Barker Howard],,[Q30],[Q6581097],1397399351,,,[Q82955],,,Q106406560,Barker B. Howard,,item,
9055977,[Charles Macomber],,[Q30],[Q6581097],1397399471,,,[Q82955],,,Q106406571,Charles H. Macomber,,item,
9055978,,[+1848-04-01T00:00:00Z],,[Q6581072],1397399751,,,,,,Q106406588,Dina David,,item,
9055979,,[+1899-03-18T00:00:00Z],,[Q6581072],1397399799,,,,,,Q106406593,Irma Dexinger,,item,


In [4]:
# Extract all rows with American politicians
US_NATIONALITY_QID = 'Q30'   # 'Q30' corresponds to American nationality
POLITICIAN_QID = 'Q82955'    # 'Q82955' corresponds to politicians

# Keep only rows where both columns are present, so the membership tests
# below never receive a missing value.
parquetdf2 = parquetdf[parquetdf.occupation.notna() & parquetdf.nationality.notna()]

# Test each column on its own with Series.map rather than a row-wise
# DataFrame.apply(axis=1): no per-row Series has to be built for the millions
# of rows in the table, and the result is always a boolean Series (row-wise
# apply hands back a DataFrame when the frame is empty).
try1 = parquetdf2['nationality'].map(lambda qids: US_NATIONALITY_QID in qids).astype(bool)
try2 = parquetdf2['occupation'].map(lambda qids: POLITICIAN_QID in qids).astype(bool)

# Row-wise AND of the two masks: American nationality and politician occupation.
try3 = try1 & try2

In [5]:
# Keep only the rows selected by the combined nationality/occupation mask,
# then renumber them from 0 so the old row labels are discarded.
selected_rows = parquetdf2.loc[try3]
americanpoliticians = selected_rows.reset_index(drop=True)
americanpoliticians

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
2,"[Elisha Meredith, Elisha Edward Meredith]",[+1848-12-26T00:00:00Z],[Q30],[Q6581097],1395914283,,M000647,"[Q82955, Q40348]",[Q29552],,Q3251,Elisha E. Meredith,,item,
3,"[Willard Mitt Romney, Pierre Delecto]",[+1947-03-12T00:00:00Z],[Q30],[Q6581097],1393565531,,R000615,"[Q82955, Q15978655, Q43845, Q15980158, Q219477]",[Q29468],"[Q1765120, Q191701, Q1540185]",Q4496,Mitt Romney,"[Q937607, Q4226, Q4791860, Q17100322]",item,[Q42504]
4,"[Richard Milhous Nixon, Nixon, President Nixon...",[+1913-01-09T00:00:00Z],[Q30],[Q6581097],1393886022,,N000116,"[Q82955, Q189290, Q40348, Q18814623, Q372436]",[Q29468],,Q9588,Richard Nixon,"[Q7891408, Q693742, Q699590, Q644161, Q1701660...",item,[Q170208]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55774,[Leonard Gaskill],,[Q30],[Q6581097],1397399268,,,[Q82955],,,Q106406546,Leonard T. Gaskill,,item,
55775,[Andrew Healy],,[Q30],[Q6581097],1397399333,,,[Q82955],,,Q106406557,Andrew F. Healy,,item,
55776,[Barker Howard],,[Q30],[Q6581097],1397399351,,,[Q82955],,,Q106406560,Barker B. Howard,,item,
55777,[Charles Macomber],,[Q30],[Q6581097],1397399471,,,[Q82955],,,Q106406571,Charles H. Macomber,,item,


In [6]:
# Get rid of politicians who were born before 1935 -- step 1: extract birth years.
# Rows without a date_of_birth are left out of this helper frame; the rows that
# remain keep their index labels from americanpoliticians.
americansnonnull = americanpoliticians[americanpoliticians.date_of_birth.notna()]

# Entries look like ['+1946-07-06T00:00:00Z']: take the first date and slice
# characters 1-4 (skipping the leading sign) to get the year as a string.
# Series.map on the single column avoids building a Series per row, which is
# what DataFrame.apply(axis=1) does, and always returns a Series.
year = americansnonnull['date_of_birth'].map(lambda dates: dates[0][1:5])
year

0        1732
1        1946
2        1848
3        1947
4        1913
         ... 
55756    1820
55765    1926
55766    1835
55768    1847
55769    1836
Length: 51033, dtype: object

In [7]:
# Turn the year strings into integers so they can be compared numerically.
year = year.astype(int)

# Index labels of every politician whose birth year is earlier than 1935.
todelete = year.loc[year.lt(1935)].index

# Remove those rows. Politicians without a date_of_birth never entered `year`,
# so they are not dropped by this step.
americanpoliticians = americanpoliticians.drop(index=todelete)
americanpoliticians

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
1,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
3,"[Willard Mitt Romney, Pierre Delecto]",[+1947-03-12T00:00:00Z],[Q30],[Q6581097],1393565531,,R000615,"[Q82955, Q15978655, Q43845, Q15980158, Q219477]",[Q29468],"[Q1765120, Q191701, Q1540185]",Q4496,Mitt Romney,"[Q937607, Q4226, Q4791860, Q17100322]",item,[Q42504]
5,[Stephen Gerald Breyer],[+1938-08-15T00:00:00Z],[Q30],[Q6581097],1393110898,,,"[Q185351, Q16533, Q40348, Q1622272, Q82955]",[Q29552],,Q11124,Stephen Breyer,,item,[Q9268]
6,[David Alexander Paterson],[+1954-05-20T00:00:00Z],[Q30],[Q6581097],1392736069,[Q49085],,[Q82955],[Q29552],,Q11674,David Paterson,,item,
11,"[James Warren ""Jim"" DeMint, James Warren DeMint]",[+1951-09-02T00:00:00Z],[Q30],[Q6581097],1392634298,,D000595,"[Q82955, Q2961975]",[Q29468],,Q22201,Jim DeMint,,item,[Q178169]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55774,[Leonard Gaskill],,[Q30],[Q6581097],1397399268,,,[Q82955],,,Q106406546,Leonard T. Gaskill,,item,
55775,[Andrew Healy],,[Q30],[Q6581097],1397399333,,,[Q82955],,,Q106406557,Andrew F. Healy,,item,
55776,[Barker Howard],,[Q30],[Q6581097],1397399351,,,[Q82955],,,Q106406560,Barker B. Howard,,item,
55777,[Charles Macomber],,[Q30],[Q6581097],1397399471,,,[Q82955],,,Q106406571,Charles H. Macomber,,item,


In [8]:
# Renumber the rows from 0 after the earlier drop left gaps in the index;
# drop=True discards the old labels instead of keeping them as a column.
americanpoliticians = americanpoliticians.reset_index(drop=True)
# Show (rows, columns) of the resulting table.
americanpoliticians.shape

(21897, 15)

In [None]:
# Save the filtered politicians table as CSV. Default to_csv settings are used,
# so the row index is written as the first column of the file.
output_csv = "data/american_politicians.csv"
americanpoliticians.to_csv(output_csv)