# MPs data


In [1]:
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

tqdm.pandas()

In [2]:
def parse_person_json(json_dict):
    # Every entry has an ID
    person_id = json_dict['id']

    # Sometimes, an entry has no name information
    # In these cases, the entry just serves as a redirect to another entry
    if 'other_names' not in json_dict.keys():
        # Save redirect IDs to resolve later
        redirect_id = json_dict['redirect']
    else:
        redirect_id = None

    if 'other_names' in json_dict.keys():
        # Often people will have multiple name entries, stored as separate dicts
        # e.g. Tony Blair has an entry as 'Anthony Blair', another as 'Tony Blair', and one as 'The Prime Minister'
        # We want to collapse these down to one name dictionary
        name_dict = {
            k: v for name_dict in json_dict['other_names'] for k, v in name_dict.items()}
        if 'given_name' in name_dict.keys():  # Extract the first/given name
            first_name = name_dict['given_name']
        else:
            first_name = None
        if 'family_name' in name_dict.keys():  # And the surname/family name
            family_name = name_dict['family_name']
        else:
            family_name = None
    else:
        first_name, family_name = None, None

    return person_id, redirect_id, first_name, family_name

In [4]:
url = 'https://raw.githubusercontent.com/mysociety/parlparse/master/members/people.json'

response = requests.get(url)
response_json = response.json()

people_json_list = response_json['persons']

people_dict = {'person_id': [],
               'redirect_id': [],
               'first_name': [],
               'family_name': []}

for person_json in tqdm(people_json_list):
    person_id, redirect_id, first_name, family_name = parse_person_json(
        person_json)
    for variable in people_dict.keys():
        people_dict[variable].append(eval(variable))

people_df = pd.DataFrame(people_dict)
display(people_df)

100%|██████████| 14224/14224 [00:00<00:00, 48837.61it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name
0,uk.org.publicwhip/person/10001,,Diane,Abbott
1,uk.org.publicwhip/person/10002,,Gerry,Adams
2,uk.org.publicwhip/person/10003,,Irene,Adams
3,uk.org.publicwhip/person/10004,,Nick,Ainger
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth
...,...,...,...,...
14219,uk.org.publicwhip/person/26280,,Sue,
14220,uk.org.publicwhip/person/26281,,Richard,
14221,uk.org.publicwhip/person/26282,,Philip,
14222,uk.org.publicwhip/person/26283,,Ralph,


In [5]:
url = 'https://raw.githubusercontent.com/mysociety/parlparse/master/members/dates-of-birth.xml'

response = requests.get(url)
response_xml = response.content.decode()

bs = BeautifulSoup(response_xml)

dob_dict = {'person_id': [],
            'dob': []}

for info in bs.find_all('personinfo'):
    person_id = info['id']
    dob = pd.to_datetime(info['date_of_birth'])

    for variable in dob_dict.keys():
        dob_dict[variable].append(eval(variable))

dob_df = pd.DataFrame(dob_dict)
people_df.merge(dob_df, how='left')

Unnamed: 0,person_id,redirect_id,first_name,family_name,dob
0,uk.org.publicwhip/person/10001,,Diane,Abbott,1953-09-27
1,uk.org.publicwhip/person/10002,,Gerry,Adams,1948-10-06
2,uk.org.publicwhip/person/10003,,Irene,Adams,NaT
3,uk.org.publicwhip/person/10004,,Nick,Ainger,1949-10-24
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,1952-06-19
...,...,...,...,...,...
14219,uk.org.publicwhip/person/26280,,Sue,,NaT
14220,uk.org.publicwhip/person/26281,,Richard,,NaT
14221,uk.org.publicwhip/person/26282,,Philip,,NaT
14222,uk.org.publicwhip/person/26283,,Ralph,,NaT


In [6]:
# Also, in some editions of Hansard, MPs are referred to by their member_id rather than their person_id
# We want to therefore list all of the member_ids linked with each person_id
def person_id_2_member_id(person_id): return [
    entry['id'] for entry in response_json['memberships'] if 'person_id' in entry.keys() if entry['person_id'] == person_id]


people_df['memberships'] = people_df.person_id.progress_apply(
    person_id_2_member_id)
display(people_df)

100%|██████████| 14224/14224 [01:23<00:00, 169.68it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name,memberships
0,uk.org.publicwhip/person/10001,,Diane,Abbott,"[uk.org.publicwhip/member/2069, uk.org.publicw..."
1,uk.org.publicwhip/person/10002,,Gerry,Adams,"[uk.org.publicwhip/member/2196, uk.org.publicw..."
2,uk.org.publicwhip/person/10003,,Irene,Adams,"[uk.org.publicwhip/member/2201, uk.org.publicw..."
3,uk.org.publicwhip/person/10004,,Nick,Ainger,"[uk.org.publicwhip/member/2321, uk.org.publicw..."
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,"[uk.org.publicwhip/member/2323, uk.org.publicw..."
...,...,...,...,...,...
14219,uk.org.publicwhip/person/26280,,Sue,,[uk.org.publicwhip/lord/101744]
14220,uk.org.publicwhip/person/26281,,Richard,,[uk.org.publicwhip/lord/101746]
14221,uk.org.publicwhip/person/26282,,Philip,,[uk.org.publicwhip/lord/101748]
14222,uk.org.publicwhip/person/26283,,Ralph,,[uk.org.publicwhip/lord/101752]


In [7]:
# Then, we can use memberships to link people to post IDs
memberid2postid = {entry['id']: (entry['post_id'], entry['start_date'], entry['end_date']) for entry in response_json['memberships'] if all([
    key in entry.keys() for key in ['post_id', 'start_date', 'end_date']])}

# And the same JSON links post IDs to constituency names
mp_posts = [entry for entry in response_json['posts']
            if 'role' in entry.keys() if entry['role'] == 'Member of Parliament']
postid2constituency = {entry['id']: entry['area']['name']
                       for entry in mp_posts}

# So then we can lookup constituency names from member IDs
memberid2constituency = {member_id: ((start_date, end_date), postid2constituency[post_id]) for member_id, (
    post_id, start_date, end_date) in memberid2postid.items() if post_id in postid2constituency.keys()}


def member_id_2_constituency(memberships):
    member_ids_to_lookup = []
    for member_id in memberships:
        if member_id in memberid2constituency.keys():
            member_ids_to_lookup.append(member_id)

    constituencies = dict()
    for member_id in member_ids_to_lookup:
        dates, constituency = memberid2constituency[member_id]
        constituencies[dates] = constituency
    return constituencies


people_df['constituencies'] = people_df.memberships.progress_apply(
    member_id_2_constituency)
display(people_df)

100%|██████████| 14224/14224 [00:00<00:00, 478422.63it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name,memberships,constituencies
0,uk.org.publicwhip/person/10001,,Diane,Abbott,"[uk.org.publicwhip/member/2069, uk.org.publicw...","{('1987-06-11', '1992-03-16'): 'Hackney North ..."
1,uk.org.publicwhip/person/10002,,Gerry,Adams,"[uk.org.publicwhip/member/2196, uk.org.publicw...","{('1983-06-09', '1987-05-18'): 'Belfast West',..."
2,uk.org.publicwhip/person/10003,,Irene,Adams,"[uk.org.publicwhip/member/2201, uk.org.publicw...","{('1990-11-29', '1992-03-16'): 'Paisley North'..."
3,uk.org.publicwhip/person/10004,,Nick,Ainger,"[uk.org.publicwhip/member/2321, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Pembroke', ('1..."
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,"[uk.org.publicwhip/member/2323, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Coventry North..."
...,...,...,...,...,...,...
14219,uk.org.publicwhip/person/26280,,Sue,,[uk.org.publicwhip/lord/101744],{}
14220,uk.org.publicwhip/person/26281,,Richard,,[uk.org.publicwhip/lord/101746],{}
14221,uk.org.publicwhip/person/26282,,Philip,,[uk.org.publicwhip/lord/101748],{}
14222,uk.org.publicwhip/person/26283,,Ralph,,[uk.org.publicwhip/lord/101752],{}


In [8]:
# Similarly, we can get details on party affiliations across periods of time
memberid2partyid = {entry['id']: (entry['on_behalf_of_id'], entry['start_date'], entry['end_date']) for entry in response_json['memberships'] if all([
    key in entry.keys() for key in ['on_behalf_of_id', 'start_date', 'end_date']])}

# And the same JSON links post IDs to constituency names
parties = [entry for entry in response_json['organizations']
           if 'classification' in entry.keys() if entry['classification'] == 'party']
partyid2party = {entry['id']: entry['name'] for entry in parties}

# So then we can lookup constituency names from member IDs
memberid2party = {member_id: ((start_date, end_date), partyid2party[party_id]) for member_id, (
    party_id, start_date, end_date) in memberid2partyid.items() if party_id in partyid2party.keys()}


def member_id_2_party(memberships):
    member_ids_to_lookup = []
    for member_id in memberships:
        if member_id in memberid2party.keys():
            member_ids_to_lookup.append(member_id)

    parties = dict()
    for member_id in member_ids_to_lookup:
        dates, party = memberid2party[member_id]
        parties[dates] = party
    return parties


people_df['parties'] = people_df.memberships.progress_apply(member_id_2_party)
display(people_df)

100%|██████████| 14224/14224 [00:00<00:00, 825980.98it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name,memberships,constituencies,parties
0,uk.org.publicwhip/person/10001,,Diane,Abbott,"[uk.org.publicwhip/member/2069, uk.org.publicw...","{('1987-06-11', '1992-03-16'): 'Hackney North ...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
1,uk.org.publicwhip/person/10002,,Gerry,Adams,"[uk.org.publicwhip/member/2196, uk.org.publicw...","{('1983-06-09', '1987-05-18'): 'Belfast West',...","{('1997-05-01', '2001-05-14'): 'Sinn Féin', ('..."
2,uk.org.publicwhip/person/10003,,Irene,Adams,"[uk.org.publicwhip/member/2201, uk.org.publicw...","{('1990-11-29', '1992-03-16'): 'Paisley North'...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
3,uk.org.publicwhip/person/10004,,Nick,Ainger,"[uk.org.publicwhip/member/2321, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Pembroke', ('1...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,"[uk.org.publicwhip/member/2323, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Coventry North...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
...,...,...,...,...,...,...,...
14219,uk.org.publicwhip/person/26280,,Sue,,[uk.org.publicwhip/lord/101744],{},{}
14220,uk.org.publicwhip/person/26281,,Richard,,[uk.org.publicwhip/lord/101746],{},{}
14221,uk.org.publicwhip/person/26282,,Philip,,[uk.org.publicwhip/lord/101748],{},{}
14222,uk.org.publicwhip/person/26283,,Ralph,,[uk.org.publicwhip/lord/101752],{},{}


In [9]:
# Lastly, we want redirect IDs to contain the same details as entries to which they redirect
def redirect(row): return people_df[people_df.person_id ==
                                    row.redirect_id].iloc[0] if row.redirect_id is not None else row


people_df = people_df.apply(redirect, axis=1)

assert people_df.redirect_id.apply(
    lambda redirect_id: redirect_id is not None).sum() == 0, "Still redirects required!"

people_df = people_df.drop('redirect_id', axis=1)
people_df = people_df.drop_duplicates('person_id')

In [10]:
# We're missing party IDs for some Prime Ministers...
people_df.loc[10267, :].parties = {
    ('1853-08-22', '1903-08-22'): 'Conservative'}  # Gascoyne-Cecil
people_df.loc[3656, :].parties = {
    ('1874-01-31', '1922-05-05'): 'Conservative'}  # Balfour
people_df.loc[4357, :].parties = {
    ('1868-11-17', '1908-04-22'): 'Liberal'}  # Campbell-Bannerman
people_df.loc[3678, :].parties = {
    ('1886-07-01', '1924-10-09'): 'Liberal'}  # Asquith
people_df.loc[3676, :].parties = {
    ('1890-04-10', '1945-01-12'): 'Liberal'}  # Lloyd George
people_df.loc[8686, :].parties = {
    ('1900-10-01', '1923-10-30'): 'Conservative'}  # Bonar Law
people_df.loc[8729, :].parties = {
    ('1908-02-29', '1937-06-01'): 'Conservative'}  # Baldwin
people_df.loc[8690, :].parties = {
    ('1906-01-12', '1937-11-09'): 'Labour'}  # MacDonald
people_df.loc[9034, :].parties = {
    ('1918-12-14', '1940-11-09'): 'Conservative'}  # Chamberlain
people_df.loc[9628, :].parties = {
    ('1900-10-01', '1964-09-25'): 'Conservative'}  # Churchill
people_df.loc[4517, :].parties = {
    ('1922-11-15', '1955-12-16'): 'Labour'}  # Attlee
people_df.loc[5282, :].parties = {
    ('1923-12-06', '1957-01-11'): 'Conservative'}  # Eden
people_df.loc[6261, :].parties = {
    ('1924-10-29', '1964-09-25'): 'Conservative'}  # Macmillan
people_df.loc[5205, :].parties = {
    ('1931-10-27', '1974-09-20'): 'Conservative'}  # Douglas-Home
people_df.loc[9917, :].parties = {
    ('1945-07-05', '1983-05-13'): 'Labour'}  # Wilson
people_df.loc[269, :].parties = {
    ('1950-02-23', '2001-05-14'): 'Conservative'}  # Heath
people_df.loc[1688, :].parties = {
    ('1945-07-05', '1987-05-18'): 'Labour'}  # Callaghan
people_df.loc[1026, :].parties = {
    ('1959-10-08', '2013-04-08'): 'Conservative'}  # Thatcher
people_df.loc[407, :].parties = {
    ('1979-05-03', '2001-05-14'): 'Conservative'}  # Major
people_df.loc[46, :].parties = {
    ('1983-06-09', '2007-06-27'): 'Labour'}  # Blair
people_df.loc[67, :].parties = {
    ('1983-06-09', '2015-03-30'): 'Labour'}  # Brown
people_df.loc[689, :].parties = {
    ('2001-06-07', '2016-09-12'): 'Conservative'}  # Cameron
people_df.loc[425, :].parties = {
    ('1997-05-01', '2022-05-26'): 'Conservative'}  # May
people_df.loc[722, :].parties = {
    ('2001-06-07', '2022-05-26'): 'Conservative'}  # Johnson

In [11]:
people_df.to_csv('people.csv')