# Analyse all House of Commons speeches since 1970

[Part 1: Get a list of MPs and their affiliations](MP_speeches-Part1.ipynb)

## Part 2: Download all speeches belonging to MPs in list

[Part 3: Train bigram and trigram models and use them on all speeches](MP_speeches-Part3.ipynb)

[Part 4: Train an LDA topic model and process all speeches with it](MP_speeches-Part4.ipynb)

[Part 5: Analyse the results of the LDA model](MP_speeches-Part5.ipynb)

In [2]:
import pandas as pd

In [3]:
# Read the MP table produced in Part 1 (stored under the "mps" key of the HDF5 file)
mps = pd.read_hdf("list_of_mps.h5", key="mps")

#### Use TheyWorkForYou's API to download all the speeches of a particular MP

In [3]:
def get_mp_speeches(mp_id):
    """Download all Commons speeches of one MP and save them to speeches/mp-<mp_id>.csv.

    Pages through TheyWorkForYou's ``getDebates`` API (1000 results per
    request) until a page comes back with no rows. Each returned row becomes
    one record (one speech at a particular date and time) in a long-format
    data frame, which is then written to a CSV file.

    Parameters
    ----------
    mp_id
        TheyWorkForYou person id of the MP.

    Returns
    -------
    bool
        Always ``True``; the speeches themselves are written to disk.

    Raises
    ------
    requests.RequestException
        If a request fails, times out or returns an HTTP error status.
    KeyError
        If a response does not contain the expected fields.

    Notes
    -----
    An MP without any speeches yields an empty data frame, so the CSV that is
    written contains no data rows.
    """
    import os

    # Store TheyWorkForYou API key in separate config file
    from config import TWFY_API_KEY
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup

    records = []
    page_no = 1
    while True:
        # Let requests build the query string: the previous hand-formatted URL
        # used a backslash continuation inside the string literal, which
        # embedded the source indentation in front of the "type" parameter.
        response = requests.get(
            "https://www.theyworkforyou.com/api/getDebates",
            params={
                "key": TWFY_API_KEY,
                "type": "commons",
                "person": mp_id,
                "results_per_page": 1000,
                "num": 1000,
                "page": page_no,
                "output": "js",
            },
            # Without a timeout a stalled connection would block this worker forever
            timeout=120,
        )
        response.raise_for_status()
        rows = response.json()["rows"]
        # An empty page means we have walked past the last page of results
        if not rows:
            break

        for row in rows:
            records.append({
                'speech_id': row["gid"],
                'speech_url': row["listurl"],
                'mp_name': row["speaker"]["name"],
                'mp_constituency': row["speaker"]["constituency"],
                'mp_party': row["speaker"]["party"],
                'mp_id': row["person_id"],
                'date': pd.to_datetime(row["hdate"], format="%Y-%m-%d"),
                'time': row["htime"],
                'section_id': row["section_id"],
                'subsection_id': row["subsection_id"],
                'debate_title': row["parent"]["body"],
                # The API returns the speech as HTML; keep only its text
                'body': BeautifulSoup(row["body"], "html5lib").get_text()
            })
        page_no += 1

    # Build the frame once instead of concatenating page by page
    all_speeches = pd.DataFrame(records)

    print("Got speeches for MP {0}".format(mp_id))
    # Write to a new csv file specifically for this mp
    os.makedirs("speeches", exist_ok=True)
    all_speeches.to_csv("speeches/mp-{0}.csv".format(mp_id), index=False)
    return True

#### Run the above function in parallel for all MPs in the list that do not have a speeches file yet
This will take a while (~15 mins, depending on your internet connection)

In [24]:
%%time
## Download all MP speeches if DOWNLOAD_SPEECHES is set to True
## This can take a while
DOWNLOAD_SPEECHES = False
if DOWNLOAD_SPEECHES:
    import glob
    import os
    from multiprocessing import Pool

    # Figure out which MPs we still need to download: every MP in the list
    # that has no speeches/mp-<id>.csv file yet.
    # os.path handles the platform's path separator; a set makes the
    # membership test below O(1) per MP.
    downloaded_mps = {
        int(os.path.splitext(os.path.basename(path))[0].split("-")[1])
        for path in glob.glob("./speeches/mp-*.csv")
    }
    mps_to_download = [mp for mp in mps.index if mp not in downloaded_mps]

    # Number of worker processes to use to fetch
    # (multiprocessing.Pool starts processes, not threads)
    NUM_THREADS = 16
    # Make list of mp ids
    list_of_mp_ids = mps_to_download
    #list_of_mp_ids = list(mps.query("exists==False")["Person ID"])[:10]

    # Parallelise downloading of MP speeches: pool.map downloads mp by mp.
    # The with-block shuts the workers down even if a download raises.
    with Pool(NUM_THREADS) as pool:
        results = pool.map(get_mp_speeches, list_of_mp_ids)

    # Remove the empty mp files. A speeches file written from an empty data
    # frame holds nothing but a line terminator, so anything that small
    # cannot contain a header or data.
    for path in glob.glob("./speeches/mp-*.csv"):
        if os.path.getsize(path) <= len(os.linesep):
            os.remove(path)

Got speeches for MP 10002
Got speeches for MP 13825
Got speeches for MP 13893
Got speeches for MP 25604
Got speeches for MP 11734
Got speeches for MP 13836
Got speeches for MP 13904
Got speeches for MP 25154
Got speeches for MP 17908
Got speeches for MP 25636
Got speeches for MP 10913
Got speeches for MP 10858
Got speeches for MP 10387
CPU times: user 432 ms, sys: 724 ms, total: 1.16 s
Wall time: 2.79 s
