## Task 2 


### Task 2.1
Get list of US politicians with political affiliation. 

2 sources:
- https://github.com/casmlab/politicians-tweets 
- https://www.congress.gov/members?q={%22congress%22:[%22110%22,%22111%22,%22112%22,%22113%22,%22114%22,%22115%22,%22116%22,117]}

Take 1st list, keep politicians whose affiliation is known. Then merge with congress list to be sure we have a good dataset.
The list of politicians is stored in `data/resources/politicians.json`.

In [1]:
import bz2
import csv
import json
import os
import re
import time

# from tqdm.contrib.concurrent import process_map
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from pandarallel import pandarallel
# Set up pandarallel with its progress bar enabled; the multiprocess
# extraction path relies on the `parallel_apply` accessor.
pandarallel.initialize(progress_bar=True)

# tqdm for pandas: registers the `progress_apply` accessor used for the
# single-process extraction path.
tqdm.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Config part
# All paths are relative to the directory the notebook is run from.
DATA_PATH = "data"  # root folder holding the .json.bz2 archives
PKL_PATH = os.path.join(DATA_PATH, "pkl")  # pickled DataFrames (full, chunked and extracted)
RESOURCES_PATH = os.path.join(DATA_PATH, "resources")  # politician lists (json / csv)

In [3]:
def load_df(
    file_name: str, mode: str = "pandas", save: bool = True, chunksize: int = 1_000_000
) -> pd.DataFrame:
    """
    Load a dataset from a .json.bz2 archive located in `DATA_PATH`.

    file_name: str
        Name of the .json.bz2 archive to load from `DATA_PATH`.

    mode: str = "pandas" | "bz2"
        "bz2": decompress and parse the archive line by line, keep only the
        fields listed in `keys` and return everything as a single DataFrame
        (the whole dataset is held in memory at once).
        Any other value ("pandas"): stream the archive with `pd.read_json` in
        chunks of `chunksize` lines and pickle each chunk as
        `<name>-<index>.pkl` in `PKL_PATH`. Nothing is returned in this mode.

    save: bool
        Save the dataframe as a pickle file in `PKL_PATH`. Mandatory in
        "pandas" mode, where the pickles are the only output.

    chunksize: int
        Number of lines per chunk in "pandas" mode.

    Returns the loaded DataFrame in "bz2" mode and None in "pandas" mode.
    """

    file_path = os.path.join(DATA_PATH, file_name)

    # Remove the extension as a suffix. `str.strip(".json.bz2")` strips any of
    # the characters ".jsonbz2" from both ends instead, which would turn
    # "quotes-2012" into "quotes-201".
    suffix = ".json.bz2"
    base_name = file_name[: -len(suffix)] if file_name.endswith(suffix) else file_name

    if mode != "bz2":
        if not save:
            print("Please enable save option.")
            return

        os.makedirs(PKL_PATH, exist_ok=True)

        with pd.read_json(file_path, lines=True, chunksize=chunksize) as df_reader:
            for i, chunk in enumerate(df_reader):
                chunk.to_pickle(os.path.join(PKL_PATH, f"{base_name}-{i:03d}.pkl"))

                # NOTE(review): only the first two chunks are written; this
                # looks like a leftover debugging limit. Remove the break to
                # convert the whole archive.
                if i == 1:
                    break
        return

    keys = ["quoteID", "quotation", "speaker", "date", "numOccurrences", "phase"]

    records = []
    with bz2.open(file_path, "rb") as quote_file:
        for line in tqdm(quote_file):
            quote = json.loads(line)
            # Missing fields become None rather than raising KeyError.
            records.append({key: quote.get(key) for key in keys})
    df = pd.DataFrame(records)

    # `pkl_path` already contains `PKL_PATH`; it must not be joined to it again.
    pkl_path = os.path.join(PKL_PATH, f"{base_name}.pkl")

    if save and not os.path.exists(pkl_path):
        os.makedirs(PKL_PATH, exist_ok=True)
        df.to_pickle(pkl_path)

    return df

In [4]:
def to_csv(file_name: str, pol_lst: list) -> None:
    """
    Write a list of politicians to a space-delimited csv in `data/resources`.

    file_name: str
        Name of the csv file to create (overwritten if it already exists).

    pol_lst: list
        Iterable of rows; each row is an iterable of fields written under the
        "Name Party" header. Fields containing a space are quoted by the csv
        writer.
    """

    csv_dir = os.path.join("data", "resources")
    os.makedirs(csv_dir, exist_ok=True)
    csv_path = os.path.join(csv_dir, file_name)

    # newline="" lets the csv module control line endings (otherwise blank
    # lines appear between rows on Windows); utf-8 keeps accented names
    # independent of the platform's default encoding.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(["Name", "Party"])
        writer.writerows(list(member) for member in pol_lst)

In [5]:
def get_pkl_year(year: int) -> list:
    """
    Returns the pkl files present in `data/pkl/{year}`.

    The paths are relative to `PKL_PATH` (e.g. "2008/quotes-2008-000.pkl") and
    sorted by name, so chunks are always processed in the same order.
    Entries that are not `.pkl` files are ignored.
    """

    year_dir = str(year)
    entries = os.listdir(os.path.join(PKL_PATH, year_dir))

    # os.listdir returns entries in arbitrary order and includes everything in
    # the folder (hidden files, sub-folders, ...), hence the sort and filter.
    return [
        os.path.join(year_dir, entry)
        for entry in sorted(entries)
        if entry.endswith(".pkl")
    ]

#### Github dataset

In [10]:
file_name = "politicians_github.json"
file_path = os.path.join("data", "resources", file_name)

with open(file_path, "r") as f:
    json = json.load(f)

In [11]:
json.keys()

dict_keys(['id', 'id_str', 'screen_name', 'confirmed_account_type', 'state', 'twitter_name', 'real_name', 'bioguide', 'office_holder', 'party', 'district', 'level', 'woman', 'birthday', 'last_updated'])

In [12]:
# Only keep politicians with political affiliation
politicians = []

for i in tqdm(range(1, len(json["id"]))):
    i = str(i)
    affiliation = json["party"][i]
    screen_name = json["screen_name"][i]
    elected = json["office_holder"][i] is not None

    if affiliation is not None and affiliation in ("Republican", "Democratic"):
        politicians.append((json["real_name"][i], affiliation, elected))
    elif screen_name == "realdonaldtrump":
        politicians.append(("Donald Trump", "Republican", True))
    elif screen_name == "barackobama":
        politicians.append(("Barack Obama", "Democratic", True))


100%|██████████| 9979/9979 [00:00<00:00, 311395.34it/s]


In [13]:
# Number of kept politicians flagged as office holders (-> congress members);
# the flag is the third field of each tuple.
sum(elected for _name, _party, elected in politicians)

1107

All politicians are in Congress!

In [14]:
# Sanity check: total number of kept politicians, then a peek at the first
# ten (name, party, elected) tuples.
print(f"{len(politicians)=}") 
politicians[:10]

len(politicians)=1107


[('Mark Green', 'Republican', True),
 ('Pete Stauber', 'Republican', True),
 ('Derek Kilmer', 'Democratic', True),
 ('Andy Harris', 'Republican', True),
 ('Donald Payne', 'Democratic', True),
 ('A. Ferguson', 'Republican', True),
 ('Richard Hudson', 'Republican', True),
 ('Edward Markey', 'Democratic', True),
 ('Bobby Rush', 'Democratic', True),
 ('Gregory Meeks', 'Democratic', True)]

In [31]:
# Write to file
# `to_csv` writes a two-column "Name Party" header, so only those two fields
# are exported; the `elected` flag would end up in a third, unnamed column.
to_csv("politicians_github.csv", [(name, party) for name, party, _elected in politicians])

#### US Congress dataset

In [15]:
# congress.gov member search restricted to the 110th-117th Congresses, 250 results per page.
# NOTE(review): 117 is the only congress number that is not quoted in the query; confirm the site treats it like the others.
URL = 'https://www.congress.gov/members?q={"congress":["110","111","112","113","114","115","116",117]}&pageSize=250'

In [20]:
def sanitize_name(name: str) -> str:
    """
    Turn a congress listing name into "Given Family" order.

    The leading title ("Representative" or "Senator") is dropped, then the
    comma-separated parts are reordered as given name, family name and any
    remaining parts (e.g. a suffix):

    "Senator Cruz, Ted" -> "Ted Cruz"
    "Representative Payne, Donald M., Jr." -> "Donald M. Payne Jr."

    A name without a comma is returned as is, minus title and outer spaces.
    """

    name = name.strip()

    # Remove the title as a prefix. `str.strip("Representative")` strips any
    # of those characters from both ends, which mangles the name itself
    # ("Lee, Barbara" -> "Lee, Barb").
    for title in ("Representative", "Senator"):
        if name.startswith(title) and name[len(title):len(title) + 1].isspace():
            name = name[len(title):]
            break

    # Strip every part so that re-joining does not leave double spaces.
    parts = [part.strip() for part in name.split(",")]
    parts = [part for part in parts if part]

    if len(parts) < 2:
        return " ".join(parts)

    family_name, given_name, *suffixes = parts
    return " ".join([given_name, family_name, *suffixes])

In [18]:
congress_members = []

# Download each congress page
with requests.Session() as s:
    for page_number in tqdm(range(1, 6)):
        r = s.get(URL, params={"page": page_number}, timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page and
        # silently collecting no members from it.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        members = soup.find_all("li", class_="compact")

        for member in members:
            # Scrape the information
            items = member.find_all("span", class_="result-item")
            name = sanitize_name(member.span.a.text)

            # Reset for every member: otherwise a member without a "Party:"
            # entry silently inherits the party of the previous one.
            affiliation = None
            for item in items:
                if item.strong is not None and item.strong.text == "Party:":
                    affiliation = item.span.text

            congress_members.append((name, affiliation))

100%|██████████| 5/5 [00:20<00:00,  4.07s/it]


In [19]:
# Sanity check: the scrape is expected to yield 1158 members.
# NOTE(review): hard-coded count, presumably the total reported by the website
# when the scrape was run; confirm before relying on it.
len(congress_members) == 1158

True

In [32]:
# Write to file
# Export the scraped congress list (name, party); passing `politicians` here
# would write a second copy of the GitHub list under the congress file name.
to_csv("politicians_congress.csv", congress_members)

#### Compare lists

Comparing the two lists might not actually be useful: it is less of a headache to just take the congress list, since all politicians from the github list are elected (meaning they are or were congress members).

### Task 2.2 - Extract quotes from politicians

I downloaded another list, from the congress [website](https://bioguide.congress.gov/search?index=%22bioguideprofiles%22&size=12&matches=[]&filters={%22jobPositions.congressAffiliation.partyAffiliation.party.name%22:[%22Democrat%22,%22Republican%22],%22jobPositions.congressAffiliation.congress.name%22:[%22The%20110th%20United%20States%20Congress%22,%22The%20111th%20United%20States%20Congress%22,%22The%20112th%20United%20States%20Congress%22,%22The%20113th%20United%20States%20Congress%22,%22The%20114th%20United%20States%20Congress%22,%22The%20115th%20United%20States%20Congress%22,%22The%20116th%20United%20States%20Congress%22,%22The%20117th%20United%20States%20Congress%22]}&sort=[{%22_score%22:true},{%22field%22:%22familyName%22,%22order%22:%22asc%22},{%22field%22:%22middleName%22,%22order%22:%22asc%22},{%22field%22:%22givenName%22,%22order%22:%22asc%22}])


Those are the members of the 110th to 117th Congresses (2007 onwards). The json is called `data/resources/congress_biolist.json`. What is nice is that we have the "congress bio ID" of each congress member, which is also present in the `speaker_attributes.parquet` file (field `US_congress_bio_ID`). We will use that to extract the quotes from the politicians.

Testing with quotes from 2008.

In [8]:
# Load the congress bio list (one row per congress member) from the resources folder.
politicians_filepath = os.path.join(RESOURCES_PATH, "congress_biolist.json")
# quotes_filepath = os.path.join(DATA_PATH, "quotes-2008.json.bz2")
# keys = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'phase']

politicians_df = pd.read_json(politicians_filepath)
# politicians_df = politicians_df.drop("congresses", axis=1)
# quotes_df = pd.read_pickle(os.path.join(PKL_PATH, "quotes-2008.pkl"))

Problem: Donald Trump is not in the dataset (because he was only President, not senator or representative and thus, not a congress member). I will manually add him now.

In [4]:
# First pickled chunk of the 2020 quotes, used to inspect how Donald Trump appears as a speaker.
quotes_2020_00 = pd.read_pickle(os.path.join(PKL_PATH, "2020", "quotes-2020-000.pkl"))

In [8]:
# Distinct speaker strings that contain "Trump".
quotes_2020_00.loc[quotes_2020_00["speaker"].str.contains("Trump"), "speaker"].unique()

array(['Donald Trump', 'President Donald Trump', 'President Trump',
       'Melania Trump', 'Eric Trump', 'Ivanka Trump',
       'Donald Trump Jr. .', 'Donald Trump Jr', 'Donald Trump , Jr. .',
       'Donald J. Trump', 'President Donald J. Trump', 'Barron Trump',
       'president Donald Trump', 'Lara Trump', 'DONALD Trump',
       'Donald J Trump', 'PRESIDENT Donald Trump', 'president Trump',
       'Donald John Trump', 'PRESIDENT Trump'], dtype=object)

In [9]:
# Manually add Donald Trump
# Not so elegant trick to capture variations of the name
# Should be refactored to another solution (alias field?) when we have time

# One row per spelling found in the speaker column: the alias goes into the
# given-name fields, so that "<givenName> <familyName>" reproduces
# "Donald Trump", "President Trump" and "President Donald Trump".
trump_given_names = ("Donald", "President", "President Donald")

trump_records = [
    {
        "id": np.nan,  # no congress bio ID: he never sat in Congress
        "givenName": given_name,
        "familyName": "Trump",
        "unaccentedGivenName": given_name,
        "unaccentedFamilyName": "Trump",
        "birthYear": 1946,
        "deathYear": np.nan,
        "congresses": [
            {
                "position": "President",
                "congressNumber": np.nan,
                "stateName": np.nan,
                "parties": ["Republican"],
            }
        ],
        "middleName": "John",
        "unaccentedMiddleName": "John",
        "nickName": np.nan,
        "honorificPrefix": np.nan,
        "honorificSuffix": np.nan,
    }
    for given_name in trump_given_names
]

# Keep the individual records available under their previous names.
donald_json1, donald_json2, donald_json3 = trump_records

# `DataFrame.append` is deprecated (and removed in pandas 2.0); `pd.concat`
# produces the same frame.
politicians_df = pd.concat(
    [politicians_df, pd.DataFrame(trump_records)], ignore_index=True
)


In [11]:
# The three manually added Donald Trump rows should show up at the end of the frame.
politicians_df.tail(5)

Unnamed: 0,id,givenName,familyName,unaccentedGivenName,unaccentedFamilyName,birthYear,deathYear,congresses,middleName,unaccentedMiddleName,nickName,honorificPrefix,honorificSuffix
1153,Z000017,Lee,Zeldin,Lee,Zeldin,1980,,"[{'position': 'Representative', 'congressNumbe...",M,M,,,
1154,Z000018,Ryan,Zinke,Ryan,Zinke,1961,,"[{'position': 'Representative', 'congressNumbe...",,,,,
1155,,Donald,Trump,Donald,Trump,1946,,"[{'position': 'President', 'congressNumber': n...",John,John,,,
1156,,President,Trump,President,Trump,1946,,"[{'position': 'President', 'congressNumber': n...",John,John,,,
1157,,President Donald,Trump,President Donald,Trump,1946,,"[{'position': 'President', 'congressNumber': n...",John,John,,,


In [12]:
# Export new df to json
# Written under a new name, so the original congress_biolist.json is left untouched.
politicians_df.to_json(os.path.join(RESOURCES_PATH, "new_congress_biolist.json"))

In [9]:
# Lower-cased "<given name> <family name>" of every politician, used for
# case-insensitive matching against the quote speakers.
politicians_df["fullName"] = (
    politicians_df["givenName"] + " " + politicians_df["familyName"]
).str.lower()

# Plain list of those names (redefined here for clarity).
congress_members = politicians_df["fullName"].to_list()

def extract_subset(orig_df: pd.DataFrame, multiproc=False) -> tuple:
    """
    This function extracts the quotes of speakers that are in the congress list.

    A speaker matches when its lower-cased value contains one of the names of
    the module-level `congress_members` list as a substring. Missing
    (non-string) speakers never match.

    orig_df: pd.DataFrame
        Quotes with a "speaker" column. A boolean "subset" column flagging the
        matching rows is added to it in place.

    multiproc: bool
        Use pandarallel's `parallel_apply` instead of tqdm's `progress_apply`.

    It returns the number of extracted quotes and the extracted dataframe.
    """

    # Build the alternation once per call instead of once per row, and escape
    # the names so that characters such as "." are matched literally.
    names = [name for name in congress_members if isinstance(name, str) and name]
    pattern = re.compile("|".join(re.escape(name) for name in names)) if names else None

    def is_congress_member(speaker) -> bool:
        # Without any name to look for nothing matches (an empty pattern would
        # match every speaker).
        if pattern is None or not isinstance(speaker, str):
            return False
        return pattern.search(speaker.lower()) is not None

    speakers = orig_df["speaker"]
    apply = speakers.parallel_apply if multiproc else speakers.progress_apply
    orig_df["subset"] = apply(is_congress_member).astype(bool)

    return orig_df["subset"].sum(), orig_df[orig_df["subset"]]
    

In [10]:
# 10% subset
# Fixed seed so that the sample, and every count derived from it, is
# reproducible across runs.
quotes_2020_00 = quotes_2020_00.sample(int(0.1 * len(quotes_2020_00)), random_state=0)

In [12]:
# Testing the extraction on the 10% sample.
# This call uses the default single-process path (tqdm progress bar);
# pass `multiproc=True` to exercise the multiprocessing path.
subset_count, subset_2020 = extract_subset(quotes_2020_00)
subset_count

100%|██████████| 50000/50000 [00:47<00:00, 1061.10it/s]


In [16]:
# Number of extracted quotes per speaker, most quoted first.
subset_2020["speaker"].value_counts()

President Donald Trump    266
President Trump           125
Donald Trump               73
Elizabeth Warren           72
Mike Pompeo                65
                         ... 
John kerry                  1
Nikema Williams             1
Michael Fitzpatrick         1
Virginia Foxx               1
Vicky Hartzler              1
Name: speaker, Length: 344, dtype: int64

In [35]:
# Goal is to do a pipeline to automatically extract quotes for a quotes dataset
# The datasets were already loaded from the json.bz2 format and converted to .pkl in `data/pkl`

# Get the names
years = range(2008, 2021)
quotes_datasets = [os.path.join(PKL_PATH, f"quotes-{year}.pkl") for year in years]

# For each dataset, extract the quotes from congress members
# and save the extracted quotes as pkl for easier handling
for i, (year, dataset) in enumerate(zip(years, quotes_datasets), start=1):
    print(f"{i}/{len(quotes_datasets)} {dataset}:")
    try:
        complete_df = pd.read_pickle(dataset)  # Load dataset
    except FileNotFoundError:
        # `load_df` expects the archive name relative to `DATA_PATH`, not the
        # pickle path, and only returns a DataFrame in "bz2" mode.
        archive_name = f"quotes-{year}.json.bz2"
        print(f"{dataset} not found, loading from {archive_name}")
        complete_df = load_df(archive_name, mode="bz2", save=False)
        complete_df.to_pickle(dataset)  # cache it so the next run skips the archive

    _, subset_df = extract_subset(complete_df)
    # `dataset` already contains "data/pkl": build the output name from the
    # year instead of prefixing the whole path.
    subset_df.to_pickle(os.path.join(PKL_PATH, f"extracted-quotes-{year}.pkl"))

['data/pkl/quotes-2008.pkl',
 'data/pkl/quotes-2009.pkl',
 'data/pkl/quotes-2010.pkl',
 'data/pkl/quotes-2011.pkl',
 'data/pkl/quotes-2012.pkl',
 'data/pkl/quotes-2013.pkl',
 'data/pkl/quotes-2014.pkl',
 'data/pkl/quotes-2015.pkl',
 'data/pkl/quotes-2016.pkl',
 'data/pkl/quotes-2017.pkl',
 'data/pkl/quotes-2018.pkl',
 'data/pkl/quotes-2019.pkl',
 'data/pkl/quotes-2020.pkl']

In [28]:
year = 2008
files = get_pkl_year(year)

# Run the extraction on every pickled chunk of the year, keeping only the
# extracted frame of each (count, frame) result.
all_extracted = [
    extract_subset(pd.read_pickle(os.path.join(PKL_PATH, file)))[1]
    for file in files
]

# Stack the per-chunk extractions into one frame and pickle it
df_extracted = pd.concat(all_extracted)

pkl_name = f"extracted-quotes-{year}.pkl"
df_extracted.to_pickle(os.path.join(PKL_PATH, pkl_name))

100%|██████████| 10000/10000 [00:07<00:00, 1264.36it/s]
100%|██████████| 10000/10000 [00:07<00:00, 1307.20it/s]


In [10]:
# Previously extracted 2016 quotes, plus one raw 2016 chunk to test the
# multiprocess extraction on.
df2_extracted = pd.read_pickle(os.path.join(PKL_PATH, "extracted-quotes-2016.pkl"))
# NOTE(review): the other chunk files use a zero-padded three-digit index
# ("quotes-2020-000.pkl"); confirm that "quotes-2016-26.pkl" is the intended file.
df2 = pd.read_pickle(os.path.join(PKL_PATH, "2016", "quotes-2016-26.pkl"))
# Fixed seed so that the extracted count below is reproducible across runs.
df2 = df2.sample(100_000, random_state=0)

In [14]:
# Same extraction, this time through the pandarallel (multi-process) path.
length, df2_extracted2 = extract_subset(df2, multiproc=True)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

In [15]:
# Number of quotes of the sample attributed to a congress member.
length

1778