## Task 2 


### Task 2.1
Get list of US politicians with political affiliation. 

2 sources:
- https://github.com/casmlab/politicians-tweets 
- https://www.congress.gov/members?q={%22congress%22:[%22110%22,%22111%22,%22112%22,%22113%22,%22114%22,%22115%22,%22116%22,117]}

Take 1st list, keep politicians whose affiliation is known. Then merge with congress list to be sure we have a good dataset.
List of politicians is in `data/ressources/politicians.json`.

In [4]:
import os 
import json
import time
import csv
import bz2

# from tqdm.contrib.concurrent import process_map
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import pandas as pd
from multiprocesspandas import applyparallel
import numpy as np

# tqdm for pandas
tqdm.pandas()

In [5]:
# Config part
DATA_PATH = "data"
PKL_PATH = os.path.join(DATA_PATH, "pkl")
RESOURCES_PATH = os.path.join(DATA_PATH, "resources")

In [6]:
def load_df(
    file_name: str, mode: str = "pandas", save: bool = True, chunksize: int = 1_000_000
) -> pd.DataFrame:
    """
    Load a dataset in DataFrame from a .json.bz2 archive.

    file_name: str
        Name of .json.bz2 archive to load from `DATA_PATH`.

    mode: str = "pandas" | "bz2"
        Either use pandas read_json function or homemade bz2 function. This is usually faster (but makes my computer crash for some reason).

    save: bool
        Save the dataframe as a pickle file in `PKL_PATH`.
    """

    file_path = os.path.join(DATA_PATH, file_name)
    pkl_path = os.path.join(PKL_PATH, f"{file_name}.pkl")

    if mode == "bz2":
        keys = ["quoteID", "quotation", "speaker", "date", "numOccurrences", "phase"]

        with bz2.open(file_path, "rb") as quote_file:
            df = pd.DataFrame(
                [
                    dict(zip(keys, map(json.loads(instance).get, keys)))
                    for instance in tqdm(quote_file)
                ]
            )
    else:
        df_lst = []
        with pd.read_json(file_path, lines=True, chunksize=chunksize) as df_reader:
            for i, chunk in enumerate(df_reader):
                df_lst.append(chunk)

        df = pd.concat(df_lst)

    if save and not os.path.exists(pkl_path):
        file_name = file_name.strip(".json.bz2")
        df.to_pickle(os.path.join(PKL_PATH, pkl_path))

    return df

In [5]:
def to_csv(file_name: str, pol_lst: list) -> None:
    """
    Write list to csv.
    """

    csv_path = os.path.join("data", "resources", file_name)

    with open(csv_path, "w") as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(["Name", "Party"])

        for member in pol_lst:
            writer.writerow([el for el in member])

#### Github dataset

In [10]:
file_name = "politicians_github.json"
file_path = os.path.join("data", "resources", file_name)

with open(file_path, "r") as f:
    json = json.load(f)

In [11]:
json.keys()

dict_keys(['id', 'id_str', 'screen_name', 'confirmed_account_type', 'state', 'twitter_name', 'real_name', 'bioguide', 'office_holder', 'party', 'district', 'level', 'woman', 'birthday', 'last_updated'])

In [12]:
# Only keep politicians with political affiliation
politicians = []

for i in tqdm(range(1, len(json["id"]))):
    i = str(i)
    affiliation = json["party"][i]
    screen_name = json["screen_name"][i]
    elected = json["office_holder"][i] is not None

    if affiliation is not None and affiliation in ("Republican", "Democratic"):
        politicians.append((json["real_name"][i], affiliation, elected))
    elif screen_name == "realdonaldtrump":
        politicians.append(("Donald Trump", "Republican", True))
    elif screen_name == "barackobama":
        politicians.append(("Barack Obama", "Democratic", True))


100%|██████████| 9979/9979 [00:00<00:00, 311395.34it/s]


In [13]:
# Count how many politicians are "elected" (-> congress members)
sum(pol[-1] for pol in politicians)

1107

All politicians are in Congress!

In [14]:
# Sanity check
print(f"{len(politicians)=}") 
politicians[:10]

len(politicians)=1107


[('Mark Green', 'Republican', True),
 ('Pete Stauber', 'Republican', True),
 ('Derek Kilmer', 'Democratic', True),
 ('Andy Harris', 'Republican', True),
 ('Donald Payne', 'Democratic', True),
 ('A. Ferguson', 'Republican', True),
 ('Richard Hudson', 'Republican', True),
 ('Edward Markey', 'Democratic', True),
 ('Bobby Rush', 'Democratic', True),
 ('Gregory Meeks', 'Democratic', True)]

In [31]:
# Write to file
to_csv("politicians_github.csv", politicians)

#### US Congress dataset

In [15]:
URL = 'https://www.congress.gov/members?q={"congress":["110","111","112","113","114","115","116",117]}&pageSize=250'

In [20]:
def sanitize_name(name: str) -> str:
    """
    Strip and clean name.
    "Senator Cruz, Ted" -> "Ted Cruz"
    """

    for element in ("Representative", "Senator"):
        name = name.strip(element)

    name = " ".join(name.split(",")[::-1])
    name = name.strip()
    
    return name

In [18]:
congress_members = []

# Download each congress page
with requests.Session() as s:
    for page_number in tqdm(range(1, 6)):
        r  = s.get(URL, params={"page": page_number})
        soup = BeautifulSoup(r.text, "html.parser")

        members = soup.find_all("li", class_="compact")

        for member in members:
            # Scrape the information
            items = member.find_all("span", class_="result-item")
            name = sanitize_name(member.span.a.text)
            
            for item in items:
                if item.strong.text == "Party:":
                    affiliation = item.span.text

            congress_members.append((name, affiliation))

100%|██████████| 5/5 [00:20<00:00,  4.07s/it]


In [19]:
# Sanity check
len(congress_members) == 1158

True

In [32]:
# Write to file
to_csv("politicians_congress.csv", politicians)

#### Compare lists

Actually might not be useful (and less of a headache) to just take the congress list, since all politicians from the github list are elected (meaning they are or were congress members).

### Task 2.2 - Extract quotes from politicians

I downloaded another list, from the congress [website](https://bioguide.congress.gov/search?index=%22bioguideprofiles%22&size=12&matches=[]&filters={%22jobPositions.congressAffiliation.partyAffiliation.party.name%22:[%22Democrat%22,%22Republican%22],%22jobPositions.congressAffiliation.congress.name%22:[%22The%20110th%20United%20States%20Congress%22,%22The%20111th%20United%20States%20Congress%22,%22The%20112th%20United%20States%20Congress%22,%22The%20113th%20United%20States%20Congress%22,%22The%20114th%20United%20States%20Congress%22,%22The%20115th%20United%20States%20Congress%22,%22The%20116th%20United%20States%20Congress%22,%22The%20117th%20United%20States%20Congress%22]}&sort=[{%22_score%22:true},{%22field%22:%22familyName%22,%22order%22:%22asc%22},{%22field%22:%22middleName%22,%22order%22:%22asc%22},{%22field%22:%22givenName%22,%22order%22:%22asc%22}])


Those are the politicians from 2007 to 2009. The json is called `data/resources/congress_biolist.json`. What is nice is that we have the "congress bio ID" of each congress member, which is also present in the `speaker_attributes.parquet` file (field `US_congress_bio_ID`). We will use that to extract the tweets from the politicians.

Testing with quotes from 2008.

In [7]:
df2 = load_df("quotes-2008.json.bz2", chunksize=100_000)

KeyboardInterrupt: 

In [2]:
len(df2)

NameError: name 'df2' is not defined

In [6]:
politicians_filepath = os.path.join(RESOURCES_PATH, "congress_biolist.json")
quotes_filepath = os.path.join(DATA_PATH, "quotes-2008.json.bz2")
# keys = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'phase']

politicians_df = pd.read_json(politicians_filepath).drop("congresses", axis=1)
quotes_df = pd.read_pickle(os.path.join(PKL_PATH, "quotes-2008.pkl"))

In [21]:
data_lst = []
with pd.read_json(quotes_filepath, lines=True, chunksize=2000) as df_reader:
    for i, chunk in enumerate(df_reader):
        data_lst.append(chunk)
        if i == 1:
            break

In [28]:
data_df = pd.concat(data_lst)
len(data_df)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   quoteID         4000 non-null   object        
 1   quotation       4000 non-null   object        
 2   speaker         4000 non-null   object        
 3   qids            4000 non-null   object        
 4   date            4000 non-null   datetime64[ns]
 5   numOccurrences  4000 non-null   int64         
 6   probas          4000 non-null   object        
 7   urls            4000 non-null   object        
 8   phase           4000 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 281.4+ KB


In [7]:
quotes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4641330 entries, 0 to 4641329
Data columns (total 6 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   quoteID         object
 1   quotation       object
 2   speaker         object
 3   date            object
 4   numOccurrences  int64 
 5   phase           object
dtypes: int64(1), object(5)
memory usage: 212.5+ MB


In [4]:
# Just taking a subsample of the quotes dataset to check if everything is working in a timely manner
quotes_df = quotes_df.sample(int(np.floor(0.05 * len(quotes_df))))
len(quotes_df)

232066

In [5]:
# Sanity check
quotes_df.head(5)

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase
4043163,2008-11-07-048453,there are 670 helicopters in new zealand and 8...,john sinclair,2008-11-07 00:57:33,1,A
1610076,2008-12-02-024980,"if they fail to do so, there is nowhere else t...",sonny perdue,2008-12-02 17:40:16,3,A
3420914,2008-09-04-060729,"ugh, he's just writing whatever he feels like",dave eggers,2008-09-04 01:38:30,2,A
926354,2008-09-10-074668,with a mission to help individuals objectively...,david fleming,2008-09-10 12:56:41,1,A
2875163,2008-09-29-060554,we are conducting our own testing and will nee...,daniel ellis,2008-09-29 19:34:00,6,A


In [6]:
# Sanity check
politicians_df.head(5)

Unnamed: 0,id,givenName,familyName,unaccentedGivenName,unaccentedFamilyName,birthYear,deathYear,middleName,unaccentedMiddleName,nickName,honorificPrefix,honorificSuffix
0,A000014,Neil,Abercrombie,Neil,Abercrombie,1938,,,,,,
1,A000374,Ralph,Abraham,Ralph,Abraham,1954,,,,,,
2,A000022,Gary,Ackerman,Gary,Ackerman,1942,,Leonard,Leonard,,,
3,A000370,Alma,Adams,Alma,Adams,1946,,,,,,
4,A000366,Sandra,Adams,Sandra,Adams,1956,,,,Sandy,,


In [7]:
politicians_df["fullName"] = politicians_df["givenName"] + " " + politicians_df["familyName"]
politicians_df["fullName"] = politicians_df["fullName"].str.lower()
names = politicians_df["fullName"].tolist()


In [18]:
# about 55min on my shitty laptop (for the whole 4millions dataset)
# 10% subset takes about 5min
quotes_df["subset"] = quotes_df["speaker"].progress_apply(lambda x: pd.Series(x).isin(names))
quotes_df["subset"].sum()

100%|██████████| 46413/46413 [00:35<00:00, 1304.15it/s]


In [24]:
# about 1h20min on my shitty laptop
# 10% subset takes about 5min
# -> preferred method if we want to check for "republicans" or "democrats", slower otherwise
quotes_df["subset"] = quotes_df["speaker"].progress_apply(lambda x: pd.Series(x).str.contains("|".join(names), case=False))
quotes_df["subset"].sum()

100%|██████████| 46413/46413 [00:49<00:00, 943.37it/s] 


In [13]:
congress_members = politicians_df["fullName"].tolist()  # redefined here for clarity

def extract_subset(orig_df: pd.DataFrame, multiproc=False) -> pd.DataFrame:
    """
    This function extracts the quotes of speakers that are in the congress list.

    It returns the number of extracted quotes and the extracted dataframe.
    """
    if multiproc:
        orig_df["subset"] = orig_df["speaker"].apply_parallel(lambda x: pd.Series(x).isin(names))
    else:
        orig_df["subset"] = orig_df["speaker"].progress_apply(lambda x: pd.Series(x).isin(names))

    return orig_df["subset"].sum(), orig_df[orig_df["subset"] == True]
    

In [15]:
# Without multiprocessing
# 3m05s on 5% of the 2008 dataset (~230k quotes)
subset_count, subset_2008 = extract_subset(quotes_df)

100%|██████████| 232066/232066 [03:06<00:00, 1245.68it/s]


In [16]:
# Testing multiprocessing for extraction
# 50s on 5% of the 2008 dataset (~230k quotes)
subset_count, subset_2008 = extract_subset(quotes_df, multiproc=True)

In [35]:
# Goal is to do a pipeline to automatically extract quotes for a quotes dataset
# The datasets were already loaded from the json.bz2 format and converted to .pkl in `data/pkl`

# Get the names
quotes_datasets = [os.path.join("data", "pkl", f"quotes-20{i:02d}.pkl") for i in range(8, 21)]  

# For each dataset, extract the quotes from congress members
# and save the extracted quotes as pkl for easier handling
for i, dataset in enumerate(quotes_datasets, start=1):
    print(f"{i}/{len(quotes_datasets)} {dataset}:")
    try:
        complete_df = pd.read_pickle(dataset)  # Load dataset
    except FileNotFoundError:
        print(f"{dataset} not found, loading from .json.bz2")
        complete_df = load_df(dataset)

    _, subset_df = extract_subset(complete_df)
    subset_df.to_pickle(os.path.join("data", "pkl", f"extracted_{dataset}"))

['data/pkl/quotes-2008.pkl',
 'data/pkl/quotes-2009.pkl',
 'data/pkl/quotes-2010.pkl',
 'data/pkl/quotes-2011.pkl',
 'data/pkl/quotes-2012.pkl',
 'data/pkl/quotes-2013.pkl',
 'data/pkl/quotes-2014.pkl',
 'data/pkl/quotes-2015.pkl',
 'data/pkl/quotes-2016.pkl',
 'data/pkl/quotes-2017.pkl',
 'data/pkl/quotes-2018.pkl',
 'data/pkl/quotes-2019.pkl',
 'data/pkl/quotes-2020.pkl']

In [41]:
quotes_datasets = [os.path.join("data", "pkl", f"quotes-20{i:02d}.pkl") for i in range(8, 21)]  

for i, dataset in enumerate(quotes_datasets, start=1):
    print(f"{i}/{len(quotes_datasets)} {dataset}:")
    try:
        print("Reading from pkl")
        complete_df = pd.read_pickle(dataset)  # Load dataset
    except FileNotFoundError:
        print(f"{dataset} not found, loading from .json.bz2")
        complete_df = load_df(dataset)

1/13 data/pkl/quotes-2008.pkl:
2/13 data/pkl/quotes-2009.pkl:
data/pkl/quotes-2009.pkl not found, loading from .json.bz2


NameError: name 'DATA_PATH' is not defined