# Setup


In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm

DATA_PATH = "data/"
DIST_PATH = "dist/"

# Convert speeches in each Congress into a CSV with added metadata


## Convert hein-daily `.txt` files to a dataframe


In [2]:
def convert_hein_txt_to_df(input_file_path):
    """Load a pipe-delimited text file into a pandas DataFrame.

    The file is decoded as ISO-8859-1, and rows the parser cannot handle
    are dropped rather than raising.

    Args:
        input_file_path: Path of the `.txt` file to read.

    Returns:
        The parsed DataFrame, or None if reading failed for any reason
        (the error is printed in that case).
    """
    read_options = {
        'sep': '|',
        'encoding': 'ISO-8859-1',
        'on_bad_lines': 'skip',
    }

    try:
        return pd.read_csv(input_file_path, **read_options)
    except Exception as e:
        # Best-effort reader: report the problem and let the caller decide.
        print("Error reading the file:", e)

    return None

## Create a dataframe of file paths for all Congresses and their metadata


In [6]:
# Index the input files: for each input directory, map the Congress number
# found in a filename to that filename, then line the three directories up
# side by side in one dataframe.
directories = ['speeches', 'speaker-maps', 'descriptions']
file_paths_dict = {dir_name: {} for dir_name in directories}

for dir_name in directories:
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting mapping reproducible between runs.
    for filename in sorted(os.listdir(os.path.join(DATA_PATH, dir_name))):
        # The first run of digits in the filename is used as the Congress number
        match = re.search(r'\d+', filename)
        if match is None:
            # Entries without any digits (e.g. hidden/system files) cannot
            # be assigned to a Congress; calling .group() on None would raise.
            continue
        congress_number = match.group()
        file_paths_dict[dir_name][congress_number] = filename

# One row per Congress number, one column per input directory.
file_paths_df = pd.DataFrame(file_paths_dict)

## Create a dataframe for each Congress and save to disk


In [8]:
# For every Congress, merge its speeches with the speaker map and the speech
# descriptions on `speech_id` and write the result as one CSV file.
individual_congresses_path = DIST_PATH + "individual_congresses/"

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(individual_congresses_path, exist_ok=True)

# iterrows() has no length, so pass the total explicitly to get a real
# progress bar instead of a bare iteration counter.
for index, row in tqdm(file_paths_df.iterrows(), total=len(file_paths_df)):
    # A Congress missing from one of the input directories has NaN in that
    # column; concatenating NaN to a path string would raise TypeError.
    if row.isna().any():
        tqdm.write(
            f"Skipping Congress {index}: missing input file(s) "
            f"({', '.join(row.index[row.isna()])})")
        continue

    speeches_df = convert_hein_txt_to_df(
        DATA_PATH + "speeches/" + row['speeches'])
    speaker_maps_df = convert_hein_txt_to_df(
        DATA_PATH + "speaker-maps/" + row['speaker-maps'])
    descriptions_df = convert_hein_txt_to_df(
        DATA_PATH + "descriptions/" + row['descriptions'])

    # convert_hein_txt_to_df returns None (after printing the error) when a
    # file cannot be read; skip this Congress rather than crash on None.
    if speeches_df is None or speaker_maps_df is None or descriptions_df is None:
        tqdm.write(
            f"Skipping Congress {index}: one or more input files could not be read")
        continue

    # Normalise the speaker-map column names to snake_case.
    speaker_maps_df = speaker_maps_df.rename(
        columns={'speakerid': 'speaker_id',
                 'lastname': 'last_name',
                 'firstname': 'first_name',
                 'nonvoting': 'non_voting'})

    descriptions_df = descriptions_df.drop(
        ['first_name', 'last_name', 'state', 'gender', 'chamber'], axis=1)  # Drop redundant columns

    # Left joins keep every speech even when no speaker/description row matches.
    congress_df = speeches_df.merge(
        speaker_maps_df, on='speech_id', how='left')
    congress_df = congress_df.merge(
        descriptions_df, on='speech_id', how='left')

    # Output name reuses the token after the first underscore of the
    # speeches filename, e.g. "<prefix>_<token>.txt" -> "congress_<token>.csv".
    congress_file_name = "congress_" + \
        row['speeches'].split('.')[0].split('_')[1] + ".csv"

    congress_df.to_csv(individual_congresses_path +
                       congress_file_name, index=False)

18it [01:27,  4.86s/it]


## Create a combined dataframe for the entire Congressional Record


In [8]:
# Concatenate every per-Congress CSV into a single dataframe, tagging each
# row with the Congress token taken from its source filename, and save it.
individual_congresses_path = DIST_PATH + "individual_congresses/"

# os.listdir returns entries in arbitrary order and may include entries that
# are not per-Congress CSVs; sort for a reproducible row order and keep only
# `.csv` files so nothing else is handed to pd.read_csv.
file_paths = [individual_congresses_path + file_name
              for file_name in sorted(os.listdir(individual_congresses_path))
              if file_name.endswith('.csv')]

congressional_record_df = pd.concat(
    [
        pd.read_csv(file_path).assign(
            # "…/congress_<token>.csv" -> "<token>"
            congress=file_path.split('/')[-1].split('_')[-1].split('.')[0]
        )
        for file_path in file_paths
    ]
)

congressional_record_df.to_csv(
    DIST_PATH + "congressional_record.csv", index=False)