In [1]:
import pandas as pd
import numpy as np
import os

folder_path = "lda_senate_output"
df_list = []

In [2]:
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        
        # read the CSV file into df
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        
        # For verification
        print(df.shape)
        
        # add df to list
        df_list.append(df)

# concatenate df
combined_df = pd.concat(df_list, ignore_index=True)

(165512, 9)
(185281, 9)
(163693, 9)
(182203, 9)
(165848, 9)
(182020, 9)
(164825, 9)
(165125, 9)
(124861, 9)
(162956, 9)


In [3]:
combined_df

Unnamed: 0,date,amount,honoree_name,Payee Name,Contribution Type,contributor_name,state,country,Registrant Name
0,2013-01-23,1000.0,Mark Begich,MARK BEGICH FOR SENATE\n(ACT BLUE),feca,"PAVEL, MARY",DC,US,"SONOSKY, CHAMBERS, SACHSE, ENDRESON & PERRY, LLP"
1,2013-01-24,1000.0,Max Baucus,MONTANANS FOR MAX,feca,"PAVEL, MARY",DC,US,"SONOSKY, CHAMBERS, SACHSE, ENDRESON & PERRY, LLP"
2,2013-01-15,250.0,Ben Lujan,PEOPLE FOR BEN LUJAN,feca,"HONG, JOCELYN",DC,US,"TWENTY-FIRST CENTURY GROUP, INC."
3,2013-03-05,50.0,Jeanne Shaheen,JEANNE SHAHEEN FOR SENATE,feca,"OLIVE, DAVID M",DC,US,CATALYST PARTNERS LLC
4,2013-02-15,500.0,Tim Griffin,TIM GRIFFIN FOR CONGRESS,feca,"OLIVE, DAVID M",DC,US,CATALYST PARTNERS LLC
...,...,...,...,...,...,...,...,...,...
1662319,2022-09-23,500.0,Sen. Patty Murray,JONATHAN CORDONE,feca,SELF,DC,US,CORDONE CONSULTING LLC
1662320,2022-09-29,2900.0,Sen. Raphael Warnock,JONATHAN CORDONE,feca,SELF,DC,US,CORDONE CONSULTING LLC
1662321,2022-09-13,500.0,Rep. Chris Pappas,JONATHAN CORDONE,feca,SELF,DC,US,CORDONE CONSULTING LLC
1662322,2022-09-23,500.0,Sen. Patty Murray,JONATHAN CORDONE,feca,SELF,DC,US,CORDONE CONSULTING LLC


In [4]:
# How many unique honoree_name
print(len(df["honoree_name"].value_counts()))

21294


In [6]:
# Remove hyphens (assuming a hyphen does not mean a negative value which would not make sense)
combined_df["amount"] = combined_df["amount"].abs()

In [8]:
# Group by Honoree Name and sum the amount contributed
amount_by_honoree = combined_df.groupby("honoree_name")["amount"].sum()
amount_by_honoree = amount_by_honoree.to_frame().reset_index()
amount_by_honoree.sort_values("amount",ascending=True, inplace=True)

# Export
amount_by_honoree.to_csv("../amount_by_honoree.csv",index=False)

amount_by_honoree

Unnamed: 0,honoree_name,amount
96084,Tim Griffin (Check originally issued on 11/1/2...,0.00
50741,NATAPAC,0.00
102768,"William Doyle, Commissioner, Federal Maritime ...",0.00
32,"""Challenges and Opportunities in an Interconne...",0.00
21089,Daniel Fellenbaum,0.00
...,...,...
52560,,21752413.65
51120,NRSC,23434483.20
51783,National Republican Senatorial Committee,27269798.99
51701,National Republican Congressional Committee,28152132.85


In [9]:
# Group by Registrant and sum the amount contributed
amount_by_registrant = combined_df.groupby("Registrant Name")["amount"].sum()
amount_by_registrant = amount_by_registrant.to_frame().reset_index()
amount_by_registrant.sort_values("amount",ascending=True, inplace=True)

# Export
amount_by_registrant.to_csv("../amount_by_registrant.csv",index=False)

amount_by_registrant

Unnamed: 0,Registrant Name,amount
2581,LAFAYETTE GROUP,0.00
2027,"HADASSAH, THE WOMEN'S ZIONIST ORGANIZATION OF ...",1.00
2456,JOHNSON FOOD AND BEVERAGE CONSULTING,4.00
3076,MR. IAN FERGUSON,5.00
5191,WESTERN ORGANIZATION OF RESOURCE COUNCILS,5.00
...,...,...
761,BOEING COMPANY,37859963.38
2322,INTERNATIONAL UNION OF OPERATING ENGINEERS,41619196.09
304,AMERICAN FEDERATION OF TEACHERS,42991177.27
572,"AT&T SERVICES, INC. AND ITS AFFILIATES",45298888.28


In [11]:
# Export just the dataframe itself rather than the groupby
combined_df.to_csv("../contributions_2013-2022.csv",index=False)