In [106]:
# Notebook to subset senate_elections.csv to elections from 2016 onwards (relevant columns only), append the 2022 results from the MIT data, and merge the result with census demographic data
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [107]:
# Paths use forward slashes: "\s" is an invalid escape sequence in a normal string literal
# (a SyntaxWarning on recent Python versions) and a backslash is only a path separator on Windows.
# Forward slashes resolve to the same files on Windows and also work on POSIX systems.
senate_elections = pd.read_csv("Election Data/senate_elections.csv", low_memory=False)
# Keep elections from 2016 onwards, then only the columns used downstream.
senate_elections = senate_elections[senate_elections["election_year"] >= 2016]
senate_elections = senate_elections[["fips", "democratic_raw_votes", "dem_nominee", "republican_raw_votes", "rep_nominee", "raw_county_vote_totals", "election_year"]]

# MIT file supplying the 2022 results; it has one row per candidate rather than one row per county.
sen_mit = pd.read_csv("Election Data/senate_2022.csv", low_memory=False)
sen_mit = sen_mit[["year", "county_fips", "candidate", "party_simplified", "candidatevotes", "totalvotes"]]

In [108]:
#Give both sources the same key column names (FIPS, YEAR); the MIT party_simplified column becomes party
sen_mit = sen_mit.rename(
    {"county_fips": "FIPS", "year": "YEAR", "party_simplified": "party"},
    axis="columns",
)
senate_elections = senate_elections.rename(
    {"fips": "FIPS", "election_year": "YEAR"},
    axis="columns",
)

In [109]:
#Add raw_county_vote_totals to sen_mit: the sum of candidatevotes over all rows sharing a FIPS and YEAR,
#repeated on every row of that group
sen_mit = sen_mit.assign(
    raw_county_vote_totals=sen_mit.groupby(["FIPS", "YEAR"])["candidatevotes"].transform("sum")
)


In [110]:
#Reshape sen_mit so each county-year is a row: one column for the democratic candidate, one for the
#republican candidate, one column for each candidate's votes, and one column for total votes
sen_mit_dem = sen_mit[sen_mit["party"] == "DEMOCRAT"]
sen_mit_rep = sen_mit[sen_mit["party"] == "REPUBLICAN"]

sen_mit_dem = sen_mit_dem.rename(columns={"candidate": "dem_nominee", "candidatevotes": "democratic_raw_votes"})
sen_mit_rep = sen_mit_rep.rename(columns={"candidate": "rep_nominee", "candidatevotes": "republican_raw_votes"})

sen_mit_dem = sen_mit_dem[["FIPS", "dem_nominee", "democratic_raw_votes", "YEAR"]]

#collapse multiple rows for the same candidate in the same county-year into a single vote count
sen_mit_dem = sen_mit_dem.groupby(["FIPS", "YEAR", "dem_nominee"]).agg(democratic_raw_votes=("democratic_raw_votes", "sum")).reset_index()
sen_mit_rep = sen_mit_rep.groupby(["FIPS", "YEAR", "rep_nominee"]).agg(republican_raw_votes=("republican_raw_votes", "sum"), raw_county_vote_totals=("raw_county_vote_totals", "max")).reset_index()

#county totals taken from every row regardless of party, so a county-year still gets its total
#when one of the two parties has no candidate there (the value is constant within a FIPS/YEAR group)
county_totals = sen_mit.groupby(["FIPS", "YEAR"]).agg(raw_county_vote_totals=("raw_county_vote_totals", "max")).reset_index()

#merge sen_mit_dem and sen_mit_rep on FIPS and YEAR. An outer join keeps county-years that have only
#one of the two parties (the missing side is NaN); an inner join would silently drop them.
sen_mit = pd.merge(
    sen_mit_dem,
    sen_mit_rep.drop(columns=["raw_county_vote_totals"]),
    on=["FIPS", "YEAR"],
    how="outer",
).merge(county_totals, on=["FIPS", "YEAR"], how="left")

#append the reshaped MIT rows to the earlier elections; ignore_index=True already yields a fresh 0..n-1 index
senate_elections = pd.concat([senate_elections, sen_mit], ignore_index=True)

In [111]:
#Read the county-level long table and outer-join it to the election rows on FIPS and YEAR,
#so rows present in only one of the two tables are kept
county_long = pd.read_csv("county_final_long.csv", low_memory=False)
sen_df = senate_elections.merge(county_long, how="outer", on=["FIPS", "YEAR"])


In [112]:
#Put the identifying columns STNAME, CTYNAME, FIPS, YEAR first; all other columns keep their relative order
leading_cols = ['STNAME', 'CTYNAME', 'FIPS', 'YEAR']
cols_order = leading_cols + [name for name in sen_df.columns if name not in leading_cols]

# Select the columns in the new order
sen_df = sen_df.loc[:, cols_order]


In [113]:
#Order rows by year, then state name, then county name (ascending)
sen_df = sen_df.sort_values(["YEAR", "STNAME", "CTYNAME"], ascending=True)

In [114]:
#write the long-format table to disk without the row index
sen_df.to_csv(path_or_buf="senate_county_long.csv", index=False)

In [None]:
#Create wide version of senate data with extra columns for analysis

#make sure YEAR is an integer so the year-window and equality filters below compare exactly
sen_df['YEAR'] = sen_df['YEAR'].astype(int)

#election outcome columns: kept on the y frames, removed from the x frames
cols = [ 'democratic_raw_votes',
 'dem_nominee',
 'republican_raw_votes',
 'rep_nominee',
 'raw_county_vote_totals']

cols_to_keep = ['FIPS', 'YEAR', 'STNAME', 'CTYNAME'] + cols

#one x/y pair is built per election year instead of repeating the same filter four times
election_years = [2016, 2018, 2020, 2022]

#x: all rows from the 2 years prior to each election through the election year (inclusive),
#with the outcome columns dropped
dfs_x = [
    sen_df[sen_df['YEAR'].between(year - 2, year)].drop(columns=cols)
    for year in election_years
]

#y: election-year rows only, restricted to identifiers and outcome columns, plus each party's
#share of the county total. assign() builds a new frame rather than writing into a slice of sen_df.
dfs_y = [
    sen_df.loc[sen_df['YEAR'] == year, cols_to_keep].assign(
        dem_pct=lambda d: d['democratic_raw_votes'] / d['raw_county_vote_totals'],
        rep_pct=lambda d: d['republican_raw_votes'] / d['raw_county_vote_totals'],
    )
    for year in election_years
]

#expose the per-election names used by the following cells
sen_2016_x, sen_2018_x, sen_2020_x, sen_2022_x = dfs_x
sen_2016_y, sen_2018_y, sen_2020_y, sen_2022_y = dfs_y


In [116]:
#Create function to pivot dataframes to have one row per county
def reshape_df(df, ref_year):
    """Pivot a long county-by-year frame into one row per county.

    Every column except the identifiers (STNAME, CTYNAME, FIPS, YEAR) and the
    election outcome columns is spread across the years, producing columns
    named ``<column>_year_<k>`` where ``k = ref_year - YEAR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing STNAME, CTYNAME, FIPS and YEAR columns.
        It is not modified.
    ref_year : int
        Year that the offsets are measured from; rows whose YEAR equals
        ``ref_year`` end up in the ``..._year_0`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (STNAME, CTYNAME, FIPS) group with columns ``index``,
        ``FIPS`` and the pivoted value columns. When a group has several rows
        for the same year, the first value is used.
    """
    index_cols = ['STNAME', 'CTYNAME', 'FIPS']
    exclude_cols = index_cols + ['YEAR', 'democratic_raw_votes',
    'dem_nominee',
    'republican_raw_votes',
    'rep_nominee',
    'raw_county_vote_totals']

    pivot_cols = [col for col in df.columns if col not in exclude_cols]

    # Map each year to its offset label relative to the reference year
    year_mapping = {year: f'year_{ref_year-year}' for year in df['YEAR'].unique()}

    # Relabel YEAR on a new frame. Writing the labels back into `df` would change the
    # caller's data, and a second call on the same frame would then fail on
    # `ref_year - 'year_0'`.
    labelled = df.assign(YEAR=df['YEAR'].map(year_mapping))

    # Pivot the dataframe using pivot_table
    df_pivot = labelled.pivot_table(index=index_cols, columns='YEAR', values=pivot_cols, aggfunc='first')

    # Flatten the (column, year label) MultiIndex into single names
    df_pivot.columns = ['_'.join(col).strip() for col in df_pivot.columns.values]

    # Bring the identifiers back as columns, then drop STNAME and CTYNAME
    df_pivot = df_pivot.reset_index().drop(columns=['STNAME', 'CTYNAME'])

    # The second reset_index adds a positional 'index' column; it is kept because
    # existing callers drop that column by name after merging.
    return df_pivot.reset_index()



In [117]:
# Pivot each election's x frame relative to its own election year
ref_years = [2016, 2018, 2020, 2022]
x_frames = [sen_2016_x, sen_2018_x, sen_2020_x, sen_2022_x]
y_frames = [sen_2016_y, sen_2018_y, sen_2020_y, sen_2022_y]

x_transformed = [reshape_df(x_frame, ref_year) for x_frame, ref_year in zip(x_frames, ref_years)]
sen_2016_x_transformed, sen_2018_x_transformed, sen_2020_x_transformed, sen_2022_x_transformed = x_transformed

#Join each y frame to its pivoted x frame on FIPS (inner join), then remove the
#'index' column that comes back from reshape_df
merged = [
    pd.merge(y_frame, x_wide, on='FIPS').drop(columns=['index'])
    for y_frame, x_wide in zip(y_frames, x_transformed)
]
sen_2016, sen_2018, sen_2020, sen_2022 = merged

#stack the four elections into one dataframe; rows with missing values are kept (no dropna is applied)
sen_df = pd.concat(merged, axis=0)

#write to csv
sen_df.to_csv('senate_county_wide.csv', index=False)