In [21]:
#Merge 3 county dataframes into one dataframe
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Combine county_dem, county_pop and county_ur into one wide table keyed on FIPS.
final_df_dem = pd.read_csv('county_dem.csv')
df_pop = pd.read_csv('county_pop.csv')
df_ur = pd.read_csv('county_ur.csv')

# Columns removed from pop and ur before the merge.
pop_cols_to_drop = ['STNAME', 'CTYNAME', 'REGION', 'DIVISION', 'STATE', 'COUNTY']
ur_cols_to_drop = ['LAUS Code', 'State Fips', 'County Fips', 'County Name/State Abbreviation']
df_pop = df_pop.drop(columns=pop_cols_to_drop)
df_ur = df_ur.drop(columns=ur_cols_to_drop)

# FIPS is handled as text from here on (prefix filter, set comparison, merge key),
# so cast it to str in all three tables.
for frame in (final_df_dem, df_pop, df_ur):
    frame['FIPS'] = frame['FIPS'].astype(str)

# Exclude Connecticut from pop and ur: its rows are identified by a FIPS string
# that begins with '9'.
pop_is_ct = df_pop['FIPS'].str.startswith('9')
ur_is_ct = df_ur['FIPS'].str.startswith('9')
df_pop = df_pop[~pop_is_ct]
df_ur = df_ur[~ur_is_ct]

# Report FIPS codes that do not line up between dem and each of the other tables.
fips_dem = set(final_df_dem['FIPS'])
fips_pop = set(df_pop['FIPS'])
fips_ur = set(df_ur['FIPS'])

missing_in_pop = fips_dem.difference(fips_pop)    # in dem, absent from pop
missing_in_dem = fips_pop.difference(fips_dem)    # in pop, absent from dem
missing_in_ur = fips_dem.difference(fips_ur)      # in dem, absent from ur
missing_in_dem2 = fips_ur.difference(fips_dem)    # in ur, absent from dem

for label, fips_diff in (
    ("FIPS values in dem dataframe but not in pop:", missing_in_pop),
    ("FIPS values in pop dataframe but not in dem:", missing_in_dem),
    ("FIPS values in dem dataframe but not in ur:", missing_in_ur),
    ("FIPS values in ur dataframe but not in dem:", missing_in_dem2),
):
    print(label, fips_diff)

# Inner joins on FIPS keep only the counties present in all three tables.
county_final = (
    final_df_dem
    .merge(df_pop, on='FIPS', how='inner')
    .merge(df_ur, on='FIPS', how='inner')
    .reset_index(drop=True)
)

# Export the wide table.
county_final.to_csv('county_final.csv', index=False)

#Now, also create long version of county_final

#Base measure names, date range, and id columns used for the wide-to-long reshape
base_cols = ['TOT_POP', 'TOT_MALE',
       'TOT_FEMALE', 'WA_TOTAL',
       'BA_TOTAL', 'IA_TOTAL', 'AA_TOTAL', 'NA_TOTAL', 'TOM_TOTAL', 'NH_TOTAL',
       'H_TOTAL', 'POPESTIMATE', 'NPOPCHG', 'BIRTHS', 'DEATHS', 'INTERNATIONALMIG', 'DOMESTICMIG', 'RBIRTH', 'RDEATH', 'RINTERNATIONALMIG', 'RDOMESTICMIG', 'UR']

years = list(range(2013, 2023))
id_vars_cols = ['FIPS', 'STNAME', 'CTYNAME']


def melt_wide_to_long(wide_df, base_cols, years, id_vars_cols):
    """Reshape a wide per-year table into one row per id combination and year.

    For every measure in ``base_cols`` and every year in ``years`` the source
    column is looked up as ``<measure>_<year>`` first and ``<measure><year>``
    second. The long table is assembled one year at a time, so all measures of
    a given row/year come from the same source row. (Melting each measure
    separately and gluing the pieces together by position misaligns values as
    soon as two measures cover different sets of years.)

    Args:
        wide_df: DataFrame holding the id columns and the per-year columns.
        base_cols: Measure names without the year suffix.
        years: Iterable of integer years to look for.
        id_vars_cols: Columns repeated on every output row; must include
            ``'FIPS'``, whose values must be convertible to ``int``.

    Returns:
        A DataFrame with columns ``id_vars_cols + ['YEAR'] + base_cols``,
        sorted numerically by FIPS and YEAR, with FIPS and YEAR stored as
        strings. A measure with no source column for a year is NaN on that
        year's rows; a year with no source column for any measure produces no
        rows. If nothing matches at all, an empty frame with the same columns
        is returned.
    """
    # Positional index so every per-year block shares the same row labels.
    wide = wide_df.reset_index(drop=True)
    present = set(wide.columns)

    year_blocks = []
    for year in years:
        # Map each measure to the column that carries it for this year.
        sources = {}
        for base_col in base_cols:
            for candidate in (f"{base_col}_{year}", f"{base_col}{year}"):
                if candidate in present:
                    sources[base_col] = candidate
                    break
        if not sources:
            continue

        block = {id_col: wide[id_col] for id_col in id_vars_cols}
        block['YEAR'] = year
        for base_col in base_cols:
            # Keep a stable schema: measures without data for this year are NaN.
            block[base_col] = wide[sources[base_col]] if base_col in sources else np.nan
        year_blocks.append(pd.DataFrame(block, index=wide.index))

    if not year_blocks:
        return pd.DataFrame(columns=list(id_vars_cols) + ['YEAR'] + list(base_cols))

    long_df = pd.concat(year_blocks, ignore_index=True)

    #sort by FIPS and year numerically (FIPS is text here), then store both as strings
    long_df['FIPS'] = long_df['FIPS'].astype(int)
    long_df['YEAR'] = long_df['YEAR'].astype(int)
    long_df = long_df.sort_values(by=['FIPS', 'YEAR']).reset_index(drop=True)
    long_df['FIPS'] = long_df['FIPS'].astype(str)
    long_df['YEAR'] = long_df['YEAR'].astype(str)
    return long_df


# Runs whenever this notebook/script is executed; skipped on import so the
# helper above can be reused without a `county_final` frame in scope.
if __name__ == "__main__":
    result_df = melt_wide_to_long(county_final, base_cols, years, id_vars_cols)

# Add a "<column>_CHANGE" percentage-change column for each measure listed below.
cols_to_compute = [
    'TOT_POP', 'TOT_MALE', 'TOT_FEMALE', 'WA_TOTAL', 'BA_TOTAL', 'IA_TOTAL',
    'AA_TOTAL', 'NA_TOTAL', 'TOM_TOTAL', 'NH_TOTAL', 'H_TOTAL', 'POPESTIMATE',
    'NPOPCHG', 'BIRTHS', 'DEATHS', 'INTERNATIONALMIG', 'DOMESTICMIG', 'RBIRTH',
    'RDEATH', 'RINTERNATIONALMIG', 'RDOMESTICMIG', 'UR',
]

# The change is computed row-to-row within each FIPS group and scaled to percent.
# Rows where pct_change yields NaN (such as the first row of a group) are set to 0.
by_fips = result_df.groupby('FIPS')
pct_change_cols = {
    f"{measure}_CHANGE": by_fips[measure].pct_change().fillna(0) * 100
    for measure in cols_to_compute
}
result_df = result_df.assign(**pct_change_cols)

# Export the long table.
county_final_long = result_df
county_final_long.to_csv('county_final_long.csv', index=False)

