In [35]:
# Notebook to clean county unemployment rate data.
# Reads the yearly laucnty<YY>.xlsx spreadsheets, merges them into one wide
# table keyed by county, and exports the result to county_ur.csv.
import pandas as pd
import numpy as np
import warnings
# Silences every warning for the rest of the session (no category filter is
# given), so pandas deprecation/FutureWarning messages will not be shown.
warnings.filterwarnings('ignore')

In [36]:
# Base table: the 2013 spreadsheet. Column headers were cleaned by hand and the
# citation footnote was removed from the workbook before importing it here.
# Only the identifier columns and the unemployment-rate column are kept.
unused_columns = ['Year', 'Unnamed: 5', 'Labor Force', 'Employed', 'Unemployed']
county_df = pd.read_excel('laucnty13.xlsx').drop(columns=unused_columns)

In [37]:
# Fold the 2014-2022 spreadsheets into county_df, one year at a time.
# Each yearly file is trimmed to the same columns as the base table and then
# outer-merged on the identifier columns, so a county that appears in any year
# is kept (years where it is absent become NaN).
# NOTE(review): the county name is part of the merge key, so a county whose
# name text differs between years would presumably end up on separate rows —
# confirm against the source files.
merge_keys = ["LAUS Code", "State Fips", "County Fips", "County Name/State Abbreviation"]
columns_to_drop = ['Year', 'Unnamed: 5', 'Labor Force', 'Employed', 'Unemployed']

for year in range(14, 23):  # two-digit file suffixes for 2014 through 2022
    yearly_df = pd.read_excel(f'laucnty{year}.xlsx').drop(columns=columns_to_drop)
    county_df = county_df.merge(yearly_df, on=merge_keys, how="outer")






In [38]:
# Build the combined FIPS identifier, place it in the 4th column, and shorten
# the yearly unemployment-rate column names.
#
# State and county codes are cast to int first so each value renders as a plain
# integer string; the county code is then left-padded with zeros to 3 digits.
# NOTE(review): the state code is NOT zero-padded, so a single-digit state code
# produces a 4-character FIPS (e.g. '1' + '001' -> '1001') — confirm that is
# what downstream consumers expect.
state_fips = county_df['State Fips'].astype(int).astype(str)
county_fips = county_df['County Fips'].astype(int).astype(str).str.zfill(3)
county_df['State Fips'] = state_fips
county_df['County Fips'] = county_fips
county_df['FIPS'] = state_fips + county_fips

# Move FIPS to the 4th column. FIPS is located by name rather than assumed to
# be the last column: on a re-run of this cell FIPS already sits in position 4,
# and a positional "move the last column" would relocate a data column instead.
column_order = [col for col in county_df.columns if col != 'FIPS']
column_order.insert(3, 'FIPS')
county_df = county_df[column_order]

# Rename "Unemployment Rate <year>" -> "UR_<year>" for 2013-2022 in one pass.
# Labels that are not present (e.g. already renamed) are left untouched.
county_df = county_df.rename(
    columns={f"Unemployment Rate {year}": f"UR_{year}" for year in range(2013, 2023)}
)



    


In [40]:
# Order rows by FIPS (counties that first appear in a later year are appended
# by the outer merges, so the table is not guaranteed to be in FIPS order).
# FIPS is cast to int for the sort so ordering is numeric, then cast back to
# a string. Afterwards the 'N.A.' placeholder is replaced with NaN so the
# unemployment-rate columns can be converted to float.
ur_columns = [f'UR_{year}' for year in range(2013, 2023)]

county_df = (
    county_df
    .assign(FIPS=lambda frame: frame['FIPS'].astype(int))
    .sort_values(by=['FIPS'])
    .assign(FIPS=lambda frame: frame['FIPS'].astype(str))
    .replace('N.A.', np.nan)
)
county_df[ur_columns] = county_df[ur_columns].astype(float)

In [41]:
# Write the cleaned table to disk; the row index is omitted from the file.
output_path = 'county_ur.csv'
county_df.to_csv(output_path, index=False)