In [103]:

import glob
import os

import pandas as pd

In [104]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

In [105]:
# Collect every CSV in the working directory named df_*.csv, in sorted order.
files = glob.glob("df_*.csv")
files.sort()

In [106]:
# Fail fast with a clear message when the glob matched nothing; otherwise the
# problem only surfaces later as pd.concat's "No objects to concatenate".
if not files:
    raise FileNotFoundError(
        "No input files matching 'df_*.csv' were found in the current working directory"
    )
# Read every matched CSV into its own DataFrame, keeping the order of `files`.
dfs = [pd.read_csv(path) for path in files]

In [107]:
# Stack all per-file frames row-wise and renumber the index from 0.
df_final = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [108]:
# Discard the 'NAME' column (done before the column names are normalised).
df_final = df_final.drop(labels='NAME', axis='columns')


In [109]:
# Normalise column names: trim surrounding whitespace, lower-case,
# and replace spaces with underscores.
normalized_names = []
for raw_name in df_final.columns:
    normalized_names.append(raw_name.strip().lower().replace(' ', '_'))
df_final.columns = normalized_names

In [110]:
# Rename the 'state' column to 'state_name'.
df_final = df_final.rename({"state": "state_name"}, axis="columns")

In [111]:
# Columns that originate from another CSV and are not needed here.
# Any of them that are present get dropped; absent ones are skipped.
cols_to_remove = [
    # population / poverty
    'total_population',
    'population_below_poverty',
    'percent_below_poverty',
    # educational attainment
    'adults_with_high_school',
    'percent_adults_with_high_school',
    'adults_with_college_degree',
    'percent_adults_with_college_degree',
]
df_final = df_final.drop(columns=cols_to_remove, errors='ignore')


In [112]:
# Sanity check: print (rows, columns), then show the first five rows.
print(f"{df_final.shape}")
df_final.head(n=5)

(8622, 8)


Unnamed: 0,geo_id,state_name,county,year,adults(18-24)_with_high_school,percent_adults(18-24)_with_high_school,adults(18-24)_with_college_degree,total_median_earnings
0,0500000US01007,Alabama,Bibb,2015,681.0,33.1,25.0,29113
1,0500000US01009,Alabama,Blount,2015,1538.0,33.3,138.0,34549
2,0500000US01015,Alabama,Calhoun,2015,3362.0,28.3,570.0,28449
3,0500000US01017,Alabama,Chambers,2015,1234.0,42.6,114.0,28983
4,0500000US01019,Alabama,Cherokee,2015,739.0,38.3,107.0,30545


In [113]:
# Keep only rows whose state_name is one of the thirteen states listed below,
# then renumber the index from 0.
states_to_keep = [
    "New York",
    "Pennsylvania",
    "Maryland",
    "West Virginia",
    "Virginia",
    "Kentucky",
    "Ohio",
    "Tennessee",
    "North Carolina",
    "South Carolina",
    "Georgia",
    "Alabama",
    "Mississippi",
]
in_selected_state = df_final['state_name'].isin(states_to_keep)
df_final = df_final.loc[in_selected_state].reset_index(drop=True)


In [114]:
# Sanity check after the state filter: print (rows, columns), then preview.
print(f"{df_final.shape}")
df_final.head(n=5)

(6336, 8)


Unnamed: 0,geo_id,state_name,county,year,adults(18-24)_with_high_school,percent_adults(18-24)_with_high_school,adults(18-24)_with_college_degree,total_median_earnings
0,0500000US01007,Alabama,Bibb,2015,681.0,33.1,25.0,29113
1,0500000US01009,Alabama,Blount,2015,1538.0,33.3,138.0,34549
2,0500000US01015,Alabama,Calhoun,2015,3362.0,28.3,570.0,28449
3,0500000US01017,Alabama,Chambers,2015,1234.0,42.6,114.0,28983
4,0500000US01019,Alabama,Cherokee,2015,739.0,38.3,107.0,30545


In [115]:
# Read the Appalachian counties file.
# The location can be overridden with the APP_COUNTIES_CSV environment variable
# so the notebook can run on other machines; when the variable is unset, the
# original local path is used unchanged.
# 'fips' is read as text so any leading zeros present in the file are kept.
app_counties_path = os.environ.get(
    "APP_COUNTIES_CSV",
    r"C:\Users\athar\OneDrive\Desktop\fulton_ring\fahe\appalachian_counties.csv",
)
app_counties = pd.read_csv(app_counties_path, dtype={'fips': str})

In [116]:
# Helper key for matching: the final five characters of each geo_id.
df_final['geo_id_last5'] = df_final['geo_id'].str.slice(start=-5)

In [117]:
# Build the 5-character county FIPS key used to match against geo_id_last5.
# Codes shorter than 5 characters (e.g. "1007", if leading zeros were dropped
# when the file was saved) are left-padded with zeros first; longer identifiers
# are then trimmed to their last 5 characters, as before.
app_counties['county_fips_last5'] = app_counties['fips'].str.zfill(5).str[-5:]

In [118]:
# Keep only rows whose geo_id_last5 appears among the county_fips_last5 values
# of app_counties, then renumber the index from 0.
in_county_list = df_final['geo_id_last5'].isin(app_counties['county_fips_last5'])
df_final = df_final.loc[in_county_list].reset_index(drop=True)

In [119]:
# Sanity check after the county filter: print (rows, columns), then preview.
print(f"{df_final.shape}")
df_final.head(n=5)

(5544, 9)


Unnamed: 0,geo_id,state_name,county,year,adults(18-24)_with_high_school,percent_adults(18-24)_with_high_school,adults(18-24)_with_college_degree,total_median_earnings,geo_id_last5
0,0500000US01007,Alabama,Bibb,2015,681.0,33.1,25.0,29113,1007
1,0500000US01009,Alabama,Blount,2015,1538.0,33.3,138.0,34549,1009
2,0500000US01015,Alabama,Calhoun,2015,3362.0,28.3,570.0,28449,1015
3,0500000US01017,Alabama,Chambers,2015,1234.0,42.6,114.0,28983,1017
4,0500000US01019,Alabama,Cherokee,2015,739.0,38.3,107.0,30545,1019


In [120]:
# The helper matching key is no longer needed once the filter has been applied.
df_final = df_final.drop(labels='geo_id_last5', axis='columns')

In [121]:
# Final sanity check before export: print (rows, columns), then preview.
print(f"{df_final.shape}")
df_final.head(n=5)

(5544, 8)


Unnamed: 0,geo_id,state_name,county,year,adults(18-24)_with_high_school,percent_adults(18-24)_with_high_school,adults(18-24)_with_college_degree,total_median_earnings
0,0500000US01007,Alabama,Bibb,2015,681.0,33.1,25.0,29113
1,0500000US01009,Alabama,Blount,2015,1538.0,33.3,138.0,34549
2,0500000US01015,Alabama,Calhoun,2015,3362.0,28.3,570.0,28449
3,0500000US01017,Alabama,Chambers,2015,1234.0,42.6,114.0,28983
4,0500000US01019,Alabama,Cherokee,2015,739.0,38.3,107.0,30545


In [122]:
# Write the filtered table to disk without the DataFrame index.
output_path = "education.csv"
df_final.to_csv(output_path, index=False)