### Fairness of prediction between rich and poor counties in the US using Prophet - Part 1 (Prepare the data)

In [1]:
import tables
import pathlib
import warnings
from os import walk
import pandas as pd
import import_ipynb
from methods import *
import geopandas as gpd
from shapely.geometry import Point

# Notebook-wide settings.
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the geo libraries below.
warnings.simplefilter('ignore')
# Never truncate wide frames: show all columns when a DataFrame is displayed.
pd.options.display.max_columns = None

importing Jupyter notebook from methods.ipynb


  shapely_geos_version, geos_capi_version_string


#### Read and load vehicle + accident datasets from FARS (selected years between 1995 and 2020)

In [2]:
# Root folder of the FARS extracts; it is passed to the reader helpers below, whose
# printed output lists one FARS<year>NationalCSV sub-folder per loaded year.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
FARSpath = "/data/fiona123/ProjectData_TrafficFatality/"

In [3]:
# Shared container for the FARS tables; it is handed to the reader helper and then
# receives the helper's return value under the 'vehicle' key.
d = {}
# NOTE(review): read_vehicledata comes from methods.ipynb (not shown here). Judging by
# the printed output it visits the FARS<year>NationalCSV folders under FARSpath and also
# stores per-year entries in `d` — confirm in the helper.
d['vehicle'] = read_vehicledata(FARSpath, d)

/data/fiona123/ProjectData_TrafficFatality/FARS2015NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2013NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2016NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2012NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2020NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2005NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2010NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS1995NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2018NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2014NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2019NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2017NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2000NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2011NationalCSV
dict_keys(['2015_vehicle', '2013_VEHICLE', '2016_Vehicle', '2012_VEHICLE', '2020_vehiclesf', '2020_vehicle', '2020_pve

In [4]:
# Store the accident table next to the vehicle table in the shared dict `d`
# (this cell needs `d` and FARSpath from the cells above).
# NOTE(review): read_accidentdata is defined in methods.ipynb (not shown here);
# presumably it mirrors read_vehicledata for the accident CSVs — confirm there.
d['accident'] = read_accidentdata(FARSpath, d)

/data/fiona123/ProjectData_TrafficFatality/FARS2015NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2013NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2016NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2012NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2020NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2005NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2010NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS1995NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2018NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2014NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2019NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2017NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2000NationalCSV
/data/fiona123/ProjectData_TrafficFatality/FARS2011NationalCSV


#### Preprocessing of datasets

In [5]:
# Attach the crash location and year (from the accident table) to every vehicle record.
#
# ST_CASE alone does not identify a crash across data years: joining on it pairs each
# vehicle with the same-numbered crash of *every* loaded year (the preview shows one
# vehicle repeated with different LATITUDE/LONGITUD/YEAR values). When the vehicle
# table carries a YEAR column the join therefore uses (ST_CASE, YEAR); otherwise it
# falls back to the original ST_CASE-only join and says so.
accident_cols = d['accident'][['ST_CASE', 'LATITUDE', 'LONGITUD', 'YEAR']]
merge_keys = ['ST_CASE', 'YEAR'] if 'YEAR' in d['vehicle'].columns else ['ST_CASE']

if merge_keys == ['ST_CASE']:
    # print() rather than warnings.warn(): warnings are globally silenced in this notebook.
    print("WARNING: vehicle table has no YEAR column; merging on ST_CASE alone "
          "matches each vehicle to crashes from every loaded year. "
          "Tag vehicle rows with their data year in read_vehicledata to fix this.")

merged_df = d['vehicle'].merge(accident_cols, on=merge_keys, how='left')
display(merged_df.head(5))

Unnamed: 0,STATE,STATENAME,ST_CASE,VEH_NO,VE_FORMS,NUMOCCS,NUMOCCSNAME,DAY,DAYNAME,MONTH,MONTHNAME,HOUR,HOURNAME,MINUTE,MINUTENAME,HARM_EV,HARM_EVNAME,MAN_COLL,MAN_COLLNAME,UNITTYPE,UNITTYPENAME,HIT_RUN,HIT_RUNNAME,REG_STAT,REG_STATNAME,OWNER,OWNERNAME,MAKE,MAKENAME,MODEL,MAK_MOD,MAK_MODNAME,BODY_TYP,BODY_TYPNAME,MOD_YEAR,MOD_YEARNAME,VIN,VINNAME,VIN_1,VIN_2,VIN_3,VIN_4,VIN_5,VIN_6,VIN_7,VIN_8,VIN_9,VIN_10,VIN_11,VIN_12,TOW_VEH,TOW_VEHNAME,J_KNIFE,J_KNIFENAME,MCARR_I1,MCARR_I1NAME,MCARR_I2,MCARR_I2NAME,MCARR_ID,MCARR_IDNAME,GVWR,GVWRNAME,V_CONFIG,V_CONFIGNAME,CARGO_BT,CARGO_BTNAME,HAZ_INV,HAZ_INVNAME,HAZ_PLAC,HAZ_PLACNAME,HAZ_ID,HAZ_IDNAME,HAZ_CNO,HAZ_CNONAME,HAZ_REL,HAZ_RELNAME,BUS_USE,BUS_USENAME,SPEC_USE,SPEC_USENAME,EMER_USE,EMER_USENAME,TRAV_SP,TRAV_SPNAME,UNDERIDE,UNDERIDENAME,ROLLOVER,ROLLOVERNAME,ROLINLOC,ROLINLOCNAME,IMPACT1,IMPACT1NAME,DEFORMED,DEFORMEDNAME,TOWED,TOWEDNAME,M_HARM,M_HARMNAME,VEH_SC1,VEH_SC1NAME,VEH_SC2,VEH_SC2NAME,FIRE_EXP,FIRE_EXPNAME,DR_PRES,DR_PRESNAME,L_STATE,L_STATENAME,DR_ZIP,DR_ZIPNAME,L_STATUS,L_STATUSNAME,L_TYPE,L_TYPENAME,CDL_STAT,CDL_STATNAME,L_ENDORS,L_ENDORSNAME,L_COMPL,L_COMPLNAME,L_RESTRI,L_RESTRINAME,DR_HGT,DR_HGTNAME,DR_WGT,DR_WGTNAME,PREV_ACC,PREV_ACCNAME,PREV_SUS,PREV_SUSNAME,PREV_DWI,PREV_DWINAME,PREV_SPD,PREV_SPDNAME,PREV_OTH,PREV_OTHNAME,FIRST_MO,FIRST_MONAME,FIRST_YR,FIRST_YRNAME,LAST_MO,LAST_MONAME,LAST_YR,LAST_YRNAME,SPEEDREL,SPEEDRELNAME,DR_SF1,DR_SF1NAME,DR_SF2,DR_SF2NAME,DR_SF3,DR_SF3NAME,DR_SF4,DR_SF4NAME,VTRAFWAY,VTRAFWAYNAME,VNUM_LAN,VNUM_LANNAME,VSPD_LIM,VSPD_LIMNAME,VALIGN,VALIGNNAME,VPROFILE,VPROFILENAME,VPAVETYP,VPAVETYPNAME,VSURCOND,VSURCONDNAME,VTRAFCON,VTRAFCONNAME,VTCONT_F,VTCONT_FNAME,P_CRASH1,P_CRASH1NAME,P_CRASH2,P_CRASH2NAME,P_CRASH3,P_CRASH3NAME,PCRASH4,PCRASH4NAME,PCRASH5,PCRASH5NAME,ACC_TYPE,ACC_TYPENAME,DEATHS,DR_DRINK,DR_DRINKNAME,TRLR1VIN,TRLR1VINNAME,TRLR2VIN,TRLR2VINNAME,TRLR3VIN,TRLR3VINNAME,VINTYPE,VINMAKE,VINA_MOD,VIN_BT,VINMODYR,VIN_LNGT,VIN_WGT,WGTCD_TR,WHLBS_LG,WHLBS_SH,SER_TR,FUELCOD
E,MCYCL_DS,CARBUR,CYLINDER,DISPLACE,MCYCL_CY,MCYCL_WT,TIRE_SZE,TON_RAT,TRK_WT,TRKWTVAR,VIN_REST,WHLDRWHL,PREV_SUS1,PREV_SUS1NAME,PREV_SUS2,PREV_SUS2NAME,PREV_SUS3,PREV_SUS3NAME,VPICMAKE,VPICMAKENAME,VPICMODEL,VPICMODELNAME,VPICBODYCLASS,VPICBODYCLASSNAME,ICFINALBODY,ICFINALBODYNAME,GVWR_FROM,GVWR_FROMNAME,GVWR_TO,GVWR_TONAME,TRLR1GVWR,TRLR1GVWRNAME,TRLR2GVWR,TRLR2GVWRNAME,TRLR3GVWR,TRLR3GVWRNAME,OCUPANTS,HAZ_CARG,AXLES,IMPACT2,IMPACTS,TOWAWAY,VEH_CF1,VEH_CF2,VEH_MAN,AVOID,SEQ1,SEQ2,SEQ3,SEQ4,SEQ5,SEQ6,FLDCD_TR,VIOLCHG1,VIOLCHG2,VIOLCHG3,DR_CF1,DR_CF2,DR_CF3,DR_CF4,VIOL_CHG,LATITUDE,LONGITUD,YEAR
0,1,Alabama,10001,1,1,1.0,1,1.0,1.0,1,January,2.0,2:00am-2:59am,40.0,40,35,Embankment,0,Not a Collision with Motor Vehicle In-Transport,1.0,Motor Vehicle In-Transport (Inside or Outside ...,0,No,1,Alabama,1,Driver (in this crash) was Registered Owner,12,Ford,481,12481,Ford F-Series pickup,31,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",2003,2003,1FTRX18L83NB,1FTRX18L83NB,1,F,T,R,X,1,8,L,8,3,N,B,0,No Trailing Units,0,Not an Articulated Vehicle,0.0,Not Applicable,0,Not Applicable,0,Not Applicable,0.0,Not Applicable,0,Not Applicable,0,Not Applicable (N/A),1.0,No,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not a Bus,0,No Special Use,0,Not Applicable,55,055 MPH,0,No Underride or Override Noted,0,No Rollover,0.0,No Rollover,12,12 Clock Point,6,Disabling Damage,2.0,Towed Due to Disabling Damage,42,Tree (Standing Only),0.0,,0.0,,0,No or Not Reported,1,Yes,1.0,Alabama,35578.0,35578,6.0,Valid,1.0,Full Driver License,0.0,No (CDL),0.0,No Endorsements required for this vehicle,3.0,Valid license for this class vehicle,0.0,No Restrictions or Not Applicable,73.0,73,205.0,205 lbs.,1.0,1,0.0,,0.0,,1.0,1,0.0,,6.0,June,2011.0,2011,6.0,June,2012.0,2012,0.0,No,0.0,,0.0,,0.0,,0.0,,1.0,"Two-Way, Not Divided",2.0,Two lanes,55.0,55 MPH,3.0,Curve Left,1.0,Level,2.0,"Blacktop, Bituminous, or Asphalt",1.0,Dry,0.0,No Controls,0.0,No Controls,14.0,Negotiating a Curve,13.0,Off the edge of the road on the right side,99.0,Unknown,1.0,Tracking,4.0,Departed roadway,1.0,A1-Single Driver-Right Roadside Departure-Driv...,1,1,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33.878653,-87.325328,2015
1,1,Alabama,10001,1,1,1.0,1,1.0,1.0,1,January,2.0,2:00am-2:59am,40.0,40,35,Embankment,0,Not a Collision with Motor Vehicle In-Transport,1.0,Motor Vehicle In-Transport (Inside or Outside ...,0,No,1,Alabama,1,Driver (in this crash) was Registered Owner,12,Ford,481,12481,Ford F-Series pickup,31,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",2003,2003,1FTRX18L83NB,1FTRX18L83NB,1,F,T,R,X,1,8,L,8,3,N,B,0,No Trailing Units,0,Not an Articulated Vehicle,0.0,Not Applicable,0,Not Applicable,0,Not Applicable,0.0,Not Applicable,0,Not Applicable,0,Not Applicable (N/A),1.0,No,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not a Bus,0,No Special Use,0,Not Applicable,55,055 MPH,0,No Underride or Override Noted,0,No Rollover,0.0,No Rollover,12,12 Clock Point,6,Disabling Damage,2.0,Towed Due to Disabling Damage,42,Tree (Standing Only),0.0,,0.0,,0,No or Not Reported,1,Yes,1.0,Alabama,35578.0,35578,6.0,Valid,1.0,Full Driver License,0.0,No (CDL),0.0,No Endorsements required for this vehicle,3.0,Valid license for this class vehicle,0.0,No Restrictions or Not Applicable,73.0,73,205.0,205 lbs.,1.0,1,0.0,,0.0,,1.0,1,0.0,,6.0,June,2011.0,2011,6.0,June,2012.0,2012,0.0,No,0.0,,0.0,,0.0,,0.0,,1.0,"Two-Way, Not Divided",2.0,Two lanes,55.0,55 MPH,3.0,Curve Left,1.0,Level,2.0,"Blacktop, Bituminous, or Asphalt",1.0,Dry,0.0,No Controls,0.0,No Controls,14.0,Negotiating a Curve,13.0,Off the edge of the road on the right side,99.0,Unknown,1.0,Tracking,4.0,Departed roadway,1.0,A1-Single Driver-Right Roadside Departure-Driv...,1,1,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33.791964,-86.383703,2013
2,1,Alabama,10001,1,1,1.0,1,1.0,1.0,1,January,2.0,2:00am-2:59am,40.0,40,35,Embankment,0,Not a Collision with Motor Vehicle In-Transport,1.0,Motor Vehicle In-Transport (Inside or Outside ...,0,No,1,Alabama,1,Driver (in this crash) was Registered Owner,12,Ford,481,12481,Ford F-Series pickup,31,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",2003,2003,1FTRX18L83NB,1FTRX18L83NB,1,F,T,R,X,1,8,L,8,3,N,B,0,No Trailing Units,0,Not an Articulated Vehicle,0.0,Not Applicable,0,Not Applicable,0,Not Applicable,0.0,Not Applicable,0,Not Applicable,0,Not Applicable (N/A),1.0,No,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not a Bus,0,No Special Use,0,Not Applicable,55,055 MPH,0,No Underride or Override Noted,0,No Rollover,0.0,No Rollover,12,12 Clock Point,6,Disabling Damage,2.0,Towed Due to Disabling Damage,42,Tree (Standing Only),0.0,,0.0,,0,No or Not Reported,1,Yes,1.0,Alabama,35578.0,35578,6.0,Valid,1.0,Full Driver License,0.0,No (CDL),0.0,No Endorsements required for this vehicle,3.0,Valid license for this class vehicle,0.0,No Restrictions or Not Applicable,73.0,73,205.0,205 lbs.,1.0,1,0.0,,0.0,,1.0,1,0.0,,6.0,June,2011.0,2011,6.0,June,2012.0,2012,0.0,No,0.0,,0.0,,0.0,,0.0,,1.0,"Two-Way, Not Divided",2.0,Two lanes,55.0,55 MPH,3.0,Curve Left,1.0,Level,2.0,"Blacktop, Bituminous, or Asphalt",1.0,Dry,0.0,No Controls,0.0,No Controls,14.0,Negotiating a Curve,13.0,Off the edge of the road on the right side,99.0,Unknown,1.0,Tracking,4.0,Departed roadway,1.0,A1-Single Driver-Right Roadside Departure-Driv...,1,1,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33.426458,-86.819731,2016
3,1,Alabama,10001,1,1,1.0,1,1.0,1.0,1,January,2.0,2:00am-2:59am,40.0,40,35,Embankment,0,Not a Collision with Motor Vehicle In-Transport,1.0,Motor Vehicle In-Transport (Inside or Outside ...,0,No,1,Alabama,1,Driver (in this crash) was Registered Owner,12,Ford,481,12481,Ford F-Series pickup,31,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",2003,2003,1FTRX18L83NB,1FTRX18L83NB,1,F,T,R,X,1,8,L,8,3,N,B,0,No Trailing Units,0,Not an Articulated Vehicle,0.0,Not Applicable,0,Not Applicable,0,Not Applicable,0.0,Not Applicable,0,Not Applicable,0,Not Applicable (N/A),1.0,No,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not a Bus,0,No Special Use,0,Not Applicable,55,055 MPH,0,No Underride or Override Noted,0,No Rollover,0.0,No Rollover,12,12 Clock Point,6,Disabling Damage,2.0,Towed Due to Disabling Damage,42,Tree (Standing Only),0.0,,0.0,,0,No or Not Reported,1,Yes,1.0,Alabama,35578.0,35578,6.0,Valid,1.0,Full Driver License,0.0,No (CDL),0.0,No Endorsements required for this vehicle,3.0,Valid license for this class vehicle,0.0,No Restrictions or Not Applicable,73.0,73,205.0,205 lbs.,1.0,1,0.0,,0.0,,1.0,1,0.0,,6.0,June,2011.0,2011,6.0,June,2012.0,2012,0.0,No,0.0,,0.0,,0.0,,0.0,,1.0,"Two-Way, Not Divided",2.0,Two lanes,55.0,55 MPH,3.0,Curve Left,1.0,Level,2.0,"Blacktop, Bituminous, or Asphalt",1.0,Dry,0.0,No Controls,0.0,No Controls,14.0,Negotiating a Curve,13.0,Off the edge of the road on the right side,99.0,Unknown,1.0,Tracking,4.0,Departed roadway,1.0,A1-Single Driver-Right Roadside Departure-Driv...,1,1,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.701317,-85.525181,2012
4,1,Alabama,10001,1,1,1.0,1,1.0,1.0,1,January,2.0,2:00am-2:59am,40.0,40,35,Embankment,0,Not a Collision with Motor Vehicle In-Transport,1.0,Motor Vehicle In-Transport (Inside or Outside ...,0,No,1,Alabama,1,Driver (in this crash) was Registered Owner,12,Ford,481,12481,Ford F-Series pickup,31,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",2003,2003,1FTRX18L83NB,1FTRX18L83NB,1,F,T,R,X,1,8,L,8,3,N,B,0,No Trailing Units,0,Not an Articulated Vehicle,0.0,Not Applicable,0,Not Applicable,0,Not Applicable,0.0,Not Applicable,0,Not Applicable,0,Not Applicable (N/A),1.0,No,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not Applicable,0.0,Not a Bus,0,No Special Use,0,Not Applicable,55,055 MPH,0,No Underride or Override Noted,0,No Rollover,0.0,No Rollover,12,12 Clock Point,6,Disabling Damage,2.0,Towed Due to Disabling Damage,42,Tree (Standing Only),0.0,,0.0,,0,No or Not Reported,1,Yes,1.0,Alabama,35578.0,35578,6.0,Valid,1.0,Full Driver License,0.0,No (CDL),0.0,No Endorsements required for this vehicle,3.0,Valid license for this class vehicle,0.0,No Restrictions or Not Applicable,73.0,73,205.0,205 lbs.,1.0,1,0.0,,0.0,,1.0,1,0.0,,6.0,June,2011.0,2011,6.0,June,2012.0,2012,0.0,No,0.0,,0.0,,0.0,,0.0,,1.0,"Two-Way, Not Divided",2.0,Two lanes,55.0,55 MPH,3.0,Curve Left,1.0,Level,2.0,"Blacktop, Bituminous, or Asphalt",1.0,Dry,0.0,No Controls,0.0,No Controls,14.0,Negotiating a Curve,13.0,Off the edge of the road on the right side,99.0,Unknown,1.0,Tracking,4.0,Departed roadway,1.0,A1-Single Driver-Right Roadside Departure-Driv...,1,1,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.433133,-86.09485,2020


In [6]:
# Narrow the merged frame to the fields used from here on: state / case identifiers,
# crash day-month-year, vehicle make / model descriptors and the crash coordinates.
cols_to_keep = [
    "STATE", "STATENAME", "ST_CASE", "DAY", "MONTH",
    "MAKENAME", "MAK_MODNAME", "MOD_YEAR",
    "LATITUDE", "LONGITUD", "YEAR", "VPICBODYCLASSNAME",
]
merged_df = merged_df.loc[:, cols_to_keep]

# Row count after the merge, followed by a five-row preview.
print(merged_df.shape[0])
display(merged_df.head())

9160995


Unnamed: 0,STATE,STATENAME,ST_CASE,DAY,MONTH,MAKENAME,MAK_MODNAME,MOD_YEAR,LATITUDE,LONGITUD,YEAR,VPICBODYCLASSNAME
0,1,Alabama,10001,1.0,1,Ford,Ford F-Series pickup,2003,33.878653,-87.325328,2015,
1,1,Alabama,10001,1.0,1,Ford,Ford F-Series pickup,2003,33.791964,-86.383703,2013,
2,1,Alabama,10001,1.0,1,Ford,Ford F-Series pickup,2003,33.426458,-86.819731,2016,
3,1,Alabama,10001,1.0,1,Ford,Ford F-Series pickup,2003,32.701317,-85.525181,2012,
4,1,Alabama,10001,1.0,1,Ford,Ford F-Series pickup,2003,32.433133,-86.09485,2020,


In [7]:
# Read the county-level SVI 2018 polygons (an ESRI shapefile, not GeoJSON).
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
us_geo_shape = gpd.read_file("/data/fiona123/ProjectData/SVI2018_US_COUNTY/SVI2018_US_county.shp")

# Select columns by name instead of by position: positional slices silently pick the
# wrong fields if the shapefile layout ever changes, whereas a missing name raises a
# KeyError right here. These are the ten columns the positional selection produced.
cols_to_keep = [
    "ST", "STATE", "ST_ABBR", "COUNTY", "FIPS", "LOCATION",  # county identification
    "RPL_THEMES",                                             # SVI ranking column
    "Shape_STAr", "Shape_STLe", "geometry",                   # shape metrics + polygon
]
us_geo_shape = us_geo_shape[cols_to_keep]
display(us_geo_shape.head(5))

Unnamed: 0,ST,STATE,ST_ABBR,COUNTY,FIPS,LOCATION,RPL_THEMES,Shape_STAr,Shape_STLe,geometry
0,35,NEW MEXICO,NM,Rio Arriba,35039,"Rio Arriba County, New Mexico",-999.0,1.536344,6.45281,"POLYGON ((-107.62554 36.56587, -107.62523 36.5..."
1,1,ALABAMA,AL,Autauga,1001,"Autauga County, Alabama",0.4354,0.150256,2.05274,"POLYGON ((-86.92120 32.65754, -86.92035 32.658..."
2,1,ALABAMA,AL,Blount,1009,"Blount County, Alabama",0.4242,0.164403,2.392326,"POLYGON ((-86.96336 33.85822, -86.95967 33.857..."
3,1,ALABAMA,AL,Butler,1013,"Butler County, Alabama",0.8653,0.191747,1.818327,"POLYGON ((-86.90894 31.96167, -86.87498 31.961..."
4,1,ALABAMA,AL,Calhoun,1015,"Calhoun County, Alabama",0.8252,0.154336,2.194795,"POLYGON ((-86.14622 33.70218, -86.14577 33.704..."


In [8]:
# Turn every crash record into a point geometry (x = longitude, y = latitude) so it
# can be spatially joined with the county polygons.
# points_from_xy builds the whole geometry array in one call instead of creating one
# Point object per row in a Python loop over millions of rows.
geometry = gpd.points_from_xy(merged_df.LONGITUD, merged_df.LATITUDE)

# Authority string instead of the deprecated {'init': 'epsg:4326'} dict form.
crs = "EPSG:4326"
gdf = gpd.GeoDataFrame(merged_df, crs=crs, geometry=geometry)

In [9]:
# Reproject the crash points to EPSG:4269 before joining them with the county polygons.
gdf = gdf.to_crs(epsg=4269)

# Left spatial join: every crash point is kept and gains the attributes of the county
# polygon it lies within (missing values where no polygon matches).
# NOTE(review): newer geopandas releases rename `op=` to `predicate=` — confirm the
# installed version before switching.
merged_file = gpd.sjoin(left_df=gdf, right_df=us_geo_shape, how='left', op='within')

# Continue on a plain pandas copy of the join result.
df = pd.DataFrame(merged_file)
tract_df = df.copy()

In [10]:
# Drop unusable rows: replace every missing value with 0, then keep only crashes from
# 2000 onwards that were matched to a county (LOCATION is 0 when the join found none).
tract_df = tract_df.fillna(0)
keep_rows = (tract_df['YEAR'] >= 2000) & (tract_df['LOCATION'] != 0)
tract_df = tract_df.loc[keep_rows].copy()

tract_df['DAY'] = tract_df['DAY'].astype(int)

# Build the crash date from YEAR / MONTH / DAY; pd.to_datetime expects the parts to be
# named year / month / day, and combinations that do not form a valid date become NaT.
date_parts = tract_df[['YEAR', 'MONTH', 'DAY']].rename(
    columns={'YEAR': 'year', 'MONTH': 'month', 'DAY': 'day'}
)
tract_df['datetime'] = pd.to_datetime(date_parts, errors='coerce')

print(tract_df.shape[0])
tract_df.head()

7039951


Unnamed: 0,STATE_left,STATENAME,ST_CASE,DAY,MONTH,MAKENAME,MAK_MODNAME,MOD_YEAR,LATITUDE,LONGITUD,YEAR,VPICBODYCLASSNAME,geometry,index_right,ST,STATE_right,ST_ABBR,COUNTY,FIPS,LOCATION,RPL_THEMES,Shape_STAr,Shape_STLe,datetime
0,1,Alabama,10001,1,1,Ford,Ford F-Series pickup,2003,33.878653,-87.325328,2015,0,POINT (-87.32533 33.87865),2135.0,1,ALABAMA,AL,Walker,1127,"Walker County, Alabama",0.7452,0.203015,2.381151,2015-01-01
1,1,Alabama,10001,1,1,Ford,Ford F-Series pickup,2003,33.791964,-86.383703,2013,0,POINT (-86.38370 33.79196),25.0,1,ALABAMA,AL,St. Clair,1115,"St. Clair County, Alabama",0.3656,0.164596,2.412168,2013-01-01
2,1,Alabama,10001,1,1,Ford,Ford F-Series pickup,2003,33.426458,-86.819731,2016,0,POINT (-86.81973 33.42646),1323.0,1,ALABAMA,AL,Jefferson,1073,"Jefferson County, Alabama",0.6621,0.282658,2.897314,2016-01-01
3,1,Alabama,10001,1,1,Ford,Ford F-Series pickup,2003,32.701317,-85.525181,2012,0,POINT (-85.52518 32.70132),1325.0,1,ALABAMA,AL,Lee,1081,"Lee County, Alabama",0.6602,0.153223,2.022137,2012-01-01
4,1,Alabama,10001,1,1,Ford,Ford F-Series pickup,2003,32.433133,-86.09485,2020,0,POINT (-86.09485 32.43313),11.0,1,ALABAMA,AL,Elmore,1051,"Elmore County, Alabama",0.5401,0.163463,2.254471,2020-01-01


In [None]:
# Remove join bookkeeping, shape metrics, the point geometry and the date / county
# fields that are no longer needed as separate columns.
unused_cols = [
    'STATE_left', 'ST_CASE', 'DAY', 'MONTH',
    'index_right', 'ST', 'STATE_right', 'ST_ABBR', 'COUNTY',
    'Shape_STAr', 'Shape_STLe', 'geometry',
]
tract_df = tract_df.drop(columns=unused_cols)
tract_df.head()

In [None]:
# Save the final dataframe, which is the input for the forecasting model used to check fairness.
# Written as HDF5 under key 'stage' in the current working directory; mode='w'
# replaces any existing file of that name.

tract_df.to_hdf(r'tracts_fairness.h5', key='stage', mode='w')