In [50]:
import numpy as np
import os
import pandas as pd
import glob
import h5py
import tables
from ast import literal_eval

## Merging Data

This script takes the raw data, creates a pandas DataFrame, and saves it to disk as an HDF5 file called `data.h5`. Before running this code, make sure your file directory looks like this...

```
.
│   Data Merged.ipynb
│   ...    
│
└───Data
    │   search_grid.csv
    │   impact_score_2017.csv
    │   impact_score_2018.csv
    |   impact_score_2019_public_test.csv
    |
    └───GFS_2017
    |   │   gfs_4_20170101_0000_000.csv
    |   │   ...
    |
    └───GFS_2018
    |   │   gfs_4_20180101_0000_000.csv
    |   │   ...     
    | 
    └───GFS_2019
        │   gfs_4_20190101_0000_000.csv
        │   ...     
   

```

# Loading Provided Data

In [51]:
#### Paths to all data files

# All weather: one path per GFS CSV, grouped by year.
# Only *.csv entries are kept so stray directory entries (e.g. .DS_Store,
# .ipynb_checkpoints) are never handed to pd.read_csv, and the names are
# sorted because os.listdir returns entries in arbitrary order.
weather17 = ["./Data/GFS_2017/" + f for f in sorted(os.listdir("./Data/GFS_2017")) if f.lower().endswith(".csv")]
weather18 = ["./Data/GFS_2018/" + f for f in sorted(os.listdir("./Data/GFS_2018")) if f.lower().endswith(".csv")]
weather19 = ["./Data/GFS_2019/" + f for f in sorted(os.listdir("./Data/GFS_2019")) if f.lower().endswith(".csv")]
allweather = weather17 + weather18 + weather19

# All impact: every CSV in ./Data whose name starts with 'impact'
allimpact = ["./Data/" + f for f in sorted(os.listdir("./Data"))
             if f.startswith('impact') and f.lower().endswith(".csv")]

# Loc2zip map
grid = "./Data/search_grid.csv"

In [52]:
### Load weather into 1 df
# Read each weather CSV in turn and stack them into a single frame
# with a fresh 0..n-1 index.
weather = pd.concat(
    (pd.read_csv(csv_path) for csv_path in allweather),
    ignore_index=True,
)

In [53]:
### Load and flatten loc2zip map
# literal_eval turns each "mapped_zipcodes" cell from text into a Python
# literal (presumably a list of zipcodes - verify against search_grid.csv).
# explode() then yields one row per zipcode, reset_index() keeps the
# pre-explode row number as an "index" column, and the zipcodes are made numeric.
loc2zip = (
    pd.read_csv(grid, converters={"mapped_zipcodes": literal_eval})
    .explode("mapped_zipcodes")
    .reset_index()
    .assign(mapped_zipcodes=lambda frame: pd.to_numeric(frame["mapped_zipcodes"]))
)
# The column keeps its original name 'mapped_zipcodes'; it is not renamed to "zip5".

In [54]:
### Load impact into 1 df
# Stack all impact-score CSVs, then discard the "Unnamed: 0" column
# that comes in from the files.
impact = pd.concat(
    (pd.read_csv(csv_path) for csv_path in allimpact),
    ignore_index=True,
)
impact = impact.drop(columns=["Unnamed: 0"])

In [55]:
### Join impact and loc2zip
# Left join (DataFrame.join's default): impact['zip5'] is looked up in loc2zip
# indexed by 'mapped_zipcodes'. The "index" column carried over from loc2zip
# is dropped afterwards.
impact_loc = (
    impact
    .join(loc2zip.set_index('mapped_zipcodes'), on='zip5')
    .drop(columns=["index"])
)

In [56]:
### Join impact and weather on location and date
# Convert both date columns to datetimes so they can be used as merge keys
weather["Date"] = pd.to_datetime(weather["Date"], format="%Y%m%d")
impact_loc["date_key"] = pd.to_datetime(impact_loc["date_key"])

# Left merge keeps every impact row, including those without a weather match;
# the result is then ordered by the weather date.
data = (
    impact_loc
    .merge(
        weather,
        how='left',
        left_on=["date_key", "grid_lat", "grid_lon"],
        right_on=["Date", "lat", "lng"],
    )
    .sort_values(by="Date")
)

In [57]:
# Display the joined impact/location frame (date, zip, impact score, grid cell)
impact_loc

Unnamed: 0,date_key,zip5,impact_score,grid_lat,grid_lon
0,2017-01-01,2722,20.268081,41.5,-71.0
1,2017-01-01,3063,31.794226,43.0,-71.5
2,2017-01-01,7008,22.809754,40.5,-74.0
3,2017-01-01,8518,8.621755,40.0,-75.0
4,2017-01-01,8691,21.099682,40.0,-74.5
...,...,...,...,...,...
98670,2019-12-31,97124,,45.5,-123.0
98671,2019-12-31,98032,,47.5,-122.5
98672,2019-12-31,98327,,47.0,-122.5
98673,2019-12-31,98390,,47.0,-122.0


## Adding Features from External Data Sources

In [58]:
### Add state, region, and urban/suburban information from FAR codes dataset
# Order rows by zipcode, then date, then forecast time
data = data.sort_values(["zip5", 'date_key', 'Time'])
# Snapshot of the zip5 column taken before the FAR merge
x = data['zip5']
# Read in FAR codes (sheet at position 1, i.e. the second sheet) and keep
# only the columns needed downstream
far = pd.read_excel('FARcodesZIPdata2010WithAKandHI.xlsx', sheet_name = 1)
cols = ['ZIP', 'density', 'state']
far = far[cols]
# Hand-written entry for ZIP 2722 (presumably absent from the FAR sheet - verify)
new_row = {'ZIP': 2722, 'density': 0, 'state': 'MA'}
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# appends the same row on both old and new pandas versions.
far = pd.concat([far, pd.DataFrame([new_row])], ignore_index=True)

In [59]:
# Add density and state
# Inner merge (pandas' default, spelled out here): rows whose zip5 has no
# matching FAR 'ZIP' are dropped from data.
data = pd.merge(data, far, how='inner', left_on='zip5', right_on='ZIP')
# Snapshot of the zip5 column taken after the FAR merge
y = data['zip5']

In [60]:
# State abbreviations grouped by region
New_England_Northeast = ['CT', 'ME', 'MA', 'NH', 'RI', 'VT']
Mid_Atlantic_Northeast = ['NJ', 'NY', 'PA']
East_North_Central_Midwest = ['IL', 'IN', 'MI', 'OH', 'WI']
West_North_Central_Midwest = ['IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD']
South_Atlantic_South = ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV']
East_South_Central_South = ['AL', 'KY', 'MS', 'TN']
West_South_Central_South = ['AR', 'LA', 'OK', 'TX']
Mountain_West = ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY']
Pacific_West = ['AK', 'CA', 'HI', 'OR', 'WA']

# (label, member states) pairs, in the order the regions are checked.
# 'Pacific West' is not listed because it is the fallback label below.
region_members = [
    ('New England Northeast', New_England_Northeast),
    ('Mid Atlantic Northeast', Mid_Atlantic_Northeast),
    ('East North Central Midwest', East_North_Central_Midwest),
    ('West North Central Midwest', West_North_Central_Midwest),
    ('South Atlantic South', South_Atlantic_South),
    ('East South Central South', East_South_Central_South),
    ('West South Central South', West_South_Central_South),
    ('Mountain West', Mountain_West),
]

# Flatten into a state -> region lookup; setdefault keeps the first region
# that lists a given state.
state_to_region = {}
for region_label, member_states in region_members:
    for state_abbrev in member_states:
        state_to_region.setdefault(state_abbrev, region_label)

# Add region: any state value not found in the lookup (this includes every
# Pacific_West state, and any missing/unknown value) is labelled 'Pacific West'.
data['Region'] = [state_to_region.get(state_abbrev, 'Pacific West') for state_abbrev in data['state']]
data = data.drop(columns=["ZIP"])

In [61]:
# Add suburban or urban
def _density_class(density):
    """Label a density value as 'Urban' (> 3000), 'Rural' (< 1000) or 'Suburban' (everything else)."""
    if density > 3000:
        return 'Urban'
    if density < 1000:
        return 'Rural'
    # Reached for 1000..3000 and for values where both comparisons are False (e.g. NaN)
    return 'Suburban'


data['USR'] = [_density_class(density) for density in data['density']]

In [62]:
# Add day of the week as an integer (pandas convention: Monday=0 ... Sunday=6)
data['Weekday'] = data['date_key'].dt.weekday

## Store Missing Weather Data

Weather data is missing for entire days (e.g. 8/5/2017-8/7/2017), and sometimes weather data is missing for a particular zip code on certain dates (e.g. zip code 80216 has no weather data for the month of January). However, impact scores exist for these rows. We will have to incorporate this data if we decide to use an autoregressive model. For now, we will store it in a separate file.

In [63]:
# Rows without a weather match: "Date" is the weather-side merge key, so it is
# missing wherever the left merge found no weather row for that date/grid cell.
missingweather = data.loc[pd.isna(data["Date"])].sort_values("date_key")
# Persist only the missing-weather subset, not the full `data` frame
missingweather.to_hdf("missingweather.h5", key='df', mode='a')

## Delete redundant columns

In [65]:
# Weather-side merge keys; the impact side has its own date_key / grid_lat / grid_lon columns
redundant_col = ["Date", 'lat', 'lng']
# Keep only rows that did get a weather match, then drop the weather-side keys
complete = data.loc[data["Date"].notna()].drop(columns=redundant_col)

In [66]:
# Final dataframe: rows of `data` that have weather, with the redundant
# weather-side key columns removed
complete

Unnamed: 0,date_key,zip5,impact_score,grid_lat,grid_lon,Time,ForecastRange,x,y,5_Wave_Geopotential_Height_isobaric,...,Snow_mixing_ratio_hybrid,Snow_mixing_ratio_isobaric,Total_cloud_cover_isobaric,Vertical_velocity_geometric_isobaric,Ice_growth_rate_altitude_above_msl,density,state,Region,USR,Weekday
0,2017-01-01,2722,20.268081,41.5,-71.0,0.0,0.0,578.0,97.0,5493.184570,...,,,,,,0.000000,MA,New England Northeast,Rural,6
1,2017-01-01,2722,20.268081,41.5,-71.0,6.0,0.0,578.0,97.0,5514.240723,...,,,,,,0.000000,MA,New England Northeast,Rural,6
2,2017-01-01,2722,20.268081,41.5,-71.0,12.0,0.0,578.0,97.0,5544.382324,...,,,,,,0.000000,MA,New England Northeast,Rural,6
3,2017-01-01,2722,20.268081,41.5,-71.0,18.0,0.0,578.0,97.0,5575.850586,...,,,,,,0.000000,MA,New England Northeast,Rural,6
4,2017-01-02,2722,16.868994,41.5,-71.0,0.0,0.0,578.0,97.0,5614.513184,...,,,,,,0.000000,MA,New England Northeast,Rural,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390054,2019-12-30,98421,,47.5,-122.5,18.0,0.0,950.0,170.0,5661.496094,...,0.0,2.890000e-09,0.0,0.030320,0.0,208.237785,WA,Pacific West,Rural,0
390055,2019-12-31,98421,,47.5,-122.5,0.0,0.0,950.0,170.0,5667.814453,...,0.0,3.051257e-09,0.0,0.004692,0.0,208.237785,WA,Pacific West,Rural,1
390056,2019-12-31,98421,,47.5,-122.5,6.0,0.0,950.0,170.0,5667.208984,...,0.0,3.020000e-09,0.0,0.022435,0.0,208.237785,WA,Pacific West,Rural,1
390057,2019-12-31,98421,,47.5,-122.5,12.0,0.0,950.0,170.0,5655.993652,...,0.0,3.270000e-09,0.0,0.021333,0.0,208.237785,WA,Pacific West,Rural,1


In [67]:
### Save data as hdf5 file
# Written under key 'df'; mode "a" opens data.h5 in append mode,
# creating the file if it does not exist yet.
complete.to_hdf(path_or_buf="data.h5", key="df", mode="a")