In [1]:
import numpy as np
import os
import pandas as pd
import glob
import h5py
import tables
from ast import literal_eval

## Merging Data

This notebook takes the raw data, creates a pandas DataFrame, and saves it to disk as an HDF5 file called `data.h5`. Before running this code, make sure your file directory looks like this:

```
.
│   Data Merged.ipynb
│   FARcodesZIPdata2010WithAKandHI.xlsx
│   ...    
│
└───Data
    │   search_grid.csv
    │   impact_score_2017.csv
    │   impact_score_2018.csv
    |   impact_score_2019_public_test.csv
    |
    └───GFS_2017
    |   │   gfs_4_20170101_0000_000.csv
    |   │   ...
    |
    └───GFS_2018
    |   │   gfs_4_20180101_0000_000.csv
    |   │   ...     
    | 
    └───GFS_2019
        │   gfs_4_20190101_0000_000.csv
        │   ...     
   

```

# Loading Provided Data

In [2]:
#### Path to all data files

def _csv_paths(folder, prefix=""):
    """Return the sorted paths of the CSV files directly inside `folder`.

    `folder` must end with a path separator because file names are appended
    to it verbatim. Only names that start with `prefix` and end in `.csv`
    (case-insensitive) are kept.

    os.listdir returns entries in arbitrary order and lists every entry
    (hidden files, checkpoint directories, ...), so the names are filtered
    and sorted to keep the list deterministic and safe to hand to pd.read_csv.
    """
    return [
        folder + name
        for name in sorted(os.listdir(folder))
        if name.startswith(prefix) and name.lower().endswith(".csv")
    ]

# All weather
weather17 = _csv_paths("./Data/GFS_2017/")
weather18 = _csv_paths("./Data/GFS_2018/")
weather19 = _csv_paths("./Data/GFS_2019/")
allweather = weather17 + weather18 + weather19

# All impact
allimpact = _csv_paths("./Data/", prefix="impact")

# Loc2zip map
grid = "./Data/search_grid.csv"

In [3]:
### Load weather into 1 df
# Read the weather CSVs one by one and stack them into a single giant
# dataframe; ignore_index=True gives the result a fresh 0..n-1 row index.
weather = pd.concat(
    (pd.read_csv(csv_path) for csv_path in allweather),
    ignore_index=True,
)

In [4]:
### Load and flatten loc2zip map
# `mapped_zipcodes` is parsed with literal_eval while reading (presumably each
# cell holds a list literal, since it is exploded right after), then exploded to
# one row per zip code and converted to numbers.
# reset_index() keeps the pre-explode row labels in an "index" column; the
# impact/loc2zip join cell drops that column again.
# The column keeps its `mapped_zipcodes` name: the join cell indexes on it.
loc2zip = (
    pd.read_csv(grid, converters={"mapped_zipcodes": literal_eval})
    .explode("mapped_zipcodes")
    .reset_index()
    .assign(mapped_zipcodes=lambda frame: pd.to_numeric(frame["mapped_zipcodes"]))
)

In [5]:
### Load impact into 1 df
# Stack every impact CSV into one frame with a fresh row index, then discard
# the "Unnamed: 0" column (presumably a row index saved into the CSVs).
impact = pd.concat(
    (pd.read_csv(csv_path) for csv_path in allimpact),
    ignore_index=True,
).drop(columns="Unnamed: 0")

In [6]:
### Join impact and loc2zip
# Look up each impact row's zip5 in the zip-indexed grid map (DataFrame.join
# defaults to a left join, so every impact row is kept), then drop the
# leftover "index" column that loc2zip carries from its reset_index().
impact_loc = (
    impact
    .join(loc2zip.set_index("mapped_zipcodes"), on="zip5")
    .drop(columns="index")
)

In [7]:
### Join impact and weather in loc and date
# Preprocessing on join columns: convert both date columns to datetime so the
# merge compares like with like (weather dates are parsed as YYYYMMDD).
impact_loc["date_key"] = pd.to_datetime(impact_loc["date_key"])
weather["Date"] = pd.to_datetime(weather["Date"], format="%Y%m%d")

# Inner merge (the pandas default): only rows with a matching date and grid
# point on both sides survive. The result is ordered by weather date.
data = pd.merge(
    impact_loc,
    weather,
    how="inner",
    left_on=["date_key", "grid_lat", "grid_lon"],
    right_on=["Date", "lat", "lng"],
).sort_values(by="Date")

## Adding Features from External Data Sources

In [8]:
### Add state, region, and urban/suburban information from FAR codes dataset
data = data.sort_values(["zip5", 'date_key', 'Time'])
# Snapshot of the zip codes before the FAR merge; not used again in the visible notebook.
x = data['zip5']
# Read in Far codes (second sheet of the workbook: sheet_name is a 0-based position)
far = pd.read_excel('FARcodesZIPdata2010WithAKandHI.xlsx', sheet_name = 1)
cols = ['ZIP', 'density', 'state']
far = far[cols]
# Manually add ZIP 2722 (MA) with density 0 -- presumably missing from the
# FAR sheet; TODO confirm. Without a FAR row the inner merge on zip5 would
# drop that zip code's records.
new_row = {'ZIP': 2722, 'density': 0, 'state': 'MA'}
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with a one-row frame is the supported equivalent.
far = pd.concat([far, pd.DataFrame([new_row])], ignore_index=True)

In [9]:
# Add density and state
# Inner merge (the pandas default): rows whose zip5 has no FAR entry are dropped.
data = pd.merge(data, far, how="inner", left_on="zip5", right_on="ZIP")
# Snapshot of the zip codes after the FAR merge; not used again in the visible notebook.
y = data["zip5"]

In [10]:
# State to region dictionary
# One list of state abbreviations per region (the lists are disjoint).
New_England_Northeast = ['CT', 'ME', 'MA', 'NH', 'RI', 'VT']
Mid_Atlantic_Northeast = ['NJ', 'NY', 'PA']
East_North_Central_Midwest = ['IL', 'IN', 'MI', 'OH', 'WI']
West_North_Central_Midwest = ['IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD']
South_Atlantic_South = ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV']
East_South_Central_South = ['AL', 'KY', 'MS', 'TN']
West_South_Central_South = ['AR', 'LA', 'OK', 'TX']
Mountain_West = ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY']
Pacific_West = ['AK', 'CA', 'HI', 'OR', 'WA']

# Flatten the lists into a single state -> region label lookup.
region_members = {
    'New England Northeast': New_England_Northeast,
    'Mid Atlantic Northeast': Mid_Atlantic_Northeast,
    'East North Central Midwest': East_North_Central_Midwest,
    'West North Central Midwest': West_North_Central_Midwest,
    'South Atlantic South': South_Atlantic_South,
    'East South Central South': East_South_Central_South,
    'West South Central South': West_South_Central_South,
    'Mountain West': Mountain_West,
    'Pacific West': Pacific_West,
}
state_to_region = {
    state: region
    for region, states in region_members.items()
    for state in states
}

# Add region
# NOTE(review): any state not listed above (including missing values) falls
# back to 'Pacific West' -- confirm that this default is intended.
data['Region'] = data['state'].map(
    lambda state: state_to_region.get(state, 'Pacific West')
)
data = data.drop(columns="ZIP")

In [11]:
# Add suburban or urban
# density above 3000 -> 'Urban', below 1000 -> 'Rural', everything else
# (1000..3000 inclusive, and missing densities) -> 'Suburban'.
data['USR'] = np.select(
    [data['density'] > 3000, data['density'] < 1000],
    ['Urban', 'Rural'],
    default='Suburban',
)

In [12]:
# Add day of the week (pandas numbering: Monday=0 ... Sunday=6)
data['Weekday'] = data['date_key'].dt.weekday

## Delete redundant columns

In [None]:
# Right-hand keys of the impact/weather merge: after that inner merge they
# repeat date_key, grid_lat and grid_lon.
# NOTE(review): this list is only defined -- nothing in the visible notebook
# drops these columns from `data`, so they are still saved to disk.
redundant_col = ["Date", "lat", "lng"]


In [13]:
# Final dataframe
# Bare expression on the cell's last line, so the notebook renders `data`
# as the cell output.
data

Unnamed: 0,date_key,zip5,impact_score,grid_lat,grid_lon,Date,Time,ForecastRange,x,y,...,Snow_mixing_ratio_hybrid,Snow_mixing_ratio_isobaric,Total_cloud_cover_isobaric,Vertical_velocity_geometric_isobaric,Ice_growth_rate_altitude_above_msl,density,state,Region,USR,Weekday
0,2017-01-01,2722,20.268081,41.5,-71.0,2017-01-01,0,0,578,97,...,,,,,,0.000000,MA,New England Northeast,Rural,6
1,2017-01-01,2722,20.268081,41.5,-71.0,2017-01-01,6,0,578,97,...,,,,,,0.000000,MA,New England Northeast,Rural,6
2,2017-01-01,2722,20.268081,41.5,-71.0,2017-01-01,12,0,578,97,...,,,,,,0.000000,MA,New England Northeast,Rural,6
3,2017-01-01,2722,20.268081,41.5,-71.0,2017-01-01,18,0,578,97,...,,,,,,0.000000,MA,New England Northeast,Rural,6
4,2017-01-02,2722,16.868994,41.5,-71.0,2017-01-02,0,0,578,97,...,,,,,,0.000000,MA,New England Northeast,Rural,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389326,2019-12-30,98421,,47.5,-122.5,2019-12-30,18,0,950,170,...,0.0,2.890000e-09,0.0,0.030320,0.0,208.237785,WA,Pacific West,Rural,0
389327,2019-12-31,98421,,47.5,-122.5,2019-12-31,0,0,950,170,...,0.0,3.051257e-09,0.0,0.004692,0.0,208.237785,WA,Pacific West,Rural,1
389328,2019-12-31,98421,,47.5,-122.5,2019-12-31,6,0,950,170,...,0.0,3.020000e-09,0.0,0.022435,0.0,208.237785,WA,Pacific West,Rural,1
389329,2019-12-31,98421,,47.5,-122.5,2019-12-31,12,0,950,170,...,0.0,3.270000e-09,0.0,0.021333,0.0,208.237785,WA,Pacific West,Rural,1


In [14]:
### Save data as hdf5 file
# Store the merged frame in data.h5 under the key "df".
# NOTE(review): mode "a" opens an existing data.h5 for appending rather than
# starting from an empty file -- confirm that this is wanted on re-runs.
data.to_hdf(path_or_buf="data.h5", key="df", mode="a")