### Import

In [79]:
import os
import sys
import numpy as np
import pandas as pd
import shapefile 
import matplotlib.pyplot as plt

### Load Shapefile

In [98]:
# Open the US state shapefile; the reader is consumed by the centroid cell.
state_shapefile_path = '../../../data/tl_2017_us_state/tl_2017_us_state_lite.shp'
geostates = shapefile.Reader(state_shapefile_path)

### Get the geocentroid of each state from the shapefile records

In [122]:
# Build a lookup of state name -> (latitude, longitude) from the shapefile
# attribute records, and collect the state names in file order.
# NOTE(review): assumes record[6] is the state name and that the last two
# fields are the latitude and longitude as signed decimal strings
# (e.g. '+38.64', '-080.61') -- confirm against the shapefile's field list.
geoStates = {}
stateNames = []
for sr in geostates.shapeRecords():
    record = sr.record
    stateName = record[6]
    stateNames.append(stateName)
    # float() accepts a leading '+'/'-' and leading zeros, so the values can
    # be converted directly without stripping characters by hand.
    latitude = float(record[-2])
    longitude = float(record[-1])
    # Tuple order is (latitude, longitude); consumers must index accordingly.
    geoStates[stateName] = (latitude, longitude)

### Load and Combine Flu Data

In [132]:
# Collect every CSV below the data directory whose file name contains
# 'StateData', read them all, and stack them into one DataFrame restricted to
# (and renamed as) the five columns used by the rest of the notebook.
myfiles = []
for root, dirs, files in os.walk("../../../data/", topdown=False):
    for name in files:
        # Match on the real extension rather than any '.csv' substring.
        if name.endswith('.csv') and 'StateData' in name:
            myfiles.append(os.path.join(root, name))

# os.walk order depends on the file system; sort so the combined row order
# is reproducible from run to run.
myfiles.sort()

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not myfiles:
    raise FileNotFoundError(
        "No '*StateData*.csv' files found under '../../../data/'"
    )

frames = [pd.read_csv(myfile, delimiter=',') for myfile in myfiles]
data = pd.concat(frames, axis=0)
data = data[['STATENAME','ACTIVITY LEVEL','WEEKEND','WEEK','SEASON']]
data.columns = ['state','level','date','week','season']

### Date Transform 1

In [133]:
# Derive helper columns by fixed-position slicing of the raw strings:
#   year  <- 'date' from character index 7 to the end
#   level <- 'level' from character index 6 to the end (replaces the column)
#   month <- first three characters of 'date'
raw_dates = data['date']
raw_levels = data['level']
data['year'] = raw_dates.str[7:]
data['level'] = raw_levels.str[6:]
data['month'] = raw_dates.str[:3]

### Remove duplicates and excluded states

In [135]:
# Drop fully duplicated rows (keeping the first occurrence), then discard
# every row whose 'state' is one of the excluded names below.
excluded_states = [
    'Commonwealth of the Northern Mariana Islands',
    'Alaska',
    'Hawaii',
    'Virgin Islands',
    'Puerto Rico',
    'New York City',
]
data = data.drop_duplicates()
is_excluded = data['state'].isin(excluded_states)
data = data[~is_excluded]

### Add Centroid to DataFrame

In [136]:
# Attach each row's state centroid as 'latitude' and 'longitude' columns.
# geoStates maps state name -> (latitude, longitude), so the tuple is unpacked
# in that order; indexing it the other way round swaps the two columns.
# A state missing from geoStates raises KeyError.
states = data['state'].values
centroids = [geoStates[state] for state in states]
latitudes = [latitude for latitude, _ in centroids]
longitudes = [longitude for _, longitude in centroids]
data['latitude'] = latitudes
data['longitude'] = longitudes

### Parse dates and get the unique state names

In [137]:
# Parse the raw date strings into a datetime column. The deprecated
# `infer_datetime_format` flag is omitted: since pandas 2.0 it has no effect
# and only triggers a warning, and on older versions the parsed result is
# the same without it.
data['datetime'] = pd.to_datetime(data['date'])
# Sorted array of the distinct state names remaining after filtering.
stateNames = np.unique(data['state'].values)

### Create Timecourses for each state

In [139]:
# Write one CSV per state holding that state's rows in chronological order.
output_dir = '../../../data/statebased/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
for state in stateNames:
    subset = data[data['state'] == state]
    subset = subset.sort_values(by='datetime')
    subset.to_csv(os.path.join(output_dir, state + '.csv'), index=False)