# Weather Underground Hurricane Data

-----

## Initial Exploration

A notebook for exploring the hurricane data acquired by the `src/get_data.py` script.

In [None]:
# Imports
import os
import pickle
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

In [None]:
# Load in the data
raw_data_dir = "../data/raw"

# Map each pickle's base name (the text before its first '.') to its full path
fnames = {
    x.split('.')[0]: os.path.join(raw_data_dir, x)
    for x in os.listdir(raw_data_dir)
    if x.endswith('.pkl')
}

# Unpickle every discovered file, keyed by that same base name
raw_data = {}
for name, path in fnames.items():
    with open(path, 'rb') as handle:
        raw_data[name] = pickle.load(handle)

In [None]:
# - Region Data Processing
def convert_region_data_helper(x):
    """Parse one raw region value.

    An empty string becomes NaN; any other string is parsed as a float
    after its commas are removed.
    """
    return np.nan if x == '' else float(x.replace(',', ''))

def convert_region_data(data_dict):
    """Return a new dict with every value of ``data_dict`` converted.

    Keys are kept as-is; each value is passed through
    ``convert_region_data_helper``.
    """
    return {
        field: convert_region_data_helper(raw_value)
        for field, raw_value in data_dict.items()
    }
    
# Build one float-typed frame per region, then join them side by side.
region_frames = []
for region, region_data in raw_data['region_data'].items():
    conv_data = {k: convert_region_data(v) for k, v in region_data.items()}
    # After the transpose, the outer keys of region_data form the row index
    # and the inner keys form the columns.
    t_region_df = pd.DataFrame(conv_data).T.astype(float)
    t_region_df.columns = pd.MultiIndex.from_product(
        [[region], t_region_df.columns], names=['Region', 'Statistic'])
    region_frames.append(t_region_df)

# Concatenate once: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration.
# NOTE: pd.concat raises ValueError if no regions were loaded.
region_df = pd.concat(region_frames, axis=1)
region_df.sort_index(inplace=True)

In [None]:
# - Simple plots
def plot_helper(data, ax, title=None, ylabel=None):
    """Draw ``data`` on ``ax`` and apply any labels that were supplied.

    ``data.plot(ax=ax)`` is always called; the title and y-axis label are
    only set when the corresponding argument is not None.
    """
    data.plot(ax=ax)
    # (setter name, text) pairs, applied in title-then-ylabel order
    for setter_name, text in (('set_title', title), ('set_ylabel', ylabel)):
        if text is not None:
            getattr(ax, setter_name)(text)

# Pull one frame per statistic out of the (Region, Statistic) column index
hurricanes, damages, deaths = (
    region_df.xs(stat, axis=1, level='Statistic')
    for stat in ('Hurricanes', 'Damage', 'Deaths')
)

# One stacked panel per statistic
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(12, 12))
panels = (
    (hurricanes, ax1, '# of Hurricanes'),
    (damages, ax2, 'Damage (Millions USD)'),
    (deaths, ax3, 'Deaths'),
)
for panel_data, panel_ax, panel_label in panels:
    plot_helper(panel_data, panel_ax, ylabel=panel_label)

# Title only the top panel and label the x-axis only on the bottom one
ax1.set_title('Regional Hurricane Data by Year')
ax3.set_xlabel('Date');

In [None]:
# - Region-Year Data Processing
def _parse_storm_dates(year, raw_dates):
    """Turn a raw date string into a list of datetimes within ``year``."""
    parsed = []
    # Dashes and spaces both separate the individual 'month/day' tokens
    for token in raw_dates.replace('-', ' ').split(' '):
        token = token.strip()
        # Blank tokens and tokens starting with '999' are skipped
        # (presumably '999' is a missing-value marker -- TODO confirm)
        if token == '' or token.startswith('999'):
            continue

        parts = token.split('/')
        # Reduce the month modulo 13; a result of 0 is treated as January
        month = int(parts[0]) % 13
        if month == 0:
            month = 1
        parts[0] = month

        str_dt = str(year) + ''.join("/{}".format(x) for x in parts)
        try:
            t_dtime = datetime.strptime(str_dt, '%Y/%m/%d')
        except ValueError:
            # The token did not parse (e.g. a day past the end of the month):
            # fall back to the last day of that month.
            if month == 12:
                # There is no month 13 to step back from; December always
                # ends on the 31st.
                t_dtime = datetime.strptime("{}/12/31".format(year), '%Y/%m/%d')
            else:
                first_of_next = datetime.strptime(
                    "{}/{}/1".format(year, month + 1), '%Y/%m/%d')
                t_dtime = first_of_next - timedelta(days=1)

        parsed.append(t_dtime)

    return parsed


def _parse_storm_number(raw_value):
    """Convert a raw numeric field to a float, mapping the known word values."""
    if raw_value == '' or raw_value == 'Unknown':
        return np.nan
    if raw_value == 'Minimal':
        return 0.
    if raw_value == 'Millions':
        return 1000000.
    # Drop thousands separators and qualifier symbols before parsing
    for ch in (',', '>', '<', '+'):
        raw_value = raw_value.replace(ch, '')
    return float(raw_value)


def process_region_year_data_helper(year, k, v):
    """Convert a single storm field to its processed value.

    Parameters
    ----------
    year : value used as the year when building dates (formatted with ``str``)
    k : field name
    v : raw field value

    Returns
    -------
    * ``'Dates'``: a list of ``datetime`` objects, one per date token.
    * ``'Max Winds'``, ``'Min Pressure'``, ``'Deaths'``, ``'Damage'``: a float
      (NaN for ``''``/``'Unknown'``, 0 for ``'Minimal'``, 1000000 for
      ``'Millions'``).
    * Any other field, including ``'Storm'``: ``v`` unchanged.

    Raises
    ------
    ValueError
        If a date token or numeric string cannot be parsed.
    """
    numeric_flds = ('Max Winds', 'Min Pressure', 'Deaths', 'Damage')
    if k == 'Dates':
        return _parse_storm_dates(year, v)
    if k in numeric_flds:
        return _parse_storm_number(v)
    return v

def process_region_year_data(region, year, storm_id, storm_data):
    """Process one storm's raw record and build its unique id.

    The id is the upper-cased first letter of each space-separated word in
    ``region``, followed by ``year`` and ``storm_id`` formatted with ``{:02}``.

    Parameters
    ----------
    region : region name; also stored under the ``'Region'`` key of the result
    year : year value, forwarded to the per-field converter
    storm_id : storm identifier within the year
    storm_data : dict mapping field name to raw value

    Returns
    -------
    tuple
        ``(unq_id, ret_data)`` where ``ret_data`` maps ``'Region'`` and every
        field of ``storm_data`` to its converted value.

    Raises
    ------
    Exception
        Whatever the per-field converter raises is re-raised after the
        offending key and value have been printed.
    """
    # Skip empty words so repeated, leading or trailing spaces in the region
    # name cannot cause an IndexError on x[0].
    unq_id = ''.join(x[0].upper() for x in region.split(' ') if x)
    unq_id += '{}{:02}'.format(year, storm_id)

    ret_data = {'Region': region}
    for k, v in storm_data.items():
        try:
            ret_data[k] = process_region_year_data_helper(year, k, v)
        except Exception:
            # Show which field failed, then propagate the original error
            print(k, v)
            raise

    return unq_id, ret_data

# Accumulate every processed storm under its unique id.
# The innermost loop variable must not be named `storm_data`: reusing that
# name rebinds the accumulator to each raw record, so the results would be
# written into the raw dicts instead of being collected here.
storm_data = dict()
for region, year_data in raw_data['region_year_data'].items():
    for yr, data in year_data.items():
        for storm_id, raw_storm in data.items():
            t_id, t_data = process_region_year_data(region, yr, storm_id, raw_storm)
            storm_data[t_id] = t_data