## Load imports and datasets

In [45]:
# Runs all imports
import pandas as pd
import datetime
from datetime import timedelta
import numpy as np

# Lift pandas' display limits so every column and every row of a frame is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Log into Darwin SDK (disabled)
# SECURITY: a username and password used to be hardcoded on the login line below.
# They are redacted here and should be rotated; if this login is re-enabled, read
# the credentials from environment variables (or getpass) instead of committing them.
#from amb_sdk.sdk import DarwinSdk
#s = DarwinSdk()
#s.set_url('https://amb-demo-api.sparkcognition.com/v1/')
#s.auth_login_user(os.environ['DARWIN_USER'], os.environ['DARWIN_PASSWORD'])

## Load Main Dataset

In [46]:
# Builds one table from every gzip-compressed CSV in a directory
import os

DIR = 'data_details'

# Read each file through an explicit path instead of os.chdir(): the working
# directory is process-wide state and would stay changed if any read raised,
# breaking every later relative path. sorted() makes the row order reproducible,
# because os.listdir() returns entries in arbitrary order.
tables = [
    pd.read_csv(os.path.join(DIR, file), compression='gzip')
    for file in sorted(os.listdir(DIR))
]

# The per-file row labels are kept as-is (no ignore_index), so index values
# repeat across the concatenated files.
raw = pd.concat(tables)

raw.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,MAGNITUDE,MAGNITUDE_TYPE,FLOOD_CAUSE,CATEGORY,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH,TOR_OTHER_WFO,TOR_OTHER_CZ_STATE,TOR_OTHER_CZ_FIPS,TOR_OTHER_CZ_NAME,BEGIN_RANGE,BEGIN_AZIMUTH,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,200802,22,1300,200802,22,2200,14216,79884,NEW HAMPSHIRE,33,2008,February,Heavy Snow,Z,12,EASTERN HILLSBOROUGH,BOX,22-FEB-08 13:00:00,EST-5,22-FEB-08 22:00:00,0,0,0,0,0.00K,0.00K,Public,,,,,,,,,,,,,,,,,,,,,,A noreaster moved up the coast southeast of Ca...,,CSV
1,200804,1,352,200804,1,352,15549,88334,NEW HAMPSHIRE,33,2008,April,High Wind,Z,12,EASTERN HILLSBOROUGH,BOX,01-APR-08 03:52:00,EST-5,01-APR-08 03:52:00,0,0,0,0,0.00K,0.00K,Amateur Radio,52.0,MG,,,,,,,,,,,,,,,,,,,,Strong southwest flow behind a warm front allo...,An amateur radio operator recorded a wind gust...,CSV
2,200803,1,0,200803,1,1320,14773,83820,NEW HAMPSHIRE,33,2008,March,Heavy Snow,Z,12,EASTERN HILLSBOROUGH,BOX,01-MAR-08 00:00:00,EST-5,01-MAR-08 13:20:00,0,0,0,0,0.00K,0.00K,Trained Spotter,,,,,,,,,,,,,,,,,,,,,,Low pressure tracked from the Great Lakes acro...,,CSV
3,200801,14,500,200801,14,1700,13559,75727,NEW HAMPSHIRE,33,2008,January,Heavy Snow,Z,12,EASTERN HILLSBOROUGH,BOX,14-JAN-08 05:00:00,EST-5,14-JAN-08 17:00:00,0,0,0,0,10.00K,0.00K,Trained Spotter,,,,,,,,,,,,,,,,,,,,,,Low pressure moved up the Atlantic coast and s...,,CSV
4,200812,19,1353,200812,21,200,25148,146588,NEW HAMPSHIRE,33,2008,December,Heavy Snow,Z,12,EASTERN HILLSBOROUGH,BOX,19-DEC-08 13:53:00,EST-5,21-DEC-08 02:00:00,0,0,0,0,0.00K,0.00K,Trained Spotter,,,,,,,,,,,,,,,,,,,,,,An intensifying coastal low spread heavy snow ...,Six to eight inches of snow fell across easter...,CSV


## Reformat Data

In [47]:
data = raw.copy()

# --------------------- CLEANING ---------------------
def _split_yearmonth(yearmonth):
    """Return (year, month) string Series: the first four characters and the rest."""
    text = yearmonth.astype(str)
    return text.str[:4], text.str[4:]


def _clock_strings(hhmm):
    """Format a column of HHMM values (e.g. 0, 352, 1300) as 'HH:MM:00' strings."""
    text = hhmm.astype(str)
    # The hour is whatever precedes the last two characters; it is empty for
    # values below 100, so both parts are padded on the left with '0' to width 2.
    hour = text.str[:-2].str.rjust(2, '0')
    minute = text.str[-2:].str.rjust(2, '0')
    return hour + ':' + minute + ':00'


def _timestamp_strings(frame, prefix):
    """Join <prefix>_YEAR/_MONTH/_DAY/_TIME into 'YYYY-MM-D HH:MM:00' strings.

    The day is not zero-padded (e.g. '2012-07-1 00:00:00').
    """
    date = (frame[prefix + '_YEAR'].map(str) + '-'
            + frame[prefix + '_MONTH'].map(str) + '-'
            + frame[prefix + '_DAY'].map(str) + ' ')
    return date + _clock_strings(frame[prefix + '_TIME'])


# split the BEGIN_YEARMONTH and END_YEARMONTH columns into 2 columns each --> YEAR + MONTH
# (the original *_YEARMONTH columns are left in place)
for prefix in ('BEGIN', 'END'):
    year, month = _split_yearmonth(data[prefix + '_YEARMONTH'])
    data[prefix + '_YEAR'] = year
    data[prefix + '_MONTH'] = month

# add begin and end dates and times as 'YYYY-MM-D HH:MM:SS' strings
# (second loop so the new columns keep the order *_YEAR/*_MONTH first, then BEGIN, END)
for prefix in ('BEGIN', 'END'):
    data[prefix] = _timestamp_strings(data, prefix)

# Explore Data Components

In [49]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
# The 'count' row of this table is what the missing-value analysis reads.
desc = data.describe(include='all')
desc

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,MAGNITUDE,MAGNITUDE_TYPE,FLOOD_CAUSE,CATEGORY,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH,TOR_OTHER_WFO,TOR_OTHER_CZ_STATE,TOR_OTHER_CZ_FIPS,TOR_OTHER_CZ_NAME,BEGIN_RANGE,BEGIN_AZIMUTH,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE,BEGIN_YEAR,BEGIN_MONTH,END_YEAR,END_MONTH,BEGIN,END
count,683982.0,683982.0,683982.0,683982.0,683982.0,683982.0,683982.0,683982.0,683982,683982.0,683982.0,683982,683982,683982,683982.0,683982,683982,683982,683982,683982,683982.0,683982.0,683982.0,683982.0,568908,565471,683982,365516.0,235847,69724,277.0,15134,15134.0,15134.0,1903,1903,1903.0,1903,433481.0,433481,433481,433481.0,433481,433481,433481.0,433481.0,433481.0,433481.0,683982,492890,683982,683982.0,683982.0,683982.0,683982.0,683982,683982
unique,,,,,,,,,68,,,12,56,3,,4181,123,354343,9,347741,,,,,1502,455,44,,4,7,,7,,,85,39,,731,,16,42528,,16,42878,,,,,102145,428398,1,11.0,12.0,11.0,12.0,354343,347741
top,,,,,,,,,TEXAS,,,June,Thunderstorm Wind,C,,WASHINGTON,LWX,01-JUL-12 00:00:00,CST-6,31-JUL-12 23:59:00,,,,,0.00K,0.00K,Trained Spotter,,EG,Heavy Rain,,EF0,,,JAN,AL,,MADISON,,N,SPRINGFIELD,,N,SPRINGFIELD,,,,,A strong upper-level disturbance passed throug...,Wind gusts were estimated at 60 mph.,CSV,2011.0,6.0,2011.0,6.0,2012-07-1 00:00:00,2012-07-31 23:59:00
freq,,,,,,,,,47599,,,104721,172886,412313,,5422,23416,1163,324875,1107,,,,,404084,551922,125634,,171868,62904,,7901,,,140,192,,22,,104554,483,,96475,465,,,,,294,684,683982,79091.0,104721.0,79091.0,104721.0,1163,1107
mean,201286.66182,14.923438,1276.620968,201286.661822,16.625633,1465.462794,72095.015774,432003.999851,,32.312742,2012.807284,,,,98.37402,,,,,,0.04332,0.016777,0.008594,0.0033,,,,33.951083,,,1.490975,,3.067298,182.755379,,,103.7299,,2.410599,,,2.444181,,,37.838124,-90.261514,37.835244,-90.254036,,,,,,,,,
std,315.112678,9.215241,680.820287,315.112676,9.207156,612.990317,34440.257262,208244.474812,,18.15424,3.151748,,,,122.789436,,,,,,2.577603,3.045148,0.277897,0.135616,,,,25.179454,,,1.048165,,4.246596,287.020573,,,83.1857,,4.446881,,,4.471382,,,5.022057,11.1269,5.022675,11.124569,,,,,,,,,
min,200801.0,1.0,0.0,200801.0,1.0,0.0,11748.0,64592.0,,1.0,2008.0,,,,1.0,,,,,,0.0,0.0,0.0,0.0,,,,0.0,,,1.0,,0.01,1.0,,,1.0,,0.0,,,0.0,,,-14.4,-170.9198,-14.456,-170.8689,,,,,,,,,
25%,201007.0,7.0,753.0,201007.0,9.0,1100.0,42904.0,251659.25,,19.0,2010.0,,,,25.0,,,,,,0.0,0.0,0.0,0.0,,,,1.25,,,1.0,,0.43,50.0,,,47.0,,1.0,,,1.0,,,34.6026,-97.14,34.6,-97.1377,,,,,,,,,
50%,201302.0,15.0,1452.0,201302.0,17.0,1600.0,71618.0,432052.5,,31.0,2013.0,,,,63.0,,,,,,0.0,0.0,0.0,0.0,,,,50.0,,,1.0,,1.5,75.0,,,93.0,,1.0,,,1.0,,,38.1253,-89.9377,38.1232,-89.92,,,,,,,,,
75%,201601.0,23.0,1802.0,201601.0,25.0,1900.0,102405.0,612699.75,,46.0,2016.0,,,,117.0,,,,,,0.0,0.0,0.0,0.0,,,,52.0,,,1.0,,4.0,200.0,,,139.0,,3.0,,,3.0,,,41.3307,-81.98,41.33,-81.98,,,,,,,,,


Upstream analysis suggests that several fields do not contain enough information to support analysis. The following code block computes the fraction of missing values in each field and recommends dropping any field whose missing fraction exceeds a predefined threshold.

In [50]:
row_count = len(data.index)

# Fraction of missing values above which a field is recommended for dropping
threshold = 0.5

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so collect plain records and build the frame once.
records = []
for field, present in desc.loc['count'].items():
    # 'count' is the number of non-null values in the field
    missing = 1 - present / row_count
    records.append({
        'Field': field,
        'Missing': missing,
        'Reccomendation': 'Drop' if missing > threshold else 'Keep',
    })

# Column names (including the 'Reccomendation' spelling) are kept unchanged
analysis = pd.DataFrame(records, columns=['Field', 'Missing', 'Reccomendation'])

analysis

Unnamed: 0,Field,Missing,Reccomendation
0,BEGIN_YEARMONTH,0.0,Keep
1,BEGIN_DAY,0.0,Keep
2,BEGIN_TIME,0.0,Keep
3,END_YEARMONTH,0.0,Keep
4,END_DAY,0.0,Keep
5,END_TIME,0.0,Keep
6,EPISODE_ID,0.0,Keep
7,EVENT_ID,0.0,Keep
8,STATE,0.0,Keep
9,STATE_FIPS,0.0,Keep


In [None]:
# Columns to drop
drop_list = ["BEGIN_YEARMONTH",
             "END_YEARMONTH",
             "EPISODE_ID",
             "EVENT_ID",
             "STATE_FIPS",
             "BEGIN_DAY",
             "BEGIN_TIME",
             "END_DAY",
             "END_TIME",
             "CATEGORY",
             "CZ_TYPE",
             "CZ_FIPS",
             "CZ_NAME",
             "TOR_OTHER_WFO",
             "TOR_OTHER_CZ_NAME",
             "TOR_OTHER_CZ_STATE",
             "TOR_OTHER_CZ_FIPS",
             "WFO",
             "END_DATE_TIME",
             "CZ_TIMEZONE",
             "BEGIN_RANGE",
             "BEGIN_AZIMUTH",
             "BEGIN_YEAR",
             "BEGIN_MONTH",
             "END_YEAR",
             "END_MONTH",
            ]

# BEGIN and END are intentionally NOT dropped here: the feature-engineering cell
# computes DURATION_seconds from data['BEGIN'] and data['END'], and dropping them
# first makes that cell fail with a KeyError on a fresh Restart & Run All.

# One drop call instead of one DataFrame copy per column
data = data.drop(columns=drop_list)
    

#change DAMAGE_PROPERTY and DAMAGE_CROPS type from #.##K to float
def convertToFloat(x):
    """Convert a damage amount such as '10.00K', '1.5M' or '2B' to a float.

    The trailing letter is a multiplier: K = thousand, M = million, B = billion
    (matched case-insensitively, ignoring surrounding whitespace).

    Args:
        x: the raw cell value; it is converted with str() first, so non-string
            values such as 0.0 or NaN are accepted.

    Returns:
        The amount as a float, or 0.0 when the value is empty, has no recognised
        suffix (this includes the 0.0 fill value and NaN), or has a suffix but
        no parseable number in front of it.
    """
    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

    text = str(x).strip()
    # Guard the empty string: text[-1] would raise IndexError.
    if not text:
        return 0.0

    scale = multipliers.get(text[-1].upper())
    if scale is None:
        # NOTE(review): values without a suffix map to 0.0, as before -- confirm
        # that bare numbers never occur in the damage columns.
        return 0.0

    try:
        return float(text[:-1]) * scale
    except ValueError:
        # A suffix with nothing numeric before it (e.g. 'K') carries no amount.
        return 0.0

    
#print(pd.value_counts(data.DAMAGE_PROPERTY))
#print(pd.value_counts(data.DAMAGE_CROPS))
# Assign the filled column back instead of calling fillna(inplace=True) on
# data[col]: that chained in-place call operates on an intermediate object, is
# deprecated, and does not update `data` under pandas Copy-on-Write.
for damage_col in ('DAMAGE_PROPERTY', 'DAMAGE_CROPS'):
    # missing amounts count as 0.0, then '#.##K'-style strings become floats
    data[damage_col] = data[damage_col].fillna(0.0).apply(convertToFloat)


# Keep only the events that report both a begin and an end location.
# Note: this removes 250501 rows, reducing our dataset by 37%.
data = data.dropna(subset=['BEGIN_LOCATION', 'END_LOCATION'])



In [None]:
# --------------------- FEATURE ENGINEERING ---------------------
# add duration column: elapsed seconds between the BEGIN and END timestamp strings
duration = pd.to_datetime(data['END']) - pd.to_datetime(data['BEGIN'])

data['DURATION_seconds'] = duration.dt.total_seconds()

# separates hail size / wind speed MAGNITUDE:
#   WIND_SPEED = MAGNITUDE where MAGNITUDE_TYPE is 'MG' or 'EG'
#   HAIL_SIZE  = MAGNITUDE where MAGNITUDE_TYPE is missing
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data['WIND_SPEED'] = np.where(data['MAGNITUDE_TYPE'].isin(['MG','EG']), data['MAGNITUDE'], np.nan)
data['HAIL_SIZE'] = np.where(data['MAGNITUDE_TYPE'].isna(), data['MAGNITUDE'], np.nan)

#divide the data into 12 distinct groups by event type:
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1. Only the last expression of a cell is displayed,
# so this result (and data.columns below) is evaluated but not shown.
data['EVENT_TYPE'].value_counts()
"""
Wind (COLE):
    Thunderstorm Wind
    High Wind
    Marine Thunderstorm Wind
    Marine High Wind
    Strong Wind
    Marine Strong Wind

Winter Weather (ETHAN):
    Winter Weather
    Winter Storm
    Heavy Snow
    Blizzard
    Frost/Freeze
    Ice Storm
    Sleet
    Lake-Effect Snow

Rain (FREYA):
    Heavy Rain

Hail (RUOCHEN):
    Hail
    Marine Hail

Flood (FREYA):
    Flash Flood
    Flood
    Coastal Flood
    Lakeshore Flood

Drought (ETHAN):
    Drought

Tornado (COLE):
    Tornado

Heat (ETHAN):
    Heat
    Excessive Heat

Cold (COLE):
    Cold/Wind Chill
    Extreme Cold/Wind Chill

Lightning (FREYA):
    Lightning

Wildfire (RUOCHEN):
    Wildfire

Tides/Currents (RUOCHEN):
    High Surf
    Rip Current
    Astronomical Low Tide
    Storm Surge/Tide
"""
data.columns
# Preview the first 50 rows of the cleaned, feature-engineered table
data.head(50)

In [11]:
# List the column names of the unmodified `raw` table as a NumPy array
raw.columns.values

array(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE',
       'STATE_FIPS', 'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE',
       'CZ_FIPS', 'CZ_NAME', 'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE',
       'END_DATE_TIME', 'INJURIES_DIRECT', 'INJURIES_INDIRECT',
       'DEATHS_DIRECT', 'DEATHS_INDIRECT', 'DAMAGE_PROPERTY',
       'DAMAGE_CROPS', 'SOURCE', 'MAGNITUDE', 'MAGNITUDE_TYPE',
       'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE', 'TOR_LENGTH',
       'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
       'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'DATA_SOURCE'],
      dtype=object)