### __Finding the Most Important Factors in Weather Events using Decision Trees__

Uses decision trees to find the most important factors in determining the target variable (the column named by the variable 'root\_node'), looking at both categorical and numeric values.<br>
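To extract the image file, run the following in a terminal (the file name follows 'root\_node'; shown here for the default 'EVENT\_TYPE'): `dot -Tpng "Tree Visuals/tree_all_EVENT_TYPE.dot" -o tree_all_EVENT_TYPE.png`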

First Section: read in and clean the data<br>
Second Section: clean property and crop damage as well as month data<br>
Third Section: creates the decision tree algorithm<br>
Fourth Section: creates the decision tree visual with removed values<br>
Fifth Section: creates the decision tree visual with values<br>

Note: only the fourth _or_ fifth section should be run, as the fifth overwrites the fourth

In [14]:
import pandas as pd
from sklearn import tree
import calendar

# ---- configuration ----
# the node to be analyzed (the column the decision tree will predict)
root_node = 'EVENT_TYPE'
# what the depth of the leaf nodes will be
maxdepth = 3
# fraction (0-1) of the cleaned rows to keep
data_percentage = 0.1 # <0.2 recommended
# file name for the output ".dot" file
output_file = "tree_all_"+root_node+".dot"

# ---- preparing and cleaning data ----
# one CSV per year; read them all and stack them with a single concat
storm_files = [
    "Data/StormEvents_details-ftp_v1.0_d2019_c20200416.csv",
    "Data/StormEvents_details-ftp_v1.0_d2018_c20200317.csv",
    "Data/StormEvents_details-ftp_v1.0_d2017_c20200121.csv",
    "Data/StormEvents_details-ftp_v1.0_d2016_c20190817.csv",
    "Data/StormEvents_details-ftp_v1.0_d2015_c20191116.csv",
]
df = pd.concat([pd.read_csv(path) for path in storm_files])

# dropping columns that do not contain data or only apply to some events
# magnitude was dropped because it's hail size for some measurements, and wind speed for others
df = df.drop(columns = ['BEGIN_DATE_TIME','END_DATE_TIME','MAGNITUDE','MAGNITUDE_TYPE',
                        'FLOOD_CAUSE','CATEGORY','TOR_F_SCALE','TOR_LENGTH','TOR_WIDTH',
                        'TOR_OTHER_WFO','TOR_OTHER_CZ_STATE','TOR_OTHER_CZ_FIPS',
                        'TOR_OTHER_CZ_NAME','EPISODE_NARRATIVE','EVENT_NARRATIVE','DATA_SOURCE'])

# remove outside the continental US - currently disabled
#df = df.drop(df[(df.STATE == 'HAWAII') | (df.STATE == 'ALASKA') | (df.STATE == 'E PACIFIC') | (df.STATE == 'ATLANTIC NORTH') | (df.STATE == 'ATLANTIC SOUTH') | (df.STATE == 'GULF OF MEXICO') | (df.STATE == 'HAWAII WATERS') | (df.STATE == 'PUERTO RICO') | (df.STATE == 'VIRGIN ISLANDS') | (df.STATE == 'AMERICAN SAMOA')].index)

# calculate event's distance covered
# (Euclidean distance on the raw lat/lon differences, not a ground distance)
df['DISTANCE'] = ((df["BEGIN_LAT"]-df["END_LAT"])**2 + (df["BEGIN_LON"]-df["END_LON"])**2)**(1/2)

# calculate event mid points, and remove the start & end locations (might want to add those back in...?)
df["MID_LAT"] = (df["BEGIN_LAT"]+df["END_LAT"]) / 2
df["MID_LON"] = (df["BEGIN_LON"]+df["END_LON"]) / 2
df = df.drop(columns=["BEGIN_LAT","END_LAT","BEGIN_LON","END_LON"])

# remove any rows with missing data, then reset the index
df = df.dropna().reset_index(drop=True)

### SHRINKS DATA DOWN - TEMPORARY
# A seeded random sample is taken instead of the first rows: slicing the head
# of the concatenated frame keeps only rows from the first file(s) loaded, and
# a label-based .loc slice is end-inclusive, so it also kept one row too many.
# sort_index() keeps the surviving rows in their original order, and the index
# is reset so later cells can rely on positions 0..n-1.
df = (df.sample(frac=data_percentage, random_state=0)
        .sort_index()
        .reset_index(drop=True))

In [15]:
# convert the non-numeric values (i.e. 10.00k) to numeric (10000)
def unit_converter(df, *argv):
    """Convert suffixed amount strings (e.g. '10.00K') to floats, in place.

    Each named column is expected to hold strings made of a number followed
    by a one-letter magnitude suffix: K (thousand), M (million), B (billion)
    or T (trillion). The column is replaced by a float64 column so that
    downstream code (e.g. ``pd.get_dummies``) treats it as numeric rather
    than as a categorical/object column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated and also returned.
    *argv : str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the requested columns converted.

    Raises
    ------
    ValueError
        If a value ends in an unrecognised suffix (args are the number part
        and the suffix), or if the number part cannot be parsed as a float.
    """
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12}
    for column in argv:
        # already converted (e.g. the notebook cell was re-run): nothing to do
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        raw = df[column].astype(str)
        # last character -> multiplier; anything not in the table becomes NaN
        scale = raw.str[-1:].map(multipliers)
        unknown = scale.isna()
        if unknown.any():
            offender = raw[unknown].iloc[0]
            raise ValueError(offender[:-1], offender[-1:])

        # whole-column assignment gives a real float64 column; cell-by-cell
        # assignment would leave the dtype as object
        df[column] = raw.str[:-1].astype(float) * scale.astype(float)
    return df

df = unit_converter(df,'DAMAGE_PROPERTY','DAMAGE_CROPS')

# convert months to numerical representation (Jan = 1, Feb = 2, etc.)
# calendar.month_name[0] is an empty string, hence the `if num` filter
mo_to_num = {name: num for num, name in enumerate(calendar.month_name) if num}

# Skip when the column is already numeric so the cell can be re-run safely.
if not pd.api.types.is_numeric_dtype(df['MONTH_NAME']):
    month_numbers = df['MONTH_NAME'].map(mo_to_num)
    unmapped = month_numbers.isna()
    if unmapped.any():
        # same failure mode as a direct dictionary lookup of an unknown name
        raise KeyError(df.loc[unmapped, 'MONTH_NAME'].iloc[0])
    # Whole-column assignment yields an integer column; assigning cell by cell
    # leaves the dtype as object, which pd.get_dummies would one-hot encode.
    df['MONTH_NAME'] = month_numbers.astype(int)

In [16]:
# separate the root from the decision 'leaves':
# the target is the root_node column, the predictors are every other column
features = df.drop(columns=root_node)
target = df[root_node]

# create the categorical data (one indicator column per category)
X = pd.get_dummies(features, sparse=True)
# NOTE(review): the target is one-hot encoded too, so the classifier is fit
# against a 2-D indicator target rather than a single label column - confirm
# this is intended.
y = pd.get_dummies(target)

# create the decision tree; the fixed random_state keeps the fit reproducible
clf = tree.DecisionTreeClassifier(random_state=0, max_depth=maxdepth)
clf = clf.fit(X, y)

In [17]:
# create the tree visual w/o values
dot_source = tree.export_graphviz(clf, feature_names=X.columns)

# remove the long list of values in the visual:
# cut the text at every '\nvalue' marker, then for each piece after a marker
# throw away everything up to its first double quote and put the quote back
head, *tails = dot_source.split('\\nvalue')
trimmed = head + ''.join('"' + tail.partition('"')[2] for tail in tails)

with open("Tree Visuals/"+output_file, "w") as dot_file:
    dot_file.write(trimmed)

In [17]:
# create the tree visual w/ values
# (writes to the same path as the values-stripped export, replacing that file)
target_path = "Tree Visuals/"+output_file
with open(target_path, "w") as dot_file:
    tree.export_graphviz(clf, feature_names=X.columns, out_file=dot_file)