In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from helpers import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Merge the yearly storm-event CSV exports for 2008-2022 into one frame.
# The paths are generated from the year range instead of being typed out one
# by one; the range runs newest-first (2022 down to 2008) so the concatenated
# row order is the same as when the files were listed by hand.
recent_storm_paths = [
    f"resources/stormevents_{year}.csv" for year in range(2022, 2007, -1)
]
# ignore_index=True discards each file's own row labels and renumbers 0..n-1.
mergestorm_df = pd.concat(map(pd.read_csv, recent_storm_paths), ignore_index=True)
mergestorm_df.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,202202,20,2118,202202,20,2218,165464,999902,NEVADA,32,...,,,,,,,,Strong winds increased ahead of an approaching...,"Station (UP994) 3.1 SE West Wendover, Elevatio...",CSV
1,202202,21,800,202202,22,1000,165465,999903,NEVADA,32,...,,,,,,,,A low centered over northern and central Nevad...,Thirteen inches fell at station (BCSN2) Big Cr...,CSV
2,202202,22,200,202202,22,900,165465,999904,NEVADA,32,...,,,,,,,,A low centered over northern and central Nevad...,Fifteen inches fell at station (TJMN2) Toe Jam...,CSV
3,202202,18,1609,202202,18,1609,165611,1001181,ATLANTIC SOUTH,87,...,7.0,SE,PONTE VEDRA,30.05,-81.17,30.05,-81.17,Pre-frontal showers and thunderstorms moved so...,A brief waterspout was observed offshore of So...,CSV
4,202202,2,0,202202,3,0,165668,1001527,AMERICAN SAMOA,97,...,5.0,NNW,VAITOGI,-14.333,-170.7157,-14.3393,-170.7268,A surface trough over the Islands held the po...,"Over a 24-hour period, WSO Pago Pago recorded ...",CSV


In [3]:
# Merge the yearly storm-event CSV exports for the earliest available data
# years (1950-1964) into one frame.
# The paths are generated from the year range instead of being typed out one
# by one; the range runs oldest-first (1950 up to 1964) so the concatenated
# row order is the same as when the files were listed by hand.
earliest_storm_paths = [
    f"resources/stormevents_{year}.csv" for year in range(1950, 1965)
]
# ignore_index=True discards each file's own row labels and renumbers 0..n-1.
mergestorm1950_1964_df = pd.concat(
    map(pd.read_csv, earliest_storm_paths), ignore_index=True
)
mergestorm1950_1964_df.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,195004,28,1445,195004,28,1445,,10096222,OKLAHOMA,40,...,0,,,35.12,-99.2,35.17,-99.2,,,PUB
1,195004,29,1530,195004,29,1530,,10120412,TEXAS,48,...,0,,,31.9,-98.6,31.73,-98.6,,,PUB
2,195007,5,1800,195007,5,1800,,10104927,PENNSYLVANIA,42,...,0,,,40.58,-75.7,40.65,-75.47,,,PUB
3,195007,5,1830,195007,5,1830,,10104928,PENNSYLVANIA,42,...,0,,,40.6,-76.75,,,,,PUB
4,195007,24,1440,195007,24,1440,,10104929,PENNSYLVANIA,42,...,0,,,41.63,-79.68,,,,,PUB


In [4]:
def clean_storm_df(dataFrame, narrow_cols, only_impactful):
    """Return a cleaned, column-reduced copy of a storm-events frame.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Raw storm-event data. It is not modified.
    narrow_cols : list of str
        Columns to keep. Must include TOR_F_SCALE, DAMAGE_PROPERTY,
        DAMAGE_CROPS, DEATHS_DIRECT, DEATHS_INDIRECT, INJURIES_DIRECT and
        INJURIES_INDIRECT, because all of them are read below.
    only_impactful : bool
        When truthy, keep only rows with at least one death or one injury.

    Returns
    -------
    pandas.DataFrame
        The selected columns, minus the four direct/indirect death and injury
        columns, plus "TOTAL DEATHS", "TOTAL INJURIES" and "TOTAL DAMAGES",
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a required column is missing from ``dataFrame`` / ``narrow_cols``.
    """
    # Take an explicit copy: assigning columns onto the bare selection
    # `dataFrame[narrow_cols]` triggers pandas' SettingWithCopyWarning and
    # leaves it ambiguous whether the caller's frame could be touched.
    narrow_df = dataFrame[narrow_cols].copy()

    # translate tornado F scale into simple integer scale
    # (retype_tornado_scale is not defined in this notebook; presumably it
    # comes from the `helpers` star import - confirm there)
    narrow_df['TOR_F_SCALE'] = narrow_df['TOR_F_SCALE'].apply(retype_tornado_scale)

    # Missing damage entries become the string "0.00K" first, so that every
    # value handed to retype_damage_value is in the same textual form; the
    # helper then re-types them to support math operations.
    for damage_col in ("DAMAGE_PROPERTY", "DAMAGE_CROPS"):
        narrow_df[damage_col] = (
            narrow_df[damage_col].fillna("0.00K").apply(retype_damage_value)
        )

    # merge deaths/injuries/damages columns
    narrow_df["TOTAL DEATHS"] = narrow_df["DEATHS_DIRECT"] + narrow_df["DEATHS_INDIRECT"]
    narrow_df["TOTAL INJURIES"] = narrow_df["INJURIES_DIRECT"] + narrow_df["INJURIES_INDIRECT"]
    narrow_df["TOTAL DAMAGES"] = narrow_df["DAMAGE_PROPERTY"] + narrow_df["DAMAGE_CROPS"]

    # remove now extraneous columns; damages columns remain as we perform individual analyses
    narrow_df = narrow_df.drop(
        columns=["INJURIES_DIRECT", "INJURIES_INDIRECT", "DEATHS_DIRECT", "DEATHS_INDIRECT"]
    )

    # Narrow down events that have had at least one death and/or at least one injury
    if only_impactful:
        # still leaves in rows with 0 damages values
        has_casualty = (narrow_df["TOTAL DEATHS"] > 0) | (narrow_df["TOTAL INJURIES"] > 0)
        narrow_df = narrow_df.loc[has_casualty]

    # Renumber rows 0..n-1; returning the new frame avoids in-place mutation.
    return narrow_df.reset_index(drop=True)

In [5]:
# Columns kept for the analysis. The order here is the column order of the
# cleaned frames, so it is preserved exactly.
narrow_cols = [
    # when / where / what
    "YEAR", "STATE", "EVENT_TYPE",
    # human impact (folded into totals by clean_storm_df)
    "INJURIES_DIRECT", "INJURIES_INDIRECT",
    "DEATHS_DIRECT", "DEATHS_INDIRECT",
    # economic impact
    "DAMAGE_PROPERTY", "DAMAGE_CROPS",
    # reporting source and event magnitude
    "SOURCE", "MAGNITUDE", "MAGNITUDE_TYPE",
    # tornado-specific fields
    "TOR_F_SCALE", "TOR_LENGTH", "TOR_WIDTH", "TOR_OTHER_CZ_STATE",
    # location
    "BEGIN_LOCATION", "BEGIN_LAT", "BEGIN_LON", "END_LAT", "END_LON",
    # identifiers
    "EPISODE_ID", "EVENT_ID",
]

# Impactful-only views: events with at least one death and/or injury.
cleaned_2000s_df = clean_storm_df(mergestorm_df, narrow_cols, only_impactful=True)
cleaned_1950s_df = clean_storm_df(mergestorm1950_1964_df, narrow_cols, only_impactful=True)
cleaned_1950s_df.head()

Unnamed: 0,YEAR,STATE,EVENT_TYPE,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,MAGNITUDE,MAGNITUDE_TYPE,TOR_F_SCALE,TOR_LENGTH,...,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_ID,EVENT_ID,TOTAL DEATHS,TOTAL INJURIES,TOTAL DAMAGES
0,1950,PENNSYLVANIA,Tornado,25000.0,0.0,,0.0,,2.0,12.9,...,,40.58,-75.7,40.65,-75.47,,10104927,0,2,25000.0
1,1950,PENNSYLVANIA,Tornado,250000.0,0.0,,0.0,,3.0,4.7,...,,40.2,-76.12,40.27,-76.07,,10104931,0,1,250000.0
2,1950,OKLAHOMA,Tornado,250000.0,0.0,,0.0,,2.0,6.8,...,,35.0,-96.25,35.07,-96.17,,10099490,0,6,250000.0
3,1950,OKLAHOMA,Tornado,250000.0,0.0,,0.0,,3.0,9.4,...,,34.88,-99.28,35.0,-99.2,,10096220,1,1,250000.0
4,1950,OKLAHOMA,Tornado,250000.0,0.0,,0.0,,4.0,4.5,...,,35.08,-96.4,35.13,-96.35,,10096223,5,32,250000.0


In [6]:
# Same cleaning as the impactful-only frames, but keeping every event
# (including those with no deaths or injuries).
other_2000s_df = clean_storm_df(mergestorm_df, narrow_cols, only_impactful=False)
other_1950s_df = clean_storm_df(mergestorm1950_1964_df, narrow_cols, only_impactful=False)

# Number of recorded events per calendar year, for each era.
other_counts_1950s = other_1950s_df['YEAR'].value_counts()
other_counts_2000s = other_2000s_df['YEAR'].value_counts()
print(other_counts_2000s)

YEAR
2011    79091
2008    71190
2022    69670
2019    67861
2012    64503
2010    62807
2018    62697
2021    61389
2020    61279
2013    59986
2014    59475
2015    57906
2009    57398
2017    57029
2016    56005
Name: count, dtype: int64


In [7]:
# create a summary dataframe with basic stats
def get_summary_df(to_summarize):
    """Build a one-row DataFrame of basic statistics for a pandas Series.

    Parameters
    ----------
    to_summarize : pandas.Series
        Numeric values to summarise (callers in this notebook pass
        ``value_counts()`` results).

    Returns
    -------
    pandas.DataFrame
        A single row labelled 0 with the columns "Total Elements" (the sum of
        the values), "Mean", "Median", "Mode", "Variance" and "Std Deviation".
        Variance and standard deviation use ``ddof=0``.
    """
    stats = {
        "Total Elements": to_summarize.sum(),
        "Mean": to_summarize.mean(),
        "Median": to_summarize.median(),
        # mode() returns a Series (there can be several modes). Because the
        # frame is built with index=[0], only the entry labelled 0 - the
        # first listed mode - ends up in the result.
        "Mode": to_summarize.mode(),
        "Variance": np.var(to_summarize, ddof=0),
        "Std Deviation": np.std(to_summarize, ddof=0),
    }
    return pd.DataFrame(stats, index=[0])

# Summary statistics of the number of storms per year, one table per era.
summary_1950s = get_summary_df(other_counts_1950s)
summary_2000s = get_summary_df(other_counts_2000s)

print("All Storms")
for era_summary in (summary_1950s, summary_2000s):
    print(era_summary)

All Storms
   Total Elements         Mean  Median  Mode       Variance  Std Deviation
0           22087  1472.466667  1813.0   223  670573.982222     818.885818
   Total Elements          Mean   Median   Mode      Variance  Std Deviation
0          948286  63219.066667  61389.0  56005  3.765093e+07     6136.03512


In [8]:
# realize that the old data is much less extensive than new:
# count how many events of each EVENT_TYPE were recorded in each era
type_counts_1950s = other_1950s_df['EVENT_TYPE'].value_counts()
type_counts_2000s = other_2000s_df['EVENT_TYPE'].value_counts()

# One print call, newline-separated: the 1950s headline, its full breakdown,
# a blank line, then only the headline for 2008-2022.
print(
    f"{len(type_counts_1950s)} type(s) of storm event from 1950-1964",
    type_counts_1950s,
    "",
    f"{len(type_counts_2000s)} type(s) of storm event from 2008-2022",
    sep="\n",
)

3 type(s) of storm event from 1950-1964
EVENT_TYPE
Tornado              8575
Thunderstorm Wind    7515
Hail                 5997
Name: count, dtype: int64

55 type(s) of storm event from 2008-2022


In [9]:
# Frequency of each TOR_F_SCALE value in the early data.
tornado_counts_1950s = other_1950s_df['TOR_F_SCALE'].value_counts()

# exclude unknown values (-1.0), which pop up in the more recent data
is_known_scale = other_2000s_df['TOR_F_SCALE'] != -1.0
tornado_counts_2000s = other_2000s_df.loc[is_known_scale, 'TOR_F_SCALE'].value_counts()

print("Tornados")
for scale_counts in (tornado_counts_1950s, tornado_counts_2000s):
    print(get_summary_df(scale_counts))

Tornados
   Total Elements         Mean  Median  Mode      Variance  Std Deviation
0            7748  1291.333333  1063.5    36  1.119362e+06    1057.998845
   Total Elements         Mean  Median  Mode      Variance  Std Deviation
0           20315  3385.833333  1268.5    13  1.592090e+07    3990.100643
