In [None]:
import gzip
import csv
import json
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from io import TextIOWrapper
from sklearn.linear_model import LinearRegression
import geopandas
import numpy as np
from IPython.core.display import HTML
from matplotlib.colors import ListedColormap
from matplotlib.animation import FuncAnimation
from shapely.geometry import Point
%matplotlib inline 

In [None]:
def convert_K(s): # from CS 301 P5 --> converting string damage amount to int
    """Convert a thousands-suffixed amount string (e.g. ``"12.5K"``) to an int.

    Parameters
    ----------
    s : str
        Amount text containing a ``K`` suffix, with an optional decimal point.

    Returns
    -------
    int
        The amount multiplied by 1,000. Fractional digits beyond the third
        are truncated; an empty amount (``"K"``) yields 0.

    Raises
    ------
    ValueError
        If the remaining text is not a plain decimal number.
    """
    whole, _, fraction = s.replace("K", "").partition(".")
    # Pad to exactly three fractional digits, then drop any extras so that an
    # over-precise input such as "1.2345K" gives 1234 rather than 12345.
    fraction = (fraction + "0" * 3)[:3]
    return int(whole + fraction)
    
def convert_M(s):
    """Convert a millions-suffixed amount string (e.g. ``"1.5M"``) to an int.

    Parameters
    ----------
    s : str
        Amount text containing an ``M`` suffix, with an optional decimal point.

    Returns
    -------
    int
        The amount multiplied by 1,000,000. Fractional digits beyond the
        sixth are truncated; an empty amount (``"M"``) yields 0.

    Raises
    ------
    ValueError
        If the remaining text is not a plain decimal number.
    """
    whole, _, fraction = s.replace("M", "").partition(".")
    # Pad to exactly six fractional digits, then drop any extras so that an
    # over-precise input is truncated instead of being scaled up.
    fraction = (fraction + "0" * 6)[:6]
    return int(whole + fraction)

def convert_B(s):
    """Convert a billions-suffixed amount string (e.g. ``"1.5B"``) to an int.

    Parameters
    ----------
    s : str
        Amount text containing a ``B`` suffix, with an optional decimal point.

    Returns
    -------
    int
        The amount multiplied by 1,000,000,000. Fractional digits beyond the
        ninth are truncated; an empty amount (``"B"``) yields 0.

    Raises
    ------
    ValueError
        If the remaining text is not a plain decimal number.
    """
    whole, _, fraction = s.replace("B", "").partition(".")
    # Pad to exactly nine fractional digits, then drop any extras so that an
    # over-precise input is truncated instead of being scaled up.
    fraction = (fraction + "0" * 9)[:9]
    return int(whole + fraction)

def convert(s):
    """Dispatch a suffixed amount string to the matching unit converter.

    The suffixes are tried in the order K, M, B. A suffix only counts when it
    appears after the first character of ``s``. Returns the converter's int
    result, or ``None`` when no suffix is found.
    """
    converters = (("K", convert_K), ("M", convert_M), ("B", convert_B))
    for suffix, converter in converters:
        if s.find(suffix) > 0:
            return converter(s)
    return None

In [None]:
# Read every member of the archive, keep only its tornado rows, and combine
# the pieces once after the loop (concatenating inside the loop rebuilt the
# whole frame for every file).
df_list = []
with ZipFile("StormEvents.zip") as zf:
    for file in zf.namelist():
        with zf.open(file) as f:
            # Each archive member is read as a gzip-compressed CSV.
            # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and
            # removed in 2.0; switch to `on_bad_lines="skip"` when upgrading.
            curr_data = pd.read_csv(f, compression='gzip', error_bad_lines=False)
        df_list.append(curr_data[curr_data["EVENT_TYPE"] == 'Tornado'])

tornado_data = pd.concat(df_list)

In [None]:
# Count tornadoes by BEGIN_TIME // 100 (presumably the hour of an HHMM value -
# confirm against the data dictionary).
start_times = tornado_data["BEGIN_TIME"]

start_times_dict = {}
for time in start_times:
    hour = time // 100
    start_times_dict[hour] = start_times_dict.get(hour, 0) + 1

# Rebuild the tally with its keys in ascending order.
start_times_dict_sorted = sorted(start_times_dict)
start_times_dict_ii = {}
for key in start_times_dict_sorted:
    start_times_dict_ii[key] = start_times_dict[key]

start_times_dict_ii

In [None]:
def get_ax(figsize=(4,4)):
    """Create a new figure of the given size and return its single Axes.

    The right and top spines are hidden before the Axes is returned.
    """
    _, axes = plt.subplots(figsize=figsize)
    for side in ('right', 'top'):
        axes.spines[side].set_visible(False)
    return axes

# Bar chart of the per-hour tornado tally built in the previous cell.
hour_bins = list(start_times_dict_ii.keys())
hour_counts = list(start_times_dict_ii.values())

ax = get_ax((20, 10))
# The original title ended with a stray second ")".
ax.set_title("When do Tornadoes Occur? (1950-2019)", size=40)
ax.set_ylabel("# of Tornadoes", fontsize=30)
ax.set_xlabel("Start Time (rounded down to nearest hour)", fontsize=30)
ax.set_xticks(hour_bins)
ax.tick_params(labelsize=20)

ax.bar(hour_bins, hour_counts, color = "black")
None

In [None]:
# Display the column labels of the combined tornado frame.
tornado_data.columns

In [None]:
from collections import OrderedDict

# Count tornadoes per month, keyed by the first three letters of MONTH_NAME.
tornado_month = tornado_data["MONTH_NAME"]

tornado_month_dict = {}
for month in tornado_month:
    abbreviation = month[:3]
    tornado_month_dict[abbreviation] = tornado_month_dict.get(abbreviation, 0) + 1

# from https://stackoverflow.com/questions/36596118/how-to-sort-a-python-dictionary-having-month-names-as-keys

# Re-order the tally into calendar order.
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
data = tornado_month_dict
tornado_month_dict_ii = OrderedDict(
    (abbreviation, data[abbreviation]) for abbreviation in sorted(data, key=months.index)
)
tornado_month_dict_ii

In [None]:
# Bar chart of the calendar-ordered monthly tally.
month_labels = list(tornado_month_dict_ii.keys())
month_counts = list(tornado_month_dict_ii.values())

ax = get_ax((20, 10))
ax.set_xlabel("Month", fontsize=25)
ax.set_ylabel("# of Tornadoes", fontsize=30)
ax.set_title("# Tornadoes per Month (1950-2019)", size=40)
ax.tick_params(labelsize=15)

ax.bar(month_labels, month_counts, color = "black")
None

In [None]:
# Count tornadoes per year.
tornado_year = tornado_data["YEAR"]

tornado_year_dict = {}
for year in tornado_year:
    tornado_year_dict[year] = tornado_year_dict.get(year, 0) + 1

# Fit a least-squares trend line: count = slope * year + intercept.
pair_list = list(tornado_year_dict.items())
df = pd.DataFrame.from_records(pair_list, columns=["Year", "Count"])
r = LinearRegression()
columns = ["Year"]
# A 1-D target keeps coef_ one-dimensional and intercept_ scalar, so no
# multi-dimensional array has to be coerced to float.
r.fit(df[columns].values, df["Count"].values)
slope = float(r.coef_[0])
intercept = float(r.intercept_)
# One point per year, 1950 through 2019 inclusive (70 values).
x = np.linspace(1950, 2019, 70)
y = slope*x + intercept

# The original scattered the points onto the previous cell's axes before this
# figure existed; everything is now drawn on the new axes only.
ax = get_ax((20, 10))
ax.set_title("# Tornadoes per Year (1950-2019)", size=40)
ax.set_ylabel("# of Tornadoes", fontsize=30)
ax.set_xlabel("Year", fontsize=30)
ax.tick_params(labelsize=20)

ax.scatter(list(tornado_year_dict.keys()), list(tornado_year_dict.values()), color = "black")
ax.plot(x, y, color = "red")
print(slope, intercept)
None

In [None]:
# Accumulate, per TOR_F_SCALE value, [number of records, total property damage].
tornado_damage_type = tornado_data[["DAMAGE_PROPERTY", "TOR_F_SCALE"]]

tornado_damage_type_dict = {}

# Walk the two columns in lockstep instead of doing repeated .iloc[i] row
# lookups, each of which materialises a whole row.
for damage, f_scale in zip(tornado_damage_type["DAMAGE_PROPERTY"],
                           tornado_damage_type["TOR_F_SCALE"]):
    if type(damage) is str:
        current_damage = convert(damage)
        if current_damage is None:
            # No recognised K/M/B suffix: skip the record.
            continue
    elif type(damage) is int:
        current_damage = damage
    elif type(damage) is float: # float NaN
        continue
    else:
        current_damage = 0
    # Skip records with a missing scale or the "EFU" scale.
    if type(f_scale) is not str or f_scale == "EFU":
        continue
    totals = tornado_damage_type_dict.setdefault(f_scale, [0, 0])
    totals[0] += 1
    totals[1] += current_damage
tornado_damage_type_dict

In [None]:
# Average damage per record for each scale, in units of 10,000, with the
# scales in sorted order.
tornado_damage_type_dict_sorted = sorted(tornado_damage_type_dict)
tornado_damage_type_dict_ii = {}
for key in tornado_damage_type_dict_sorted:
    record_count, damage_total = tornado_damage_type_dict[key]
    tornado_damage_type_dict_ii[key] = int(damage_total / record_count) / 10000
tornado_damage_type_dict_ii

In [None]:
# Log-scale bar chart of the average damage per scale.
scale_labels = list(tornado_damage_type_dict_ii.keys())
average_damage = list(tornado_damage_type_dict_ii.values())

ax = get_ax((20, 10))
ax.set_xlabel("Tornado Type", fontsize=30)
ax.set_ylabel("Avg Damage (tens of thousands)", fontsize=30)
ax.set_title("Average Property Damage by Tornado Type (1950-2019)", size=40)
ax.tick_params(labelsize=20)
ax.set_yscale("log")
ax.bar(scale_labels, average_damage, color = "black")
None

In [None]:
# Tornado count per state, with the listed non-contiguous entries removed.
tornado_states = tornado_data["STATE"].value_counts().drop([
    "VIRGIN ISLANDS",
    "DISTRICT OF COLUMBIA",
    "PUERTO RICO",
    "ALASKA",
    "HAWAII",
])
tornado_states

In [None]:
# Count tornadoes per scale with the "E" removed from the label (e.g. "EF1"
# becomes "F1"), drop the "FU" entry, keep the three most frequent scales and
# fold the rest into one "F3+" bucket.
scale_labels = tornado_data["TOR_F_SCALE"].str.replace("E", "")
type_counts = scale_labels.value_counts().drop("FU")
type_counts_ii = type_counts.iloc[:3]
strong = type_counts.iloc[3:].sum()
type_counts_ii["F3+"] = strong
type_counts_ii

In [None]:
# Decade of each tornado, split by strength. The original compared against
# ("F3" or "F4" or "F5"), which Python evaluates to just "F3" (and likewise
# "F0"), so F4/F5 and F1/F2 records were silently dropped. `isin` tests
# membership in the full set.
tornado_scale_labels = tornado_data["TOR_F_SCALE"].str.replace("E", "")
tornado_decades = tornado_data["YEAR"] // 10 * 10

strong_tornado_decades = tornado_decades[tornado_scale_labels.isin(["F3", "F4", "F5"])]

weak_tornado_decades = tornado_decades[tornado_scale_labels.isin(["F0", "F1", "F2"])]

In [None]:
# Display how many entries of weak_tornado_decades fall in each decade.
weak_tornado_decades.value_counts()

In [None]:
# Display how many entries of strong_tornado_decades fall in each decade.
strong_tornado_decades.value_counts()

In [None]:
# One row per decade, with separate columns for the strong and weak counts.
strong_counts_by_decade = strong_tornado_decades.dropna().astype(int).value_counts()
weak_counts_by_decade = weak_tornado_decades.dropna().astype(int).value_counts()

tor_type_decade_df = pd.DataFrame(
    {"F3+": strong_counts_by_decade, "other": weak_counts_by_decade}
)
tor_type_decade_df

In [None]:
# Stacked bar chart of strong vs. other tornado counts per decade.
ax = tor_type_decade_df.plot.bar(stacked=True, figsize=(14, 7), color=("k", ".75"))
ax.set_title("Tornado Strength vs Decade (1950-2010)", size=35)
ax.set_xlabel("Decade", fontsize=30)
ax.set_ylabel("# of Tornadoes", fontsize=30)
ax.tick_params(labelsize=15)
ax.legend(loc=2, prop={'size': 20})
for side in ("right", "top"):
    ax.spines[side].set_visible(False)
None

In [None]:
# State-by-decade table of tornado counts. `assign` builds a new frame, which
# avoids assigning a column on a frame derived from tornado_data (the source
# of a SettingWithCopyWarning in the original).
decade_tornado_state = tornado_data[["STATE", "YEAR"]].assign(
    YEAR=lambda frame: frame["YEAR"] // 10 * 10
)
decades_state_df = pd.crosstab(decade_tornado_state["STATE"], decade_tornado_state["YEAR"])
decades_state_df = decades_state_df.drop("VIRGIN ISLANDS")

In [None]:
def draw_frame(decade):
    """Draw one animation frame: a state map shaded by tornado count.

    Reads the module-level ``decades_state_df`` (state x decade counts) and
    draws onto the module-level ``fig``; both must exist before this is
    called.

    Parameters
    ----------
    decade : int
        Column label of ``decades_state_df`` to draw (e.g. ``1950``).
    """
    us = geopandas.read_file("zip://states.zip") # march 27th lecture
    us["NAME"] = us["NAME"].str.upper()

    # (minimum count, fill colour) in ascending order; the last threshold a
    # state reaches decides its colour, and states below the first stay white.
    color_steps = [
        (300/7, "lightyellow"),
        (500/7, "palegoldenrod"),
        (1000/7, "lightsalmon"),
        (2000/7, "coral"),
        (2500/7, "red"),
        (4000/7, "brown"),
        (5000/7, 'darkred'),
    ]

    # `.items()` replaces `.iteritems()`, which pandas 2.0 removed.
    state_colors = {}
    for state_name, count in decades_state_df[decade].items():
        color = "white"
        for minimum, step_color in color_steps:
            if count >= minimum:
                color = step_color
        state_colors[state_name] = color

    # One vectorised lookup by state name instead of scanning every map row
    # for every state.
    us[decade] = us["NAME"].map(state_colors)

    # Reuse a single Axes across frames: add_subplot() may create a new Axes
    # on each call, which would stack axes and titles on top of each other.
    ax = fig.axes[0] if fig.axes else fig.add_subplot()
    ax.clear()
    ax.set_title(str(decade)+"s", fontsize=30)
    continent = us[~us["STUSPS"].isin(["AK", "HI", "PR"])]

    # White base layer first, then the shaded layer on top.
    continent.plot(color="white", edgecolor="k", ax=ax)

    # Regions with no entry in decades_state_df have no colour; draw them white.
    continent.plot(color=continent[decade].fillna("white"), edgecolor="k", legend=True, ax=ax)

    for frame_ax in fig.axes:
        frame_ax.axis("off")

In [None]:
# Render one frame per decade, save the video markup to disk, and show it inline.
decade_frames = list(range(1950, 2011, 10))

fig, ax = plt.subplots(figsize=(12, 6))
anim = FuncAnimation(fig, func=draw_frame, frames=decade_frames, interval=750)
html = anim.to_html5_video()
with open("final_project_visualization.html", "w") as f:
    f.write(html)
# Close the figure so only the video is displayed, not a static plot.
plt.close(fig)
display(HTML(html))

In [None]:
# Path length, width and scale (with the "E" removed from the label) per
# tornado. `assign` returns a new frame, avoiding attribute assignment on a
# frame derived from tornado_data. The original also reassigned TOR_LENGTH
# and TOR_WIDTH to themselves, which had no effect.
length_width_type = tornado_data[["TOR_LENGTH", "TOR_WIDTH", "TOR_F_SCALE"]].assign(
    TOR_F_SCALE=lambda frame: frame["TOR_F_SCALE"].str.replace("E", "")
)

In [None]:
# Cross-tabulate direct-injury counts (rows) against scale (columns).
injuries_and_scale = tornado_data[["TOR_F_SCALE", "INJURIES_DIRECT"]]
injuries_type = pd.crosstab(
    index=injuries_and_scale["INJURIES_DIRECT"],
    columns=injuries_and_scale["TOR_F_SCALE"],
)

In [None]:
# Tornadoes from 2000 onward, reduced to the size/impact columns, with the "E"
# removed from the scale label. `assign` returns a new frame rather than
# writing into a slice of the filtered frame.
newer_tornadoes = tornado_data[tornado_data["YEAR"] >= 2000]
newer_tornadoes_length_width = newer_tornadoes[["TOR_WIDTH", "TOR_LENGTH", "TOR_F_SCALE", "DEATHS_DIRECT",
                                                "INJURIES_DIRECT", "DAMAGE_PROPERTY", "YEAR"]].assign(
    TOR_F_SCALE=lambda frame: frame["TOR_F_SCALE"].str.replace("E", "")
)

In [None]:
# Keep only the year and direct-death columns for the fatality summary.
deaths_per_year = tornado_data.loc[:, ["YEAR", "DEATHS_DIRECT"]]

In [None]:
# Total direct deaths per year. One grouped sum replaces re-filtering the
# frame once per year, and the result is ordered by year regardless of the
# order in which the source rows were loaded.
year_dict = deaths_per_year.groupby("YEAR")["DEATHS_DIRECT"].sum().to_dict()

# Built from the plain dict so the series carries no index or column name.
deaths_per_year_series = pd.Series(year_dict)

# Per-decade average, keyed by the decade's midpoint year (e.g. 1955) and
# computed as the sum of each year's total divided by 10.
decade_dict = {}

for year, deaths in year_dict.items():
    midpoint = year // 10 * 10 + 5
    decade_dict[midpoint] = decade_dict.get(midpoint, 0) + deaths / 10


deaths_per_decade_series = pd.Series(decade_dict)

In [None]:
# Yearly fatalities with the per-decade average overlaid.
ax = get_ax((15, 10))
ax.set_title("Tornado Fatalities by Year (1950-2019)", size=30)
ax.set_xlabel("Year", fontsize=20)
ax.set_ylabel("# of Deaths", fontsize=20)
for series, line_color, line_label in (
    (deaths_per_year_series, "k", "Single Year"),
    (deaths_per_decade_series, "red", "10 Year Avg"),
):
    series.plot(ax=ax, color=line_color, label=line_label)
ax.legend(loc="best", fontsize=15)
None

In [None]:
# Wisconsin tornadoes from 2000 onward.
in_wisconsin = tornado_data["STATE"] == "WISCONSIN"
since_2000 = tornado_data["YEAR"] >= 2000
local_tornadoes = tornado_data[in_wisconsin & since_2000]

In [None]:
# Start coordinates and scale (with the "E" removed from the label) for the
# Wisconsin subset. `assign` returns a new frame, avoiding the chained
# assignment on a slice of local_tornadoes.
locations_local_tornadoes = local_tornadoes[["BEGIN_LAT", "BEGIN_LON", "TOR_F_SCALE"]].assign(
    TOR_F_SCALE=lambda frame: frame["TOR_F_SCALE"].str.replace("E", "")
)


In [None]:
import matplotlib

us = geopandas.read_file("zip://states.zip") # march 27th lecture
us["NAME"] = us["NAME"].str.upper()

# Turn the Wisconsin start coordinates into point geometries.
df = locations_local_tornadoes
gdf = geopandas.GeoDataFrame(df, geometry=geopandas.points_from_xy(df.BEGIN_LON, df.BEGIN_LAT))
gdf = gdf.reset_index()

# Marker colour per scale; F3 and above share one colour. Anything not listed
# (the unknown scale "FU", or a missing value) is drawn white. A single
# vectorised lookup replaces the row-by-row loop, which raised on a missing
# scale value.
wisconsin_scale_colors = {
    "F0": "lightsalmon",
    "F1": "red",
    "F2": "darkred",
    "F3": "k",
    "F4": "k",
    "F5": "k",
}
gdf["color"] = gdf["TOR_F_SCALE"].map(wisconsin_scale_colors).fillna("white")


In [None]:
from matplotlib.lines import Line2D

# Wisconsin outline with one marker per tornado start point.
ax = us[us.NAME == 'WISCONSIN'].plot(color='white', edgecolor='black', figsize=((10, 15)))
ax.set_title("Wisconsin Tornado Locations (2000-present)", fontsize=30)
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)
ax.axis("off")

# One legend handle per colour used for the points.
legend_elements = [
    Line2D([0], [0], marker='o', color=shade, label=scale_label,
           markerfacecolor=shade, markersize=5)
    for scale_label, shade in (
        ("F0", "lightsalmon"),
        ("F1", "red"),
        ("F2", "darkred"),
        ("F3+", "k"),
    )
]
ax.legend(handles=legend_elements, loc='best', fontsize=15)

gdf.plot(ax=ax, marker="o", markersize=40, color=gdf["color"], label=gdf["color"])

None

In [None]:
# Tornadoes from 2000 onward, excluding the listed non-contiguous regions,
# with the "E" removed from the scale label. One combined mask replaces the
# repeated filters, and `assign` replaces the column assignment on a frame
# derived from tornado_data.
excluded_states = ["ALASKA", "HAWAII", "VIRGIN ISLANDS", "PUERTO RICO"]

usa_tornadoes_2000 = tornado_data[["BEGIN_LAT", "BEGIN_LON", "TOR_F_SCALE", "YEAR", "STATE"]]
keep_rows = (usa_tornadoes_2000["YEAR"] >= 2000) & ~usa_tornadoes_2000["STATE"].isin(excluded_states)
usa_tornadoes_2000 = usa_tornadoes_2000[keep_rows].assign(
    TOR_F_SCALE=lambda frame: frame["TOR_F_SCALE"].str.replace("E", "")
)

In [None]:
us_2000 = geopandas.read_file("zip://states.zip") # march 27th lecture
us_2000["NAME"] = us_2000["NAME"].str.upper()

# Turn the start coordinates into point geometries.
df = usa_tornadoes_2000
gdf_us = geopandas.GeoDataFrame(df, geometry=geopandas.points_from_xy(df.BEGIN_LON, df.BEGIN_LAT))
gdf_us = gdf_us.reset_index()

# scale -> (colour name, alpha, marker).
us_scale_styles = {
    "FU": ("white", 0.0, "."),
    "F0": ("lightyellow", .1, "."),
    "F1": ("lightsalmon", .15, "."),
    "F2": ("tomato", .2, "."),
    "F3": ("red", .5, "o"),
    "F4": ("darkred", 1.0, "o"),
    "F5": ("k", 1.0, "s"),
}
# Missing or unrecognised scales are drawn fully transparent. The original
# relied on a bare `except:` for missing values and reused the previous row's
# alpha for unrecognised ones.
hidden_style = ("white", 0.0, ".")

point_styles = [us_scale_styles.get(scale, hidden_style) for scale in gdf_us["TOR_F_SCALE"]]

# Each colour is stored as an RGBA tuple; dtype=object keeps every tuple as a
# single cell value.
gdf_us["color"] = pd.Series(
    [matplotlib.colors.to_rgba(name, alpha=alpha) for name, alpha, _marker in point_styles],
    index=gdf_us.index,
    dtype=object,
)
gdf_us["marker"] = [marker for _name, _alpha, marker in point_styles]

In [None]:
# State outlines (minus AK, HI, PR) with one marker per tornado start point.
outside_map = us_2000["STUSPS"].isin(["AK", "HI", "PR"])
us_2000 = us_2000[~outside_map]
ax = us_2000.plot(color='white', edgecolor='black', figsize=((15,20)))
ax.set_title("United States Tornado Locations (2000-present)", fontsize=30)
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)
ax.axis("off")

# One legend handle per scale colour.
legend_elements = [
    Line2D([0], [0], marker='o', color=shade, label=scale_label,
           markerfacecolor=shade, markersize=5)
    for scale_label, shade in (
        ("F0", "lightyellow"),
        ("F1", "lightsalmon"),
        ("F2", "tomato"),
        ("F3", "red"),
        ("F4", "darkred"),
        ("F5", "k"),
    )
]
ax.legend(handles=legend_elements, loc='best', fontsize=15)

gdf_us.plot(ax=ax, marker="o", markersize=5, color=gdf_us["color"], label=gdf_us["color"])

None