In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Read the tornado dataset from the Resources folder and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Resources/tornados.csv")
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'tornados.csv'

In [None]:
# Inspect the frame's dimensions as a (rows, columns) tuple
df.shape

In [None]:
# Summarise each column's dtype and non-null count to spot missing values
df.info()

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep="first").sum()

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep="first", inplace=True)


In [None]:
# Make sure the duplicates were dropped (expected result: 0)
df.duplicated(keep="first").sum()

In [None]:
# Count missing values in every column
df.isnull().sum()

In [None]:
# Remove, in place, every row that has at least one missing value
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Confirm the `mag` column no longer holds missing values (expected result: 0)
df["mag"].isna().sum()

In [None]:
# Confirm the `loss` column no longer holds missing values (expected result: 0)
df["loss"].isna().sum()

In [None]:
# Second pass over every column: all missing-value counts should now be 0
df.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts after dropping duplicates and missing rows
df.info()

In [None]:
# Lookup table: two-letter state/territory code -> US region label.
# Entries keep their original order (alphabetical by state name, then DC and PR).
state_regions = {
    "AL": "Southeast", "AK": "West", "AZ": "West", "AR": "South",
    "CA": "West", "CO": "West", "CT": "Northeast", "DE": "Northeast",
    "FL": "Southeast", "GA": "Southeast", "HI": "West", "ID": "West",
    "IL": "Midwest", "IN": "Midwest", "IA": "Midwest", "KS": "Midwest",
    "KY": "South", "LA": "South", "ME": "Northeast", "MD": "Northeast",
    "MA": "Northeast", "MI": "Midwest", "MN": "Midwest", "MS": "South",
    "MO": "Midwest", "MT": "West", "NE": "Midwest", "NV": "West",
    "NH": "Northeast", "NJ": "Northeast", "NM": "West", "NY": "Northeast",
    "NC": "Southeast", "ND": "Midwest", "OH": "Midwest", "OK": "South",
    "OR": "West", "PA": "Northeast", "RI": "Northeast", "SC": "Southeast",
    "SD": "Midwest", "TN": "South", "TX": "South", "UT": "West",
    "VT": "Northeast", "VA": "Southeast", "WA": "West", "WV": "South",
    "WI": "Midwest", "WY": "West", "DC": "Northeast", "PR": "Southeast",
}

In [None]:
# Add a `region` column by looking up each row's state code in `state_regions`.
# Series.map performs the lookup in one vectorised pass instead of a Python loop.
region_by_row = df["st"].map(state_regions)

# Codes missing from the lookup come back as NaN. Fail loudly, with the same
# exception type as a plain dict lookup, but report every unmapped code at once.
unmapped_codes = df.loc[region_by_row.isna(), "st"].unique().tolist()
if unmapped_codes:
    raise KeyError(f"No region defined in state_regions for state code(s): {unmapped_codes}")

df["region"] = region_by_row
df.head()

In [None]:
# Parse the `date` column into pandas datetime values
df["date"] = pd.to_datetime(df.date)

In [None]:
# Derive a `month` column holding the month name of each parsed date
df["month"] = df.date.dt.month_name(locale=None)

In [None]:
# Check for the new columns added so far: region and month
# NOTE(review): `num_tornados` is only assigned in a later cell
# (df["num_tornados"] = df["st"].map(num_tornados)), so it is not expected here
# unless the loaded CSV already contains it - confirm.
df.info()

In [None]:
# Number of tornado records per month name
df["month"].value_counts()

In [None]:
# Distinct month names present in the data, in sorted order
sorted(df["month"].unique())

In [None]:
# Distinct month names in the order they appear after sorting rows by the `mo` column
df.sort_values("mo")["month"].unique()

In [None]:
# Number of tornado records per region
df["region"].value_counts()

In [None]:
# Distinct region labels assigned to the rows
df["region"].unique()

In [None]:
# No earlier cell creates `num_tornados` (it is first assigned further down via
# df["st"].map(num_tornados)), so on a fresh kernel reading it here would raise
# AttributeError unless the loaded CSV already carries that column.
# Create it when absent: each row gets the number of rows sharing its state code,
# which is the same value the later assignment produces.
if "num_tornados" not in df.columns:
    df["num_tornados"] = df["st"].map(df["st"].value_counts())

# How many rows carry each per-state count
df["num_tornados"].value_counts()

In [None]:
# Distinct `num_tornados` values, listed in ascending order
df.sort_values("num_tornados")["num_tornados"].unique()

In [None]:
# Distinct `num_tornados` values in order of first appearance
df["num_tornados"].unique()

In [None]:
# Review column dtypes and non-null counts again
df.info()

In [None]:
# Distinct state codes present in the data
df.st.unique()

In [None]:
# Bar chart of tornado record counts per state, most frequent state first
state_ax = df["st"].value_counts().plot.bar(
    color=sns.color_palette("crest"),
    figsize=(10, 7),
)
state_ax.set_ylabel("Count")
state_ax.set_xlabel("State")
state_ax.set_title("Number of Tornadoes by State")

In [None]:
# Calendar order of month names, used to order the `month` column chronologically
month_order = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]


In [None]:
# Turn `month` into an ordered categorical so sorting and plotting follow calendar order
calendar_month_dtype = pd.CategoricalDtype(categories=month_order, ordered=True)
df["month"] = df["month"].astype(calendar_month_dtype)

In [None]:
# Box plot of tornado magnitude for each month
mag_fig, mag_ax = plt.subplots(figsize=(10, 7))
sns.boxplot(x="month", y="mag", data=df, palette="crest", ax=mag_ax)
mag_ax.set_xlabel("Month")
mag_ax.set_ylabel("Magnitude")
mag_ax.set_title("Box Plot: Magnitude by Month")
plt.xticks(rotation=90)  # rotate the month labels to vertical
plt.show()

In [None]:
# Series of record counts indexed by state code, largest count first
num_tornados = df["st"].value_counts()
num_tornados

In [None]:
# Hard-coded per-state values stored as strings, keyed by two-letter state code.
# NOTE(review): presumably a snapshot of the `df.st.value_counts()` output - confirm.
# NOTE(review): no visible cell reads this dict; the `num_tornados` column is
# built from the `num_tornados` Series instead - confirm before relying on it.
state_tornados = {
    "TX": "4601", "FL": "2585", "OK": "2499", "MS": "2209",
    "IA": "1897", "LA": "1762", "KS": "1728", "MO": "1711",
    "AL": "1687", "GA": "1618", "NE": "1547", "IL": "1407",
    "AR": "1310", "IN": "1189", "TN": "1156", "OH": "1088",
    "NC": "1070", "WI": "1048", "KY": "904", "MN": "902",
    "MI": "881", "SC": "722", "VA": "705", "PA": "698",
    "SD": "658", "ND": "561", "CO": "530", "NY": "410",
    "MD": "324", "CA": "260", "WY": "244", "NM": "218",
    "MT": "161", "MA": "158", "WV": "124", "AZ": "116",
    "ID": "111", "NJ": "105", "CT": "89", "ME": "79",
    "NH": "74", "WA": "73", "OR": "71", "UT": "66",
    "DE": "46", "VT": "45", "PR": "20", "HI": "18",
    "NV": "18", "RI": "10", "DC": "2", "AK": "1",
}

In [None]:
# Attach to every row the total record count of that row's state
df["num_tornados"] = df.st.map(num_tornados)

In [None]:
# One row per (region, state) with the number of tornado records in that state.
# Counting rows with size() avoids the previous
# groupby(...)['num_tornados'].value_counts().reset_index() form, which raises
# ValueError on pandas < 2.0 (the counts Series and an index level are both named
# 'num_tornados') and adds a redundant 'count' column on pandas >= 2.0.
# The resulting columns are: region, st, num_tornados.
region_state_data = (
    df.groupby(["region", "st"])
    .size()
    .reset_index(name="num_tornados")
)

In [None]:
# Display the per-region, per-state table that feeds the sunburst chart
region_state_data

In [None]:
# Sunburst chart: inner ring = region, outer ring = state, wedge size = num_tornados
fig = px.sunburst(
    region_state_data,
    path=["region", "st"],
    values="num_tornados",
    color_discrete_sequence=["#7DBA91", "#277A8C", "#3F908E", "#1B6488", "#5AA590", "#244B7F"],
)
fig.update_layout(
    # The title is drawn inside the top margin; a top margin of 0 leaves it no
    # room, so reserve some space there while keeping the other margins at 0.
    margin=dict(t=50, l=0, r=0, b=0),
    title="Tornados by State and Region",
)

fig.show()

In [None]:
# Write the processed frame to tornados.csv in the working directory, without the index column
df.to_csv(path_or_buf="tornados.csv", index=False)