In [61]:
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter




## Data cleaning for dataset 2015 - 2019

##### Load and Inspect Dataset

In [None]:
# Read the 2015-2019 tornado track export and preview its first rows.
tracks_2015_2019_csv = '../Datasets/Tornado_tracks_2015_2019.csv'
tornado_tracks_df = pd.read_csv(tracks_2015_2019_csv)
tornado_tracks_df.head()

In [88]:
# (rows, columns) of the 2015-2019 frame.
tornado_tracks_df.shape

(291, 39)

In [None]:
# List the column names.
tornado_tracks_df.columns

In [None]:
# Display the DataFrame for a visual check.
tornado_tracks_df

##### Review Data types, look for nulls, and duplicates

In [None]:
# Count the missing values in each column.
tornado_tracks_df.isnull().sum()

In [None]:
# Show the dtype of each column.
tornado_tracks_df.dtypes

##### Reverse Geocoding to return zipcode using latitude and longitude

In [37]:
# Create the Nominatim geocoder client used for the reverse lookups below.
# user_agent is the name this notebook identifies itself with to the service.
geolocator = Nominatim(user_agent="tornado_project")
geolocator

<geopy.geocoders.nominatim.Nominatim at 0x1395bbb8910>

In [97]:
# Reverse-geocode each tornado's starting point and store the postcode in 'Zip'.
#
# - Requests go through geopy's RateLimiter (imported at the top of the
#   notebook) so they are spaced at least one second apart, as the public
#   Nominatim service requires, and transient failures are retried.
#   swallow_exceptions=False keeps a persistent failure loud instead of
#   silently turning it into a missing zip.
# - reverse() returns None when nothing is found for a coordinate; that case is
#   recorded as a missing zip (None) instead of crashing on `.raw`.
# - Rows with a missing coordinate are recorded as None without calling the API.
# - Results are collected in row order and assigned as a list, so the cell does
#   not depend on the frame having a default 0..n-1 index.
reverse_geocode = RateLimiter(
    geolocator.reverse,
    min_delay_seconds=1,
    swallow_exceptions=False,
)

zip_codes = []
for lat, lon in zip(tornado_tracks_df['BEGIN_LAT'], tornado_tracks_df['BEGIN_LON']):
    if pd.isna(lat) or pd.isna(lon):
        zip_codes.append(None)
        continue
    location = reverse_geocode(f"{lat}, {lon}", timeout=10)
    address = location.raw.get('address', {}) if location is not None else {}
    zip_codes.append(address.get('postcode'))

tornado_tracks_df['Zip'] = zip_codes




In [None]:
# Display the frame to inspect the 'Zip' column added by the geocoding step.
tornado_tracks_df

In [None]:
tornado_tracks_df['Zip'].unique()  # check whether any rows came back without a zip code (None)

In [None]:
# Keep only the rows that received a usable zip code (neither missing nor blank).
zip_present = tornado_tracks_df['Zip'].notna()
zip_not_blank = tornado_tracks_df['Zip'] != ''
tornado_tracks_df = tornado_tracks_df[zip_present & zip_not_blank]

tornado_tracks_df

In [None]:
# Re-check the distinct zip codes after filtering out rows without one.
tornado_tracks_df['Zip'].unique()

#### Save DF to new CSV

In [134]:
# Write the cleaned 2015-2019 data into ../Datasets/, which is where the
# "Combine both CSVs" step reads it back from. Writing to the working directory
# instead left that later read pointing at a missing (or stale) file.
tornado_tracks_df.to_csv("../Datasets/tornado_tracks_2015_2019_cleaned.csv", index=False)

## Data cleaning for the dataset 2020 - 2025

##### Load and Inspect Data

In [None]:
# Read the 2020-2025 tornado track export and preview its first rows.
tracks_2020_2025_csv = '../Datasets/Tornado_tracks_2020_2025.csv'
tornado_tracks_2020_2025_df = pd.read_csv(tracks_2020_2025_csv)
tornado_tracks_2020_2025_df.head()

In [143]:
# (rows, columns) of the 2020-2025 frame.
tornado_tracks_2020_2025_df.shape

(258, 39)

In [None]:
# Lift pandas' row display limit so the whole frame is shown.
# Note: set_option changes the setting for the rest of the session, not just this cell.
pd.set_option('display.max_rows', None)

tornado_tracks_2020_2025_df

##### Review Data types, look for nulls, and duplicates

In [None]:
# Show the dtype of each column.
tornado_tracks_2020_2025_df.dtypes

In [110]:
# Convert the coordinate columns to numbers so the reverse-geocoding step
# receives numeric latitudes/longitudes. Values that cannot be parsed become
# NaN (errors='coerce') instead of raising.
coordinate_columns = ('BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON')
for column_name in coordinate_columns:
    numeric_values = pd.to_numeric(tornado_tracks_2020_2025_df[column_name], errors='coerce')
    tornado_tracks_2020_2025_df[column_name] = numeric_values


In [None]:
# Verify that the coordinate columns now have a numeric dtype.

tornado_tracks_2020_2025_df.dtypes

In [None]:
# Count the null values in each column to find the columns that contain nulls.

tornado_tracks_2020_2025_df.isnull().sum()

In [None]:
# Drop the rows that are missing a begin date or any begin/end coordinate.
# Nulls in other columns (such as the event narrative) are left in place.
required_columns = ["BEGIN_DATE", "BEGIN_LAT", "BEGIN_LON", "END_LAT", "END_LON"]
tornado_tracks_2020_2025_df = tornado_tracks_2020_2025_df.dropna(subset=required_columns)

tornado_tracks_2020_2025_df.isnull().sum()

In [114]:
# Reset the index after dropping rows: later code that looks rows up by the
# labels 0..n-1 via .loc would otherwise hit KeyErrors on the removed labels.

tornado_tracks_2020_2025_df = tornado_tracks_2020_2025_df.reset_index(drop=True)

In [None]:
# Display the frame after dropping incomplete rows and resetting the index.
tornado_tracks_2020_2025_df

##### Reverse Geocoding to return zipcode using latitude and longitude

In [118]:
# Create the Nominatim geocoder client for the 2020-2025 lookups, this time
# with a 30-second default timeout for its requests.
geolocator = Nominatim(user_agent="tornado_project", timeout=30)
geolocator

<geopy.geocoders.nominatim.Nominatim at 0x1395bd7a9c0>

In [119]:
# Reverse-geocode each tornado's starting point and store the postcode in 'Zip'.
#
# History: an earlier version of this cell raised errors until 'Zip' was
# initialised with None instead of 0, the lat/lon columns were converted to
# floats, and the index was reset after rows were dropped.
#
# - Requests go through geopy's RateLimiter (imported at the top of the
#   notebook) so they are spaced at least one second apart, as the public
#   Nominatim service requires, and transient failures are retried.
#   swallow_exceptions=False keeps a persistent failure loud instead of
#   silently turning it into a missing zip.
# - reverse() returns None when nothing is found for a coordinate; that case is
#   recorded as a missing zip (None) instead of crashing on `.raw`.
# - Rows with a missing coordinate are recorded as None without calling the API.
# - Results are collected in row order and assigned as a list, so the cell no
#   longer depends on the frame having a default 0..n-1 index.
reverse_geocode = RateLimiter(
    geolocator.reverse,
    min_delay_seconds=1,
    swallow_exceptions=False,
)

zip_codes = []
for lat, lon in zip(
    tornado_tracks_2020_2025_df['BEGIN_LAT'],
    tornado_tracks_2020_2025_df['BEGIN_LON'],
):
    if pd.isna(lat) or pd.isna(lon):
        zip_codes.append(None)
        continue
    location = reverse_geocode(f"{lat}, {lon}", timeout=10)
    address = location.raw.get('address', {}) if location is not None else {}
    zip_codes.append(address.get('postcode'))

tornado_tracks_2020_2025_df['Zip'] = zip_codes


In [None]:
# Display the frame to inspect the 'Zip' column added by the geocoding step.
tornado_tracks_2020_2025_df

In [121]:
# List the distinct zip codes; a None entry means some rows came back without one.
tornado_tracks_2020_2025_df['Zip'].unique()

array(['40207', '40214', '40222', None, '40059', '40031', '40165',
       '40299', '40205', '40228', '40241', '40242', '40220', '40206',
       '40216', '40218', '40272', '40258', '40203', '40204', '40211',
       '40209', '40213', '40041', '40229', '40212', '40291', '40026',
       '40077', '40014', '40129', '40223', '40245', '40208', '40118',
       '40231', '40109', '40292', '40047', '40110', '40023', '47130',
       '40243', '40150', '40177', '40071'], dtype=object)

In [None]:
# Keep only the rows that received a usable zip code (neither missing nor blank).
zip_present = tornado_tracks_2020_2025_df['Zip'].notna()
zip_not_blank = tornado_tracks_2020_2025_df['Zip'] != ''
tornado_tracks_2020_2025_df = tornado_tracks_2020_2025_df[zip_present & zip_not_blank]

tornado_tracks_2020_2025_df

In [None]:
# Renumber the rows 0..n-1 again now that the rows without a zip code are gone,
# then display the result.
tornado_tracks_2020_2025_df = tornado_tracks_2020_2025_df.reset_index(drop=True)
tornado_tracks_2020_2025_df

##### Save DF to new CSV

In [133]:
# Write the cleaned 2020-2025 data into ../Datasets/, which is where the
# "Combine both CSVs" step reads it back from. Writing to the working directory
# instead left that later read pointing at a missing (or stale) file.
tornado_tracks_2020_2025_df.to_csv("../Datasets/tornado_tracks_2020_2025_cleaned.csv", index=False)

#### Combine both CSVs -- in hindsight, I should have done this at the very beginning

In [169]:
# Read both cleaned files back and stack them into a single frame.
# 'Zip' is read explicitly as text: zip codes are identifiers, and letting
# read_csv infer a numeric dtype would drop leading zeros and could leave the
# two halves of the combined column with different types.
df1 = pd.read_csv('../Datasets/tornado_tracks_2015_2019_cleaned.csv', dtype={'Zip': str})
df2 = pd.read_csv('../Datasets/tornado_tracks_2020_2025_cleaned.csv', dtype={'Zip': str})

# ignore_index=True renumbers the combined rows 0..n-1.
combined_tornado_tracks_df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
combined_tornado_tracks_df.head()


In [None]:
# Lift pandas' column display limit so every column is visible in the preview.
# Note: set_option changes the setting for the rest of the session, not just this cell.
pd.set_option('display.max_columns', None)
combined_tornado_tracks_df.head()

In [None]:
# Remove the columns that are not kept in the final combined dataset.
columns_to_drop = [
    "DEATHS_DIRECT",
    "INJURIES_DIRECT",
    "DAMAGE_PROPERTY_NUM",
    "DAMAGE_CROPS_NUM",
    "STATE_ABBR",
    "CZ_TIMEZONE",
    "EPISODE_ID",
    "CZ_TYPE",
    "CZ_FIPS",
    "WFO",
    "FLOOD_CAUSE",
    "EVENT_ID",
    "INJURIES_INDIRECT",
    "DEATHS_INDIRECT",
    "ABSOLUTE_ROWNUMBER",
    "BEGIN_RANGE",
    "END_RANGE",
]
combined_tornado_tracks_df = combined_tornado_tracks_df.drop(columns=columns_to_drop)

In [None]:
# Display the combined frame after dropping the unused columns.
combined_tornado_tracks_df

In [175]:
# Save the combined, cleaned data to a CSV in the current working directory,
# without writing the index as a column.
combined_tornado_tracks_df.to_csv("tornado_tracks_combined_cleaned.csv", index=False)