In [26]:
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import numpy as np
import time


##### Load and inspect dataset

In [None]:
# Load the raw weather-detail export (path is relative to this notebook's
# working directory) and preview the first few rows.
weather_detail_df = pd.read_csv('../Datasets/Original Files/Weather_Detail.csv')
weather_detail_df.head()

In [28]:
# (rows, columns) of the raw frame, before any cleaning.
weather_detail_df.shape

(78129, 33)

In [29]:
# List the raw column names to decide which ones to keep.
weather_detail_df.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'DAPR', 'MDPR', 'PGTM', 'PRCP', 'PSUN', 'SNOW', 'SNWD', 'TAVG', 'TMAX',
       'TMIN', 'TOBS', 'TSUN', 'WDF2', 'WDF5', 'WESD', 'WESF', 'WSF2', 'WSF5',
       'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09'],
      dtype='object')

In [30]:
# Drop columns DAPR MDPR PGTM PSUN TAVG TOBS TSUN WESD WESF WT01 WT02 WT03 WT04 WT05 WT06 WT08 WT09
# (TAVG is dropped as well; the previous version of this comment left it out).
#
# The result is assigned back instead of using inplace=True, and errors='ignore'
# makes the cell safe to re-run: on a second run the columns are already gone
# and the call is a no-op rather than a KeyError.

weather_detail_df = weather_detail_df.drop(
    columns=[
        'DAPR', 'MDPR', 'PGTM', 'PSUN', 'TAVG', 'TOBS', 'TSUN', 'WESD', 'WESF',
        'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09',
    ],
    errors='ignore',
)


In [None]:
# Display the frame to confirm the unwanted columns are gone.

weather_detail_df

##### Review Data types, look for nulls, and duplicates

In [None]:
# Count missing values per column.
weather_detail_df.isnull().sum()

In [33]:
# TMAX is essential for the analysis, so a row without a recorded max
# temperature isn't useful: keep only the rows where TMAX is present.

weather_detail_df = weather_detail_df.dropna(subset=['TMAX'])



In [34]:
# Renumber the index after the row drop so the labels are contiguous again;
# drop=True discards the old index instead of keeping it as a column.

weather_detail_df = weather_detail_df.reset_index(drop=True)

In [None]:
# Re-count missing values per column now that rows without TMAX are gone.
weather_detail_df.isnull().sum()

In [36]:
# Replace the remaining nulls with zero, on the assumption that a blank means
# there was nothing to record for that station on that day.
# NOTE(review): this zero-fills every column that still has nulls, which may
# include wind speed/direction fields where 0 reads as a real measurement —
# confirm this is intended.

weather_detail_df = weather_detail_df.fillna(value=0)

In [None]:
# Sanity check: every column should now report zero missing values.
weather_detail_df.isnull().sum()

In [20]:
# Boolean mask marking rows that exactly repeat an earlier row, then show
# those rows (an empty result means there are no fully duplicated rows).
duplicates = weather_detail_df.duplicated(keep='first')
weather_detail_df.loc[duplicates]

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5


In [None]:
# Review the dtype pandas inferred for each column.
weather_detail_df.dtypes

In [None]:
# Rename the coordinate columns to BEGIN_LAT / BEGIN_LON, then display the frame.
weather_detail_df = weather_detail_df.rename(
    {"LATITUDE": "BEGIN_LAT", "LONGITUDE": "BEGIN_LON"},
    axis="columns",
)
weather_detail_df

##### Reverse Geocoding to return zipcode using latitude and longitude

In [39]:
# There were too many rows to geocode one by one like I did for the tornado
# tracks, so the lookup is done once per distinct coordinate pair instead:
# collect the unique (BEGIN_LAT, BEGIN_LON) pairs here with a fresh 0..n-1 index.

unique_coords = (
    weather_detail_df[['BEGIN_LAT', 'BEGIN_LON']]
    .drop_duplicates()
    .reset_index(drop=True)
)

unique_coords




Unnamed: 0,BEGIN_LAT,BEGIN_LON
0,38.23021,-85.66294
1,38.2778,-85.7918
2,37.9161,-85.6572
3,38.1151,-85.6445
4,38.17738,-85.73077


In [40]:
# Reverse-geocode every distinct coordinate pair to a postcode.
#
# Results are stored twice: in the coord_to_zip dictionary keyed by (lat, lon),
# and in a new 'Zip' column on unique_coords so they can be merged back onto
# the full weather frame. A failed or empty lookup is recorded as NaN rather
# than stopping the run.
geolocator = Nominatim(user_agent="tornado_project")

# Dictionary to store results
coord_to_zip = {}

for i in range(len(unique_coords)):
    # Read the coordinates before the try block so they are always bound (and
    # belong to this row) when the result is stored below, even if the lookup fails.
    lat = unique_coords.loc[i, 'BEGIN_LAT']
    lon = unique_coords.loc[i, 'BEGIN_LON']

    # Default for every failure path: exception, no match, or no postcode.
    zipcode = np.nan

    try:
        location = geolocator.reverse(f"{lat}, {lon}", timeout=10)
        if location is None:
            # reverse() returned no result; report it explicitly instead of
            # tripping an AttributeError on location.raw.
            print(f"No match at row {i}: ({lat}, {lon})")
        else:
            address = location.raw.get('address', {})
            zipcode = address.get('postcode', np.nan)
    except Exception as e:
        # Deliberate best-effort: one failed lookup must not abort the whole
        # run, so log it and leave the postcode as NaN.
        print(f"Error at row {i}: {e}")

    # Save into dictionary
    coord_to_zip[(lat, lon)] = zipcode

    # Store in DataFrame
    unique_coords.loc[i, 'Zip'] = zipcode

    time.sleep(1)  # respect Nominatim rate limits






In [41]:
# Bring the columns of unique_coords (including the looked-up Zip) onto every
# weather row by matching on the coordinate pair. A left join keeps all
# weather rows, including those whose coordinates have no match.

weather_detail_df = pd.merge(
    weather_detail_df,
    unique_coords,
    how='left',
    on=['BEGIN_LAT', 'BEGIN_LON'],
)
weather_detail_df

Unnamed: 0,STATION,NAME,BEGIN_LAT,BEGIN_LON,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,Zip
0,USW00013810,"LOUISVILLE BOWMAN FIELD, KY US",38.23021,-85.66294,164.6,1/1/2015,8.05,0.00,0.0,0.0,40.0,22.0,230.0,250.0,19.9,23.9,40041
1,USC00154955,"LOUISVILLE MCALPINE, KY US",38.27780,-85.79180,134.1,1/1/2015,0.00,0.00,0.0,0.0,28.0,20.0,0.0,0.0,0.0,0.0,40212
2,USC00150630,"BERNHEIM FOREST, KY US",37.91610,-85.65720,167.6,1/1/2015,0.00,0.02,0.0,0.0,41.0,16.0,0.0,0.0,0.0,0.0,40110
3,USC00154958,"LOUISVILLE WEATHER FORECAST OFFICE, KY US",38.11510,-85.64450,192.6,1/1/2015,0.00,0.00,0.0,0.0,40.0,19.0,0.0,0.0,0.0,0.0,40229
4,USW00093821,"LOUISVILLE INTERNATIONAL AIRPORT, KY US",38.17738,-85.73077,146.3,1/1/2015,9.17,0.00,0.0,0.0,41.0,23.0,230.0,240.0,21.0,25.1,40209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17718,USC00154955,"LOUISVILLE MCALPINE, KY US",38.27780,-85.79180,134.1,9/30/2025,0.00,0.00,0.0,0.0,87.0,63.0,0.0,0.0,0.0,0.0,40212
17719,USW00093821,"LOUISVILLE INTERNATIONAL AIRPORT, KY US",38.17738,-85.73077,146.3,9/30/2025,0.00,0.00,0.0,0.0,87.0,63.0,0.0,0.0,0.0,0.0,40209
17720,USW00013810,"LOUISVILLE BOWMAN FIELD, KY US",38.23021,-85.66294,164.6,10/1/2025,0.00,0.00,0.0,0.0,86.0,57.0,0.0,0.0,0.0,0.0,40041
17721,USC00154955,"LOUISVILLE MCALPINE, KY US",38.27780,-85.79180,134.1,10/1/2025,0.00,0.00,0.0,0.0,87.0,64.0,0.0,0.0,0.0,0.0,40212


In [43]:
# Convert DATE from month/day/year text into datetime values; the explicit
# format means a value in any other layout raises instead of being guessed.
weather_detail_df = weather_detail_df.assign(
    DATE=pd.to_datetime(weather_detail_df["DATE"], format="%m/%d/%Y")
)


In [44]:
# Display the frame to confirm DATE now holds parsed dates.
weather_detail_df

Unnamed: 0,STATION,NAME,BEGIN_LAT,BEGIN_LON,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,Zip
0,USW00013810,"LOUISVILLE BOWMAN FIELD, KY US",38.23021,-85.66294,164.6,2015-01-01,8.05,0.00,0.0,0.0,40.0,22.0,230.0,250.0,19.9,23.9,40041
1,USC00154955,"LOUISVILLE MCALPINE, KY US",38.27780,-85.79180,134.1,2015-01-01,0.00,0.00,0.0,0.0,28.0,20.0,0.0,0.0,0.0,0.0,40212
2,USC00150630,"BERNHEIM FOREST, KY US",37.91610,-85.65720,167.6,2015-01-01,0.00,0.02,0.0,0.0,41.0,16.0,0.0,0.0,0.0,0.0,40110
3,USC00154958,"LOUISVILLE WEATHER FORECAST OFFICE, KY US",38.11510,-85.64450,192.6,2015-01-01,0.00,0.00,0.0,0.0,40.0,19.0,0.0,0.0,0.0,0.0,40229
4,USW00093821,"LOUISVILLE INTERNATIONAL AIRPORT, KY US",38.17738,-85.73077,146.3,2015-01-01,9.17,0.00,0.0,0.0,41.0,23.0,230.0,240.0,21.0,25.1,40209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17718,USC00154955,"LOUISVILLE MCALPINE, KY US",38.27780,-85.79180,134.1,2025-09-30,0.00,0.00,0.0,0.0,87.0,63.0,0.0,0.0,0.0,0.0,40212
17719,USW00093821,"LOUISVILLE INTERNATIONAL AIRPORT, KY US",38.17738,-85.73077,146.3,2025-09-30,0.00,0.00,0.0,0.0,87.0,63.0,0.0,0.0,0.0,0.0,40209
17720,USW00013810,"LOUISVILLE BOWMAN FIELD, KY US",38.23021,-85.66294,164.6,2025-10-01,0.00,0.00,0.0,0.0,86.0,57.0,0.0,0.0,0.0,0.0,40041
17721,USC00154955,"LOUISVILLE MCALPINE, KY US",38.27780,-85.79180,134.1,2025-10-01,0.00,0.00,0.0,0.0,87.0,64.0,0.0,0.0,0.0,0.0,40212


##### Save DF into new CSV

In [45]:
# Write the cleaned frame to weather_detail_cleaned.csv (relative path, so it
# lands in the current working directory); index=False leaves the row index out.
weather_detail_df.to_csv("weather_detail_cleaned.csv", index=False)