In [1]:
# Import necessary libraries

import os

import pandas as pd
import pygeohash as geohash
import requests

# OpenWeatherMap API key, read from the environment so the secret is never
# stored in the notebook. The key that used to be hardcoded here has been
# exposed and should be revoked and replaced.
api_key = os.environ.get('OPENWEATHER_API_KEY', '')
if not api_key:
    print('Warning: OPENWEATHER_API_KEY is not set; weather API requests will be rejected.')


In [2]:
# Read the zipped US accidents CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='US_Accidents_March23.csv.zip')

In [3]:
#Drop every row that has a missing value in any column
df = df.dropna()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3554549 entries, 3402762 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)      

In [4]:
# Summary statistics of the numeric columns
numeric_summary = df.describe()
print(numeric_summary)

           Severity     Start_Lat     Start_Lng       End_Lat       End_Lng  \
count  3.554549e+06  3.554549e+06  3.554549e+06  3.554549e+06  3.554549e+06   
mean   2.075250e+00  3.612065e+01 -9.521825e+01  3.612085e+01 -9.521794e+01   
std    3.837690e-01  5.352725e+00  1.801263e+01  5.352925e+00  1.801231e+01   
min    1.000000e+00  2.456603e+01 -1.245481e+02  2.456601e+01 -1.245457e+02   
25%    2.000000e+00  3.318008e+01 -1.175560e+02  3.317923e+01 -1.175546e+02   
50%    2.000000e+00  3.604521e+01 -8.731360e+01  3.604546e+01 -8.731348e+01   
75%    2.000000e+00  4.013914e+01 -8.021014e+01  4.013945e+01 -8.020993e+01   
max    4.000000e+00  4.900058e+01 -6.748413e+01  4.907500e+01 -6.748413e+01   

       Distance(mi)  Temperature(F)  Wind_Chill(F)   Humidity(%)  \
count  3.554549e+06    3.554549e+06   3.554549e+06  3.554549e+06   
mean   8.477587e-01    6.107390e+01   5.974164e+01  6.377543e+01   
std    1.823782e+00    1.939819e+01   2.168704e+01  2.292699e+01   
min    0.000000e

In [5]:
# Preview the first five rows
print(df.head(n=5))

                ID   Source  Severity           Start_Time  \
3402762  A-3412645  Source1         3  2016-02-08 00:37:08   
3402767  A-3412650  Source1         3  2016-02-08 07:53:43   
3402771  A-3412654  Source1         2  2016-02-08 11:51:46   
3402773  A-3412656  Source1         2  2016-02-08 15:16:43   
3402774  A-3412657  Source1         2  2016-02-08 15:43:50   

                    End_Time  Start_Lat  Start_Lng    End_Lat    End_Lng  \
3402762  2016-02-08 06:37:08  40.108910 -83.092860  40.112060 -83.031870   
3402767  2016-02-08 13:53:43  39.172393 -84.492792  39.170476 -84.501798   
3402771  2016-02-08 17:51:46  41.375310 -81.820170  41.367860 -81.821740   
3402773  2016-02-08 21:16:43  40.109310 -82.968490  40.110780 -82.984000   
3402774  2016-02-08 21:43:50  39.192880 -84.477230  39.196150 -84.473350   

         Distance(mi)  ... Roundabout Station   Stop Traffic_Calming  \
3402762         3.230  ...      False   False  False           False   
3402767         0.500  ...

In [6]:
# Convert timestamps to datetime format.
# The raw columns contain values without fractional seconds (for example
# "2016-02-08 00:37:08"). With a fixed "%Y-%m-%d %H:%M:%S.%f" format, pandas 2.x
# rejects those values, and errors='coerce' then turns them silently into NaT.
# format='ISO8601' (pandas >= 2.0) parses values with or without the fraction.
df['start_time'] = pd.to_datetime(df['Start_Time'], errors='coerce', format='ISO8601')
df['end_time'] = pd.to_datetime(df['End_Time'], errors='coerce', format='ISO8601')
df['weather_timestamp'] = pd.to_datetime(df['Weather_Timestamp'], errors='coerce', format='ISO8601')


In [7]:
# Drop the 'ID' and 'Source' columns
df = df.drop(['ID', 'Source'], axis=1)

In [8]:
# Keep only the first occurrence of each fully identical row
df = df.loc[~df.duplicated(keep='first')]

In [9]:
#Feature Engineering

# Time based features: hour of the day, day of the week, weekend flag, month.
# All of them are derived from the incident start time so that they describe
# the same moment; previously the hour and month were taken from the weather
# observation timestamp while the weekday came from the start time.
df['hour_of_day'] = df['start_time'].dt.hour
df['day_of_week'] = df['start_time'].dt.dayofweek
# pandas numbers weekdays from Monday=0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)
df['month_of_year'] = df['start_time'].dt.month

#Season
def get_season(month):
    """Map a month number (1-12) to a meteorological season name.

    Args:
        month: Month number; ints and integral floats such as 12.0 both work.

    Returns:
        'Winter', 'Spring', 'Summer' or 'Fall' for months 1-12, and None for
        anything else (NaN from an unparsed timestamp, or an out-of-range
        number). Previously every such value was silently labelled 'Fall'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # NaN never compares equal to anything, so missing months end up here.
    return None
    
# Label each row with the season of its month
df['season'] = df['month_of_year'].map(get_season)

In [10]:
#Location based features
# 'State' and 'County' already exist under those names (re-assigning them to
# themselves was a no-op), so only the zip code gets an alias column.
df['Zip_Code'] = df['Zipcode']

#Using geohash, to group incidents based on geographical locations.
# The two coordinate columns are iterated directly instead of using
# DataFrame.apply(axis=1), which builds a Series per row and is very slow on
# millions of rows. The resulting hashes are the same.
df['geohash'] = [
    geohash.encode(lat, lng)
    for lat, lng in zip(df['Start_Lat'], df['Start_Lng'])
]

#Incident Severity Features
# Elapsed time between start and end of the incident, expressed in minutes
_elapsed = df['end_time'].sub(df['start_time'])
df['incident duration'] = _elapsed.dt.total_seconds().div(60)


#Severity level
def _severity_label(severity):
    """Return 'high' for severity 3 and above, otherwise 'low'."""
    if severity >= 3:
        return 'high'
    return 'low'


df['severity_level'] = df['Severity'].map(_severity_label)

#Categorize wind speed

#Wind speed needs to be numeric; values that cannot be parsed become NaN
df['Wind_Speed(mph)'] = pd.to_numeric(df['Wind_Speed(mph)'], errors='coerce')

# Remove the rows whose wind speed is missing or could not be parsed
df = df.dropna(subset=['Wind_Speed(mph)'])

# Bins: [0, 10] -> Low, (10, 20] -> Medium, (20, 100] -> High.
# include_lowest=True closes the first bin on the left so calm conditions
# (exactly 0 mph) are labelled 'Low' instead of falling outside every bin and
# becoming NaN. Values above 100 mph still fall outside the bins and stay NaN.
df['wind_speed_category'] = pd.cut(
    df['Wind_Speed(mph)'],
    bins=[0, 10, 20, 100],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True)


#Binary feature for visibility
# 'Visibility(mi)' is expressed in miles, so comparing it against 1000 flagged
# practically every row as low visibility. The cut-off is now 1000 metres
# converted to miles (about 0.62 mi).
# NOTE(review): confirm that 1000 m is the intended low-visibility threshold.
# The column name keeps its original spelling ('low_visibilty') so that any
# later code referring to it keeps working.
low_visibility_threshold_mi = 1000 / 1609.344
df['low_visibilty'] = (df['Visibility(mi)'] < low_visibility_threshold_mi).astype(int)

# Number of incidents recorded for each row's zip code
df['incident_count'] = df['Severity'].groupby(df['Zip_Code']).transform('count')

In [12]:
def get_weather_data(lat, lon, api_key, units='metric', timeout=10):
    """Query the OpenWeatherMap weather endpoint for one coordinate pair.

    Args:
        lat: Latitude of the location.
        lon: Longitude of the location.
        api_key: OpenWeatherMap API key, sent as the ``appid`` parameter.
        units: Unit system requested from the API. Defaults to 'metric', the
            value this function always used before.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload as a dict, or None when the request fails,
        the status code is not 200, or the body is not valid JSON.
    """
    try:
        # HTTPS keeps the API key out of clear-text traffic; passing ``params``
        # lets requests build and escape the query string.
        response = requests.get(
            'https://api.openweathermap.org/data/2.5/weather',
            params={'lat': lat, 'lon': lon, 'appid': api_key, 'units': units},
            timeout=timeout,
        )
    except requests.RequestException:
        # Network failures are reported like any other failed lookup so that
        # one bad request does not abort a long-running batch.
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        return None

In [13]:
def _extract_weather_features(weather_data):
    """Flatten one API payload into the five weather fields (None where absent)."""
    payload = weather_data or {}
    main = payload.get('main') or {}
    wind = payload.get('wind') or {}
    conditions = payload.get('weather') or [{}]
    return {
        'temperature': main.get('temp'),
        'humidity': main.get('humidity'),
        'weather_description': conditions[0].get('description'),
        'wind_speed': wind.get('speed'),
        'visibility': payload.get('visibility'),
    }


def fetch_weather_for_incidents(df, api_key, fetch=None):
    """Append API weather features to every incident row.

    One request is issued per row, using the row's 'Start_Lat' and 'Start_Lng'.
    NOTE(review): one HTTP call per row is very slow on large frames and may
    exceed API rate limits; also, the /data/2.5/weather endpoint appears to
    report current conditions rather than conditions at the incident time.
    Confirm this is what is wanted.

    Args:
        df: DataFrame with 'Start_Lat' and 'Start_Lng' columns.
        api_key: API key forwarded to the fetch function.
        fetch: Optional callable ``fetch(lat, lon, api_key)`` returning the
            weather payload dict or None. Defaults to ``get_weather_data``;
            mainly useful for testing without network access.

    Returns:
        A new DataFrame with the columns of ``df`` plus 'temperature',
        'humidity', 'weather_description', 'wind_speed' and 'visibility'.
        Rows whose lookup failed get missing values in those columns.
    """
    if fetch is None:
        fetch = get_weather_data

    feature_names = ['temperature', 'humidity', 'weather_description',
                     'wind_speed', 'visibility']

    weather_features = [
        _extract_weather_features(fetch(lat, lon, api_key))
        for lat, lon in zip(df['Start_Lat'], df['Start_Lng'])
    ]

    # Reuse df's own index: with a fresh 0..n-1 index, concat(axis=1) would
    # align on labels, attach weather to the wrong rows and add all-NaN rows
    # whenever df's index is not exactly 0..n-1.
    weather_df = pd.DataFrame(weather_features, columns=feature_names, index=df.index)

    return pd.concat([df, weather_df], axis=1)


In [None]:
# Add the API weather columns to every incident, then preview the result
df = fetch_weather_for_incidents(df=df, api_key=api_key)

print(df.head(n=5))
