In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('US_Accidents_Dec21_updated.csv')
cities = pd.read_csv('uscities.csv')
cities.shape

(28338, 17)

In [3]:
cities = cities[cities['population'] >= 10000]
cities.shape

(4167, 17)

In [4]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='mixed')
df['End_Time'] = pd.to_datetime(df['End_Time'], format='mixed')

In [5]:
df['accident_hour'] = df['Start_Time'].dt.hour
df['accident_dow'] = df['Start_Time'].dt.dayofweek

In [6]:
df['accident_duration'] = (df['End_Time'] - df['Start_Time']).astype('timedelta64[s]').astype(int)

In [7]:
from __future__ import division
from math import atan
from math import atan2
from math import cos
from math import radians
from math import sin
from math import sqrt
from math import tan

def vincenty_inverse(coord1, coord2, maxIter=10, tol=10**-12):
    a = 6378137.0  # radius at equator in meters (WGS-84)
    f = 1 / 298.257223563  # flattening of the ellipsoid (WGS-84)
    b = (1 - f) * a

    (
        phi_1,
        L_1,
    ) = coord1  # (lat=L_?,lon=phi_?)
    (
        phi_2,
        L_2,
    ) = coord2

    u_1 = atan((1 - f) * tan(radians(phi_1)))
    u_2 = atan((1 - f) * tan(radians(phi_2)))

    L = radians(L_2 - L_1)

    Lambda = L  # set initial value of lambda to L

    sin_u1 = sin(u_1)
    cos_u1 = cos(u_1)
    sin_u2 = sin(u_2)
    cos_u2 = cos(u_2)

    # --- BEGIN ITERATIONS -----------------------------+
    iters = 0
    for i in range(0, maxIter):
        iters += 1

        cos_lambda = cos(Lambda)
        sin_lambda = sin(Lambda)
        sin_sigma = sqrt(
            (cos_u2 * sin(Lambda)) ** 2
            + (cos_u1 * sin_u2 - sin_u1 * cos_u2 * cos_lambda) ** 2
        ) + 1e-8
        cos_sigma = sin_u1 * sin_u2 + cos_u1 * cos_u2 * cos_lambda
        sigma = atan2(sin_sigma, cos_sigma)
        sin_alpha = (cos_u1 * cos_u2 * sin_lambda) / sin_sigma
        cos_sq_alpha = 1 - sin_alpha**2
        cos2_sigma_m = cos_sigma - ((2 * sin_u1 * sin_u2) / cos_sq_alpha)
        C = (f / 16) * cos_sq_alpha * (4 + f * (4 - 3 * cos_sq_alpha))
        Lambda_prev = Lambda
        Lambda = L + (1 - C) * f * sin_alpha * (
            sigma
            + C
            * sin_sigma
            * (cos2_sigma_m + C * cos_sigma * (-1 + 2 * cos2_sigma_m**2))
        )

        # successful convergence
        diff = abs(Lambda_prev - Lambda)
        if diff <= tol:
            break

    u_sq = cos_sq_alpha * ((a**2 - b**2) / b**2)
    A = 1 + (u_sq / 16384) * (4096 + u_sq * (-768 + u_sq * (320 - 175 * u_sq)))
    B = (u_sq / 1024) * (256 + u_sq * (-128 + u_sq * (74 - 47 * u_sq)))
    delta_sig = (
        B
        * sin_sigma
        * (
            cos2_sigma_m
            + 0.25
            * B
            * (
                cos_sigma * (-1 + 2 * cos2_sigma_m**2)
                - (1 / 6)
                * B
                * cos2_sigma_m
                * (-3 + 4 * sin_sigma**2)
                * (-3 + 4 * cos2_sigma_m**2)
            )
        )
    )

    return b * A * (sigma - delta_sig)

In [8]:
distances = []
for i, row in tqdm(df.iterrows()):
    distances.append(vincenty_inverse((row['Start_Lng'], row['Start_Lat']), (row['End_Lng'], row['End_Lat'])))

df['accident_distance'] = distances

2845342it [00:55, 51701.51it/s]


In [9]:
cities_lat = cities['lat'].values
cities_lng = cities['lng'].values

start_lat = df["Start_Lng"].values
start_lng = df['Start_Lat'].values

end_lat = df['End_Lat'].values
end_lng = df['End_Lng'].values

cities_cor = list(zip(cities_lat, cities_lng))

In [10]:
from math import radians
from sklearn.neighbors import BallTree
import numpy as np

cities_cor_rad = np.array([[radians(x[0]), radians(x[1])] for x in cities_cor ])
tree = BallTree(cities_cor_rad , metric = 'haversine')



In [11]:
min_distance_start = []
min_distance_end = []
earth_radius = 6371

for i in tqdm(range(df.shape[0])):
    dist_start = 1e9
    dist_end = 1e9
    
    start_result = tree.query([(radians(start_lat[i]),radians(start_lng[i]))])
    end_result = tree.query([(radians(end_lat[i]),radians(end_lng[i]))])
            
    min_distance_start.append((start_result[0][0] * earth_radius)[0])
    min_distance_end.append((end_result[0][0] * earth_radius)[0])

100%|██████████| 2845342/2845342 [02:32<00:00, 18640.35it/s]


In [12]:
df['start_dist_city'] = min_distance_start
df['end_dist_city'] = min_distance_end

In [13]:
to_encode = ['State', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
col_to_drop = ['Number', 'Street', 'City', 'County', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Weather_Condition', 'Start_Time', 'End_Time', 'Description']

In [14]:
df = df.drop(columns=col_to_drop)

In [15]:
df = df.dropna()

In [16]:
for feature_to_encode in to_encode:
    one_hot = pd.get_dummies(df[feature_to_encode])
    one_hot = one_hot.add_prefix(f'{feature_to_encode}_')
    df = df.drop([feature_to_encode], axis=1)
    df = pd.concat([df, one_hot], axis=1)

In [17]:
df.to_csv('cleaned_accidents.csv', index=False)

In [1]:
import pandas as pd

In [3]:
! ls 

README.MD           preprocessing.ipynb


In [2]:
dt = pd.read_csv('cleaned_accidents.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_accidents.csv'