In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("US_Accidents_May19.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
#df = df.dropna()
df.head()
df.shape

(2243939, 49)

In [3]:
df.count()


ID                       2243939
Source                   2243939
TMC                      1727177
Severity                 2243939
Start_Time               2243939
End_Time                 2243939
Start_Lat                2243939
Start_Lng                2243939
End_Lat                   516762
End_Lng                   516762
Distance(mi)             2243939
Description              2243938
Number                    785537
Street                   2243939
Side                     2243939
City                     2243871
County                   2243939
State                    2243939
Zipcode                  2243293
Country                  2243939
Timezone                 2241798
Airport_Code             2220275
Weather_Timestamp        2196769
Temperature(F)           2181674
Wind_Chill(F)             391569
Humidity(%)              2179472
Pressure(in)             2186659
Visibility(mi)           2172579
Wind_Direction           2196749
Wind_Speed(mph)          1800985
Precipitat

In [4]:
#remove columns with the least amount of information
df1=df.drop([ "Number", "Wind_Chill(F)", "Precipitation(in)", "Airport_Code", "Weather_Timestamp", "ID",
             "Source", "Description", "TMC", "End_Time", "Distance(mi)", "Zipcode", "Country", "City", 
             "County", "Timezone", "Civil_Twilight","Nautical_Twilight","Astronomical_Twilight", "Street",
             "Amenity", "Bump", "Crossing", "Give_Way" , "Give_Way", "No_Exit", "Railway", "Roundabout", "Station",
             "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop", "Sunrise_Sunset", "End_Lat", "End_Lng" , "Junction"], axis=1)
             
            




In [5]:
df1 = df1.dropna()
df1.count()


Severity             1780210
Start_Time           1780210
Start_Lat            1780210
Start_Lng            1780210
Side                 1780210
State                1780210
Temperature(F)       1780210
Humidity(%)          1780210
Pressure(in)         1780210
Visibility(mi)       1780210
Wind_Direction       1780210
Wind_Speed(mph)      1780210
Weather_Condition    1780210
dtype: int64

In [8]:
df1.head()
df1.to_csv("maps_data.csv",index=False)


In [7]:
df1.to_json("maps.json", orient='records')

In [9]:
#list(df.columns) 
feature_names = df1.columns
df1.dtypes

Severity               int64
Start_Time            object
Start_Lat            float64
Start_Lng            float64
Side                  object
State                 object
Temperature(F)       float64
Humidity(%)          float64
Pressure(in)         float64
Visibility(mi)       float64
Wind_Direction        object
Wind_Speed(mph)      float64
Weather_Condition     object
dtype: object

# Select your features (columns)

In [11]:
# Set features. This will also be used as your x values.
X = df1.drop(["Severity", "State", "Start_Time","Start_Lat", "Start_Lng"],axis=1)

#removing features did not result in a better score

# Select Y


In [12]:
y = df1["Severity"].values.reshape(-1, 1)


In [13]:
print(X.shape, y.shape)


(1780210, 8) (1780210, 1)


### Dummy Encoding (Binary Encoded Data)

In [14]:
data = X.copy()
X = pd.get_dummies(data)
X.head()

Unnamed: 0,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Side_,Side_L,Side_R,Wind_Direction_ENE,Wind_Direction_ESE,...,Weather_Condition_Small Hail,Weather_Condition_Smoke,Weather_Condition_Snow,Weather_Condition_Snow Grains,Weather_Condition_Snow Showers,Weather_Condition_Squalls,Weather_Condition_Thunderstorm,Weather_Condition_Thunderstorms and Rain,Weather_Condition_Thunderstorms and Snow,Weather_Condition_Widespread Dust
2,36.0,100.0,29.67,10.0,3.5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,35.1,96.0,29.64,9.0,4.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36.0,89.0,29.65,6.0,3.5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,37.9,97.0,29.63,7.0,3.5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,34.0,100.0,29.66,7.0,3.5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Create a Train Test Split



In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_train.head()

Unnamed: 0,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Side_,Side_L,Side_R,Wind_Direction_ENE,Wind_Direction_ESE,...,Weather_Condition_Small Hail,Weather_Condition_Smoke,Weather_Condition_Snow,Weather_Condition_Snow Grains,Weather_Condition_Snow Showers,Weather_Condition_Squalls,Weather_Condition_Thunderstorm,Weather_Condition_Thunderstorms and Rain,Weather_Condition_Thunderstorms and Snow,Weather_Condition_Widespread Dust
1230997,77.0,31.0,29.92,10.0,24.2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
964138,90.0,54.0,30.14,9.0,8.1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
761591,50.0,83.0,30.34,10.0,10.4,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1847986,72.0,59.0,29.94,10.0,3.5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1478272,57.9,100.0,29.85,6.0,4.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [51]:
# # Scale your data
# from sklearn.preprocessing import MinMaxScaler
# X_minmax = MinMaxScaler().fit(X_train)

# X_train_minmax = X_minmax.transform(X_train)
# X_test_minmax = X_minmax.transform(X_test)



# Train the Model



In [16]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=20)


In [17]:
import numpy as np

rf = rf.fit(X_train, np.ravel(y_train,order='C'))
rf 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")

Training Data Score: 0.931873929433018
Testing Data Score: 0.6503225458540893


In [19]:
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)

[(0.2601296478759309, 'Temperature(F)'),
 (0.2481243583192499, 'Pressure(in)'),
 (0.21276637684156202, 'Humidity(%)'),
 (0.1301844511019918, 'Wind_Speed(mph)'),
 (0.03988011123754236, 'Side_L'),
 (0.03571833146162133, 'Visibility(mi)'),
 (0.025024570129207408, 'Side_R'),
 (0.002313698172092136, 'Wind_Direction_South'),
 (0.0021877898932737452, 'Wind_Direction_SSW'),
 (0.0021637132225179848, 'Weather_Condition_Mostly Cloudy'),
 (0.002162499794928309, 'Weather_Condition_Scattered Clouds'),
 (0.0021614410734421457, 'Weather_Condition_Overcast'),
 (0.0021376758877517023, 'Wind_Direction_SW'),
 (0.0021156766696940566, 'Wind_Direction_SE'),
 (0.002078077071024249, 'Wind_Direction_ESE'),
 (0.0020414388156775034, 'Wind_Direction_ENE'),
 (0.0020192110192954952, 'Wind_Direction_NE'),
 (0.0020017696115390954, 'Wind_Direction_North'),
 (0.0019896783190500268, 'Wind_Direction_NNW'),
 (0.0019196089906778056, 'Wind_Direction_NW'),
 (0.0019104442340996952, 'Weather_Condition_Partly Cloudy'),
 (0.00185