In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.utils import resample

In [2]:
# Read the raw traffic CSV into a DataFrame; the path is relative to the notebook's folder.
raw_csv_path = "../Data_Raw/full_data/mdtraffic.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Collapse the four-level Severity rating into a binary target:
# 1 when Severity == 4, 0 for levels 1, 2 and 3.
col          = 'Severity'
valid_levels = [ 1, 2, 3, 4 ]

# np.select falls back to its default of 0 for rows matching no condition, so
# anything outside 1-4 (NaN, bad data, or this cell being re-run on labels that
# were already binarized) would be silently relabelled as class 0.
# Fail loudly instead of corrupting the target.
unexpected = df.loc[~df[col].isin(valid_levels), col]
if not unexpected.empty:
    raise ValueError(
        f"{col} has values outside {valid_levels}: "
        f"{unexpected.unique().tolist()[:10]} ({len(unexpected)} rows). "
        "If this cell was already run, reload the raw data before re-running it."
    )

conditions  = [ df[col] == level for level in valid_levels ]
choices     = [ 0, 0, 0, 1 ]
df["Severity"] = np.select(conditions, choices)

In [4]:
# Partition the rows by binary label: c1 holds Severity == 1, c0 holds Severity == 0.
severity = df["Severity"]
c1 = df.loc[severity == 1]
c0 = df.loc[severity == 0]

# Report (rows, columns) for each class, class 0 first.
for class_frame in (c0, c1):
    print(class_frame.shape)

(129525, 46)
(10892, 46)


In [5]:
# Downsample class 0 to the size of class 1 so both classes are equally
# represented in the balanced dataset.
#
# Sample WITHOUT replacement: with replace=True the same class-0 row can be
# drawn several times, and those duplicates can later land on both sides of
# the train/test split, leaking test rows into training. With replace=False,
# resample raises ValueError if n_samples exceeds len(c0), which also guards
# against the classes being swapped.
c0_downsamp = resample(c0,
                       replace=False,
                       n_samples=len(c1),
                       random_state=49209)

# Confirm class sizes are same size
print(c0_downsamp.shape)
print(c1.shape)

(10892, 46)
(10892, 46)


In [6]:
# Stack the downsampled class-0 rows on top of the class-1 rows and renumber
# the result 0..n-1, discarding the row labels inherited from the raw frame.
df = pd.concat([c0_downsamp, c1], ignore_index=True)

In [10]:
from sklearn.model_selection import train_test_split

# Name of the label column and the feature columns, in the order they are exported.
target_col = 'Severity'
feature_cols = [
    'ID', 'Source', 'Start_Time', 'End_Time', 'Start_Lat',
    'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
    'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
    'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
    'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
    'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
    'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
    'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
    'Astronomical_Twilight',
]

# Reordered copy of the balanced frame with the target in the first column.
df_new = df[[target_col] + feature_cols]

# Features are every column except the target; the target is the Severity column.
X = df_new[feature_cols]
y = df_new[target_col]

# 80/20 split, stratified on y so both partitions keep the same class ratio;
# the fixed seed makes the partition repeatable.
split_options = dict(test_size=0.2, random_state=49209, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [11]:
# Write the four split objects to CSV (without the index column), keeping the
# training and testing files in separate folders.
export_root = Path('../Data_Raw/Class4_vs_123_ML_Data')
train_dir = export_root / 'train'
test_dir = export_root / 'test'

# to_csv does not create missing folders, so create them up front; otherwise
# the export fails on a fresh checkout.
for out_dir in (train_dir, test_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Export training set to one folder...
X_train.to_csv(train_dir / 'X_train_4_v_123.csv', index = False)
y_train.to_csv(train_dir / 'y_train_4_v_123.csv', index = False)

# ...And testing set to another
X_test.to_csv(test_dir / 'X_test_4_v_123.csv', index = False)
y_test.to_csv(test_dir / 'y_test_4_v_123.csv', index = False)