In [3]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
# Step 1: Load the dataset

# Read the collisions CSV (path is relative to the notebook's working
# directory) into a pandas DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV -- confirm, and consider
# reading with index_col=0 if it should not be treated as a feature.
collisions_csv = 'NYPD_Motor_Vehicle_Collisions_1000.csv'
df = pd.read_csv(collisions_csv)

# Preview the first five rows as the cell's output.
df.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,03/15/2016,0:30,MANHATTAN,10003.0,40.73527,-73.982346,"(40.73527, -73.982346)",,,300 EAST 19 STREET,...,Unspecified,,,,3406829,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
1,06/25/2013,13:40,,,,,,,,,...,Unspecified,,,,2879485,PASSENGER VEHICLE,LARGE COM VEH(6 OR MORE TIRES),,,
2,09/15/2012,23:10,QUEENS,11418.0,40.695822,-73.821102,"(40.6958225, -73.8211022)",ATLANTIC AVENUE,127 STREET,,...,Unspecified,,,,204319,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
3,03/14/2018,10:00,,,40.699017,-73.80696,"(40.699017, -73.80696)",SUTPHIN BOULEVARD,,,...,Driver Inattention/Distraction,,,,3862996,PASSENGER VEHICLE,BU,,,
4,06/06/2018,19:45,BRONX,10454.0,40.80777,-73.91023,"(40.80777, -73.91023)",CONCORD AVENUE,EAST 142 STREET,,...,Unspecified,,,,3915520,Carry All,Sedan,,,


In [5]:
# Step 2: Convert Categorical Variables to Binary Variables (One-Hot Encoding)
# The two categorical columns of interest are 'BOROUGH' and
# 'CONTRIBUTING FACTOR VEHICLE 1'.

# Replace missing values in each of those columns with the explicit label
# 'Unknown', so that missing entries form their own category when the
# columns are one-hot encoded. Re-running this cell is harmless: once the
# gaps are filled there is nothing left to replace.
for categorical_column in ('BOROUGH', 'CONTRIBUTING FACTOR VEHICLE 1'):
    df[categorical_column] = df[categorical_column].fillna('Unknown')



In [6]:
# One-hot encode the two categorical columns. Each listed column is replaced
# by indicator columns named '<column>_<value>'; every other column is
# carried over unchanged into the new frame `df_encoded`.
# drop_first=True omits the first level of each encoded column, so a row
# belonging to that level has all of that column's indicators set to False.
columns_to_encode = ['BOROUGH', 'CONTRIBUTING FACTOR VEHICLE 1']
df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)


In [7]:
# Step 3: Data Splitting for Supervised Learning
# Partition the data into Training (60%), Validation (25%), and Test (15%) sets.

# Target shares, expressed as fractions of the ORIGINAL dataset. The training
# share is whatever remains (60%).
VALIDATION_FRACTION = 0.25
TEST_FRACTION = 0.15

# First split: hold out the test set; the remainder is train + validation.
train_valid_set, test_set = train_test_split(
    df_encoded, test_size=TEST_FRACTION, random_state=42
)

# Second split: carve the validation set out of the remainder.
# The validation size is given as an absolute row count computed from the
# full dataset, instead of a hand-rounded ratio (0.294 ~= 0.25 / 0.85), so the
# validation set is 25% of the original data for any dataset size.
n_validation = round(VALIDATION_FRACTION * len(df_encoded))
train_set, validation_set = train_test_split(
    train_valid_set, test_size=n_validation, random_state=42
)

# Sanity check: the three partitions together must account for every row.
assert len(train_set) + len(validation_set) + len(test_set) == len(df_encoded)

# Confirm the sizes of the datasets
print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(validation_set)}")
print(f"Test set size: {len(test_set)}")

# Display the first 5 rows of the encoded dataset as the cell's output
df_encoded.head()

Training set size: 600
Validation set size: 250
Test set size: 150


Unnamed: 0,DATE,TIME,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,...,CONTRIBUTING FACTOR VEHICLE 1_Prescription Medication,CONTRIBUTING FACTOR VEHICLE 1_Reaction to Other Uninvolved Vehicle,CONTRIBUTING FACTOR VEHICLE 1_Reaction to Uninvolved Vehicle,CONTRIBUTING FACTOR VEHICLE 1_Traffic Control Disregarded,CONTRIBUTING FACTOR VEHICLE 1_Turning Improperly,CONTRIBUTING FACTOR VEHICLE 1_Unknown,CONTRIBUTING FACTOR VEHICLE 1_Unsafe Lane Changing,CONTRIBUTING FACTOR VEHICLE 1_Unsafe Speed,CONTRIBUTING FACTOR VEHICLE 1_Unspecified,CONTRIBUTING FACTOR VEHICLE 1_View Obstructed/Limited
0,03/15/2016,0:30,10003.0,40.73527,-73.982346,"(40.73527, -73.982346)",,,300 EAST 19 STREET,0.0,...,False,False,False,False,False,False,False,False,False,False
1,06/25/2013,13:40,,,,,,,,0.0,...,False,False,False,False,False,False,False,False,True,False
2,09/15/2012,23:10,11418.0,40.695822,-73.821102,"(40.6958225, -73.8211022)",ATLANTIC AVENUE,127 STREET,,0.0,...,False,False,False,False,False,False,False,False,False,False
3,03/14/2018,10:00,,40.699017,-73.80696,"(40.699017, -73.80696)",SUTPHIN BOULEVARD,,,1.0,...,False,False,False,False,False,False,False,False,False,False
4,06/06/2018,19:45,10454.0,40.80777,-73.91023,"(40.80777, -73.91023)",CONCORD AVENUE,EAST 142 STREET,,0.0,...,False,False,False,False,False,False,False,False,False,False
