# **Phase 2**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score
from sklearn.impute import SimpleImputer
import seaborn as sb
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.svm import SVC
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_rand_score



In [None]:
# Load the cleaned collision data (replace the placeholder path with the real file location).
df = pd.read_csv('path to file/cleaned_dataset.csv')

# If you need to sample the data
# df = df.sample(frac=0.25, random_state=42)


def _strip_if_str(value):
    """Return *value* with surrounding whitespace removed when it is a string; otherwise unchanged."""
    return value.strip() if isinstance(value, str) else value


# Normalise text BEFORE any encoding so that padded variants of the same label
# (e.g. 'QUEENS ' vs 'QUEENS') end up in the same category, and blank cells are
# treated as missing rather than as a category of their own.
# Series.map is used per column instead of the deprecated DataFrame.applymap.
for col in df.select_dtypes(exclude=['number', 'bool']).columns:
    df[col] = df[col].map(_strip_if_str)
df.replace('', float('nan'), inplace=True)

#Feature Encoding
df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
df['DayOfWeek'] = df['CRASH DATE'].dt.dayofweek  # Monday=0, Sunday=6
df['CRASH TIME'] = pd.to_datetime(df['CRASH TIME'], format='%H:%M')
df['HourOfDay'] = df['CRASH TIME'].dt.hour  # Extract hour (0-23)

# One-hot encode the borough; drop_first=True drops the first category, so a row
# with every BOROUGH_* column equal to 0 belongs to that dropped category (or had
# no borough at all).
df = pd.get_dummies(df, columns=['BOROUGH'], drop_first=True)
df['ZIP CODE'] = pd.to_numeric(df['ZIP CODE'], errors='coerce')  # non-numeric zips become NaN

for col in ['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3']:
    # Assign the result back: `df[col].fillna(..., inplace=True)` operates on a
    # temporary Series and does not update `df` under pandas copy-on-write.
    df[col] = df[col].fillna("Unknown")  # missing values marked as "Unknown"
    df[col] = LabelEncoder().fit_transform(df[col])

features_clustering = [
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
    'DayOfWeek', 'HourOfDay',
    'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
    'CONTRIBUTING FACTOR VEHICLE 3'
]

# Keep only the 13 clustering columns and drop rows with a missing value in any of them.
df_filtered = df[features_clustering].dropna()

# Standardise the clustering features; df_scaled is a NumPy array aligned with df_filtered's rows.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_filtered)

In [None]:
# Binary target: 1 when at least one person was injured or killed, otherwise 0.
df['SEVERE_ACCIDENT'] = (
    (df['NUMBER OF PERSONS INJURED'] > 0) | (df['NUMBER OF PERSONS KILLED'] > 0)
).astype(int)

# Rebuild a single categorical borough label from the one-hot columns.
# OR-ing the dummy columns together would collapse every borough into one
# True/False flag, which is not a usable categorical feature. Rows where all
# four dummies are 0 (the category dropped by drop_first, or a missing borough)
# are labelled 'OTHER'.
borough_dummy_cols = [
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND'
]
borough_flags = df[borough_dummy_cols].astype(int)
borough_labels = borough_flags.idxmax(axis=1).str.replace('BOROUGH_', '', regex=False)
df['BOROUGH'] = borough_labels.where(borough_flags.any(axis=1), 'OTHER')

# Hour of the crash (0-23); to_datetime leaves an already-datetime column unchanged.
df['CRASH TIME'] = pd.to_datetime(df['CRASH TIME'])
df['CRASH HOUR'] = df['CRASH TIME'].dt.hour

features = [
    'CONTRIBUTING FACTOR VEHICLE 1', 'BOROUGH', 'CRASH HOUR',
    'VEHICLE TYPE CODE 1', 'LATITUDE', 'LONGITUDE'
]
X = df[features]
y = df['SEVERE_ACCIDENT']

# Preprocessing pipeline: scale the numeric columns, one-hot encode the categorical ones.
numeric_features = ['CRASH HOUR', 'LATITUDE', 'LONGITUDE']
categorical_features = ['CONTRIBUTING FACTOR VEHICLE 1', 'BOROUGH', 'VEHICLE TYPE CODE 1']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split the RAW rows first, then fit the preprocessor on the training rows only,
# so scaling statistics and the category vocabulary never see the test set.
# Categories that appear only in the test rows are encoded as all-zero because
# of handle_unknown='ignore'.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept for backward compatibility: every row transformed with the train-fitted preprocessor.
X_processed = preprocessor.transform(X)


In [None]:
#1 Logistic Regression
# Two variants are fitted on the same mean-imputed split: a grid-searched model
# first, then a default-parameter model for comparison.

# Imputation statistics are learned from the training split and reused on the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Search space: regularisation strength crossed with two solvers.
param_grid_lr = dict(C=[0.1, 1, 10, 100], solver=['lbfgs', 'liblinear'])

lr = LogisticRegression(max_iter=1000)
grid_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, scoring='accuracy', cv=5)
grid_lr.fit(X_train_imputed, y_train)

# Score the refitted best estimator on the held-out split.
best_lr = grid_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test_imputed)
confusion_lr = confusion_matrix(y_test, y_pred_lr)
proba_lr = best_lr.predict_proba(X_test_imputed)[:, 1]  # probability of the positive class

print("Logistic Regression Best Parameters:", grid_lr.best_params_)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression ROC AUC:", roc_auc_score(y_test, proba_lr))
print("Logistic Regression F1 Score:", f1_score(y_test, y_pred_lr))

plt.figure(figsize=(8, 6))
ax_grid = sns.heatmap(confusion_lr, annot=True, fmt='d', cmap='Blues')
ax_grid.set_title('Logistic Regression Confusion Matrix')
ax_grid.set_xlabel('Predicted Label')
ax_grid.set_ylabel('True Label')
plt.show()

#better accuracy with the following:
##LOGISTIC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score
from sklearn.impute import SimpleImputer
import seaborn as sb

# NOTE: lr_params is defined here but is not passed to any search below;
# the model that follows is fitted with default hyperparameters.
lr_params = dict(
    C=np.logspace(-3, 3, 7),
    penalty=['l1', 'l2'],
    solver=['liblinear'],
    class_weight=['balanced', None],
)

# Re-create the imputed matrices so this half of the cell can run on its own.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_test_imputed)
confusion = confusion_matrix(y_test, y_pred)
proba_default = model.predict_proba(X_test_imputed)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, proba_default))
print("F1 Score:", f1_score(y_test, y_pred))

plt.figure(figsize=(8, 6))
ax_default = sb.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
ax_default.set_title('Confusion Matrix')
ax_default.set_xlabel('Predicted Label')
ax_default.set_ylabel('True Label')
plt.show()


In [None]:
#2 Decision Tree
# A grid-searched tree first, then a fixed max_depth=5 tree for comparison.
# Both trees are seeded (random_state=42, as used elsewhere in this file) so that
# tie-breaking between equally good splits, and therefore the reported metrics,
# are reproducible from run to run.

# Imputation statistics are learned from the training split and reused on the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Search space: tree depth and the minimum sample counts for splits and leaves.
param_grid_dt = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='accuracy')
grid_dt.fit(X_train_imputed, y_train)

# Score the refitted best estimator on the held-out split.
best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test_imputed)
confusion_dt = confusion_matrix(y_test, y_pred_dt)

# Display results
print("Decision Tree Best Parameters:", grid_dt.best_params_)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree F1 Score:", f1_score(y_test, y_pred_dt))

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_dt, annot=True, fmt='d', cmap='Greens')
plt.title('Decision Tree Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


#better accuracy with the following
from sklearn.tree import DecisionTreeClassifier

# Fixed-depth tree, fitted on the same imputed split.
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_test_imputed)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

plt.figure(figsize=(8, 6))
sb.heatmap(confusion, annot=True, fmt='d', cmap='Greens')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
#3 XGB BOOST
# Uses X_train_imputed / X_test_imputed as left in scope by an earlier cell.

# Search space: ensemble size, shrinkage and tree depth.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
grid_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
)
grid_xgb.fit(X_train_imputed, y_train)

# Score the refitted best estimator on the held-out split.
best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test_imputed)
confusion_xgb = confusion_matrix(y_test, y_pred_xgb)

# Report the selected hyperparameters and test metrics.
print("XGBoost Best Parameters:", grid_xgb.best_params_)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost F1 Score:", f1_score(y_test, y_pred_xgb))

plt.figure(figsize=(8, 6))
ax_xgb = sns.heatmap(confusion_xgb, annot=True, fmt='d', cmap='Purples')
ax_xgb.set_title('XGBoost Confusion Matrix')
ax_xgb.set_xlabel('Predicted Label')
ax_xgb.set_ylabel('True Label')
plt.show()

In [None]:
#4 DBSCAN
# Grid over eps / min_samples on a 10% sample of the scaled clustering features,
# ranking each configuration by silhouette score.

# Converting the scaled NumPy array back to a DataFrame
df_scaled_df = pd.DataFrame(df_scaled, columns=df_filtered.columns)

# sampling the data
df_scaled_subset = df_scaled_df.sample(frac=0.1, random_state=42)

eps_values = [0.5, 0.75]
min_samples_values = [15, 25, 50, 75, 100]

best_score = -1
best_params = {}
best_labels = None

# Silhouette score per (eps, min_samples). dtype=float matters: an empty frame
# defaults to object dtype, which the heatmap below cannot render. Cells that
# are never filled stay NaN.
scores_df = pd.DataFrame(index=eps_values, columns=min_samples_values, dtype=float)

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(df_scaled_subset)
        unique_labels = np.unique(labels)
        # DBSCAN marks noise points with label -1; noise is not a cluster, so it
        # must not count towards the "more than 1 cluster" requirement.
        n_clusters = int(np.sum(unique_labels != -1))
        if n_clusters < 2 or len(unique_labels) >= len(df_scaled_subset):
            # No meaningful clustering for this configuration; leave the cell as NaN.
            continue
        # NOTE: the score is computed over all points, so noise points (label -1)
        # are still treated as one group by silhouette_score.
        score = silhouette_score(df_scaled_subset, labels)
        scores_df.loc[eps, min_samples] = score
        print(f"eps: {eps}, min_samples: {min_samples}, silhouette score: {score:.3f}")
        if score > best_score:
            best_score = score
            best_params = {'eps': eps, 'min_samples': min_samples}
            best_labels = labels

# best_params stays empty and best_score stays -1 when no configuration qualified.
print("Best DBSCAN Parameters:", best_params)
print("Best Silhouette Score:", best_score)

# Plot the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(scores_df, annot=True, cmap='viridis', cbar_kws={'label': 'Silhouette Score'})
plt.title('Silhouette Scores for Different eps and min_samples Values')
plt.xlabel('min_samples')
plt.ylabel('eps')
plt.show()

In [None]:
#5 Neural Networks

#takes around 40 minutes on GPU

#if this takes too long, try the following:
#model = keras.Sequential([
#    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
#    layers.Dropout(0.2),
#    layers.Dense(32, activation='relu'),
#    layers.Dense(1, activation='sigmoid')
#])
#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#model.fit(X_train, y_train, epochs=10, validation_split=0.2)

def build_model(hp):
    """Build and compile a two-hidden-layer binary classifier for KerasTuner.

    The tunable values are drawn from ``hp``: ``units_1`` (32-128, step 32),
    ``dropout_rate`` (0.1-0.5, step 0.1) and ``units_2`` (16-64, step 16).
    The input width is taken from the module-level ``X_train``.

    Returns the compiled model (adam optimizer, binary cross-entropy loss,
    accuracy metric).
    """
    n_features = X_train.shape[1]

    # Draw the hyperparameters in a fixed order: first layer, dropout, second layer.
    first_units = hp.Int('units_1', min_value=32, max_value=128, step=32)
    drop_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    second_units = hp.Int('units_2', min_value=16, max_value=64, step=16)

    model = keras.Sequential([
        layers.Dense(units=first_units, activation='relu', input_shape=(n_features,)),
        layers.Dropout(rate=drop_rate),
        layers.Dense(units=second_units, activation='relu'),
        layers.Dense(1, activation='sigmoid'),  # single sigmoid output unit
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Mean-impute missing values before training, exactly as every other model in
# this file does: a network fed NaN inputs produces NaN losses, which would make
# the tuning objective meaningless. Imputation keeps the column count unchanged
# as long as no column is entirely missing, so build_model's input width still
# matches.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Hyperband search over build_model's hyperparameters, optimising validation accuracy.
# overwrite=True discards any earlier results stored under nn_tuning/tune_nn.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=2,
                     factor=2,
                     directory='nn_tuning',
                     project_name='tune_nn',
                     hyperband_iterations=1,
                     overwrite=True)

# No `epochs` argument here: Hyperband sets the epoch count of every trial itself
# (capped by max_epochs above), so a value passed to search() would be overridden.
tuner.search(X_train_imputed, y_train, validation_split=0.2, verbose=1)

# Evaluate the best model found on the held-out split.
best_nn_model = tuner.get_best_models(num_models=1)[0]
loss, nn_accuracy = best_nn_model.evaluate(X_test_imputed, y_test)
print("Neural Network Best Hyperparameters:", tuner.get_best_hyperparameters()[0].values)
print("Neural Network Accuracy:", nn_accuracy)

In [None]:
#6 KMEANS
# Elbow curve: inertia of a fitted KMeans for each k in 2..9.
K_range = range(2, 10)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_scaled).inertia_
    for k in K_range
]

plt.plot(K_range, inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.show()

# Silhouette score for each k in 3..9, stored as one {k: score} dict per k.
k_range = range(3, 10)
scores = []

for k in k_range:
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(df_scaled)
    score = silhouette_score(df_scaled, labels)
    scores.append({k: score})
    print(f"Silhouette score for k={k}: {score:.3f}")

# Pick the k whose silhouette score is highest (the first one wins on ties).
best_entry = max(scores, key=lambda entry: next(iter(entry.values())))
optimal_k = next(iter(best_entry))
print(f"Optimal k: {optimal_k}")

# Final model with the selected number of clusters.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(df_scaled)

# Project onto the first two principal components for plotting.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)

# One row per sample: its two PCA coordinates and its cluster label.
df_plot = pd.DataFrame(df_pca, columns=['PC1', 'PC2']).assign(Cluster=kmeans.labels_)

# One scatter series per cluster so that each gets its own legend entry.
plt.figure(figsize=(10, 8))
for cluster in range(optimal_k):
    in_cluster = df_plot['Cluster'] == cluster
    plt.scatter(
        df_plot.loc[in_cluster, 'PC1'],
        df_plot.loc[in_cluster, 'PC2'],
        label=f'Cluster {cluster}',
    )

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('Clusters Visualization')
plt.legend()
plt.show()


In [None]:
#7 KNN
from sklearn.neighbors import KNeighborsClassifier  # New import

# Imputation statistics are learned from the training split and reused on the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 10-neighbour classifier with equal-weight votes; the neighbour-search
# algorithm is left for scikit-learn to choose.
knn_settings = dict(n_neighbors=10, weights='uniform', algorithm='auto')
model = KNeighborsClassifier(**knn_settings)
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_test_imputed)

# Held-out evaluation.
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Confusion-matrix heatmap.
plt.figure(figsize=(8, 6))
ax_knn = sb.heatmap(confusion, annot=True, fmt='d', cmap='Reds')
ax_knn.set_title('Confusion Matrix - KNN')
ax_knn.set_xlabel('Predicted Label')
ax_knn.set_ylabel('True Label')
plt.show()

In [None]:
#8 Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier  # New import

# Imputation statistics are learned from the training split and reused on the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 100 boosting stages of depth-3 trees with a 0.1 learning rate, seeded for reproducibility.
gb_settings = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model = GradientBoostingClassifier(**gb_settings)
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_test_imputed)

# Held-out evaluation.
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Confusion-matrix heatmap.
plt.figure(figsize=(8, 6))
ax_gb = sb.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
ax_gb.set_title('Confusion Matrix - Gradient Boosting')
ax_gb.set_xlabel('Predicted Label')
ax_gb.set_ylabel('True Label')
plt.show()

In [None]:
#9 SVC
# Grid-searched support vector classifier, evaluated on the held-out split.

# Imputation statistics are learned from the training split and reused on the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Search space: penalty C, kernel coefficient gamma, and kernel type.
param_grid_svc = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'kernel': ['rbf', 'linear']
}

# NOTE(review): probability=True enables predict_proba, but nothing in this cell
# calls it — confirm it is needed before paying the extra fitting cost.
svc = SVC(probability=True)
grid_svc = GridSearchCV(svc, param_grid_svc, cv=5, scoring='accuracy')
grid_svc.fit(X_train_imputed, y_train)

# Score the refitted best estimator on the held-out split.
best_svc = grid_svc.best_estimator_
y_pred_svc = best_svc.predict(X_test_imputed)
confusion_svc = confusion_matrix(y_test, y_pred_svc)

print("SVC Best Parameters:", grid_svc.best_params_)
print("SVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("SVC F1 Score:", f1_score(y_test, y_pred_svc))


# The stray `drive.mount('/content/drive')` call that used to sit in the middle
# of this plot was removed: `drive` is never imported in this file, so it raised
# NameError before the figure could be shown.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_svc, annot=True, fmt='d', cmap='Reds')
plt.title('SVC Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()