In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the dataset (the CSV is read from the notebook's working directory)
data = pd.read_csv('AccidentsFull.csv')

# Create the INJURY variable: 'Yes' when MAX_SEV_IR is 1 or 2, otherwise 'No'.
# Vectorised membership test instead of a per-row Python lambda; values that are
# missing or anything other than 1/2 map to 'No', exactly as before.
data['INJURY'] = np.where(data['MAX_SEV_IR'].isin([1, 2]), 'Yes', 'No')

# Check the majority class: with no predictor information this is the prediction to make
majority_class = data['INJURY'].value_counts().idxmax()
print(f"If no information is available, predict: {majority_class}")

# Select the first 12 records with relevant columns.
# .copy() makes the subset an independent frame: columns are added to it further
# down (e.g. COUNT), and without the copy pandas raises SettingWithCopyWarning
# because the subset is a chained slice of `data`.
subset_data = data.iloc[:12][['INJURY', 'WEATHER_R', 'TRAF_CON_R']].copy()

# Part (a): Pivot table of injury counts per WEATHER_R (rows) / TRAF_CON_R (columns)
def _count_injuries(injury_values):
    """Return how many entries of ``injury_values`` equal 'Yes'."""
    return (injury_values == 'Yes').sum()


pivot_table = subset_data.pivot_table(
    index='WEATHER_R',
    columns='TRAF_CON_R',
    values='INJURY',
    aggfunc=_count_injuries,
)
# Combinations that do not occur in the subset come back as NaN; show them as 0.
pivot_table = pivot_table.fillna(0)
print("\nPart (a): Pivot Table")
print(pivot_table)

# Part (b): Exact Bayes conditional probabilities P(INJURY = Yes | WEATHER_R, TRAF_CON_R)
predictor_columns = ['WEATHER_R', 'TRAF_CON_R']
subset_data['COUNT'] = 1  # one unit per record, so group sums are record counts

# Record counts per predictor combination: all records, then injury records only.
combination_counts = subset_data.groupby(predictor_columns)['COUNT'].sum()
injured_records = subset_data.loc[subset_data['INJURY'].eq('Yes')]
injury_yes_counts = injured_records.groupby(predictor_columns)['COUNT'].sum()

# Index alignment yields NaN for combinations with no injury records; treat those as 0.
injury_share = injury_yes_counts.div(combination_counts).fillna(0)
bayes_probabilities = injury_share.reset_index(name='P(INJURY=Yes|WEATHER_R, TRAF_CON_R)')
print("\nPart (b): Bayes Probabilities")
print(bayes_probabilities)

# Part (c): Classify every record from its exact Bayes probability, cutoff 0.5
probability_column = 'P(INJURY=Yes|WEATHER_R, TRAF_CON_R)'

# Attach each record's combination probability (left join keeps all subset rows).
subset_data = pd.merge(
    subset_data,
    bayes_probabilities,
    how='left',
    on=['WEATHER_R', 'TRAF_CON_R'],
)

# Probabilities at or above the cutoff are labelled 'Yes'; everything else is 'No'.
meets_cutoff = subset_data[probability_column] >= 0.5
subset_data['CLASSIFICATION'] = meets_cutoff.map({True: 'Yes', False: 'No'})
print("\nPart (c): Classifications")
print(subset_data[['WEATHER_R', 'TRAF_CON_R', probability_column, 'CLASSIFICATION']])

# Part (d): Naive Bayes conditional probability for WEATHER_R = 1 and TRAF_CON_R = 1
# Exact-Bayes counts for the same combination; kept so existing names stay available
# and the two estimates can be compared.
weather_condition = subset_data[(subset_data['WEATHER_R'] == 1) & (subset_data['TRAF_CON_R'] == 1)]
injury_yes_count = weather_condition[weather_condition['INJURY'] == 'Yes'].shape[0]
total_count = weather_condition.shape[0]

# Naive Bayes assumes the predictors are independent within each class, so for
# every class c the score is P(c) * P(WEATHER_R=1 | c) * P(TRAF_CON_R=1 | c).
# (The previous code divided the joint counts, which is the exact Bayes estimate
# already reported in Part (b), not the naive one.)
class_scores = {}
for injury_class, class_records in subset_data.groupby('INJURY'):
    class_prior = len(class_records) / len(subset_data)
    weather_likelihood = (class_records['WEATHER_R'] == 1).mean()
    traffic_likelihood = (class_records['TRAF_CON_R'] == 1).mean()
    class_scores[injury_class] = class_prior * weather_likelihood * traffic_likelihood

# Normalise the 'Yes' score by the sum over all classes; if no class gives the
# combination any probability mass, fall back to 0 as the original code did.
evidence = sum(class_scores.values())
probability_naive = float(class_scores.get('Yes', 0) / evidence) if evidence > 0 else 0
print(f"\nPart (d): Naive Bayes Probability (WEATHER_R=1, TRAF_CON_R=1): {probability_naive}")

# Part (e): Naive Bayes classification using scikit-learn
# CategoricalNB expects every feature encoded as integer codes 0..n-1. Map the raw
# values to dense category codes instead of converting them to strings: raw codes
# that start at 1 would add a never-observed category 0 and distort the smoothing.
X_subset = subset_data[['WEATHER_R', 'TRAF_CON_R']].apply(
    lambda column: column.astype('category').cat.codes
)
y_subset = subset_data['INJURY']

nb_classifier = CategoricalNB()
nb_classifier.fit(X_subset, y_subset)
predicted_probabilities = nb_classifier.predict_proba(X_subset)
predicted_classes = nb_classifier.predict(X_subset)

# predict_proba columns follow nb_classifier.classes_; look the 'Yes' column up
# rather than assuming it is column 1 (it would not exist if only one class is present).
class_labels = list(nb_classifier.classes_)
if 'Yes' in class_labels:
    subset_data['SCIKIT_PROB_YES'] = predicted_probabilities[:, class_labels.index('Yes')]
else:
    subset_data['SCIKIT_PROB_YES'] = 0.0
subset_data['SCIKIT_CLASSIFICATION'] = predicted_classes
print("\nPart (e): Scikit-learn Classifications")
print(subset_data[['WEATHER_R', 'TRAF_CON_R', 'SCIKIT_PROB_YES', 'SCIKIT_CLASSIFICATION']])

# Partition the data into training (60%) and validation (40%) sets
# Predictors are encoded as dense integer category codes (0..n-1) on the full data
# before splitting, which is the input format CategoricalNB expects and keeps the
# codes consistent between the training and validation rows.
X = data[['WEATHER_R', 'TRAF_CON_R']].apply(lambda column: column.astype('category').cat.codes)
y = data['INJURY']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

# Part (f): Use relevant predictors (e.g., WEATHER_R and TRAF_CON_R)

# Part (g): Train Naive Bayes on the complete training set
# min_categories tells the model how many categories each feature has overall, so a
# category that happens to appear only in validation rows can still be scored.
nb_full_classifier = CategoricalNB(min_categories=X.nunique().tolist())
nb_full_classifier.fit(X_train, y_train)
y_val_pred = nb_full_classifier.predict(X_val)

# Confusion matrix (rows = actual class, columns = predicted class).
# Passing labels fixes the order to No, Yes and guarantees a 2x2 matrix even if
# one class is never predicted.
conf_matrix = confusion_matrix(y_val, y_val_pred, labels=['No', 'Yes'])
print("\nPart (g): Confusion Matrix")
print(conf_matrix)

# Part (h): Overall error for validation set
validation_accuracy = accuracy_score(y_val, y_val_pred)
validation_error = 1 - validation_accuracy
print(f"\nPart (h): Overall Error: {validation_error:.4f}")

# Part (i): Percent improvement over naive rule
# The naive rule always predicts the majority class of the training data. Score it
# on the validation records, the same records the model is scored on; the earlier
# version used the training-set class share, which is not comparable.
naive_rule_class = y_train.value_counts().idxmax()
naive_rule_accuracy = (y_val == naive_rule_class).mean()
improvement = (validation_accuracy - naive_rule_accuracy) / naive_rule_accuracy * 100
print(f"\nPart (i): Percent Improvement Over Naive Rule: {improvement:.2f}%")

# Part (j): Print the explanation for P(INJURY = No | SPD_LIM = 5) being zero.
# This is fixed text; nothing is computed from the data here.
part_j_lines = (
    "\nPart (j): Explanation",
    "If SPD_LIM = 5 has no occurrences of 'INJURY = No', the conditional probability is 0 due to lack of data for this specific condition.",
)
print(*part_j_lines, sep="\n")

If no information is available, predict: Yes

Part (a): Pivot Table
TRAF_CON_R    0    1    2
WEATHER_R                
1           2.0  0.0  0.0
2           1.0  0.0  0.0

Part (b): Bayes Probabilities
   WEATHER_R  TRAF_CON_R  P(INJURY=Yes|WEATHER_R, TRAF_CON_R)
0          1           0                             0.666667
1          1           1                             0.000000
2          1           2                             0.000000
3          2           0                             0.166667
4          2           1                             0.000000

Part (c): Classifications
    WEATHER_R  TRAF_CON_R  P(INJURY=Yes|WEATHER_R, TRAF_CON_R) CLASSIFICATION
0           1           0                             0.666667            Yes
1           2           0                             0.166667             No
2           2           1                             0.000000             No
3           1           1                             0.000000             No
4       