In [6]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import os

# Read the merged crash records from the mounted Drive folder.
# NOTE(review): the saved cell output echoes this read_csv line, which looks
# like the tail of a pandas warning raised here — confirm what it was and pin
# dtypes if it concerns mixed-type columns.
df = pd.read_csv('/content/drive/MyDrive/merged_dataset.csv')

# Every column listed here feeds either a model feature or the risk label,
# so a row missing any one of them is discarded.
required_columns = [
    'POSTED_SPEED_LIMIT', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
    'ROADWAY_SURFACE_COND', 'DAMAGE', 'NUM_UNITS', 'CRASH_HOUR',
    'FIRST_CRASH_TYPE', 'INJURIES_TOTAL',
]
df = df.dropna(subset=required_columns)

# Parse DAMAGE to numeric
def parse_damage(val):
    """Convert a DAMAGE category string into a representative dollar amount.

    Parameters
    ----------
    val : object
        Raw DAMAGE cell. Strings are inspected after thousands separators
        (',') are removed; any non-string value goes to the fallback.

    Returns
    -------
    int or float
        2000 when the text contains 'OVER', 500 when it contains 'LESS',
        the midpoint of the two bounds for a "low - high" range, and 1000
        for anything that cannot be interpreted.
    """
    fallback = 1000
    if not isinstance(val, str):
        return fallback

    text = val.replace(',', '')
    if 'OVER' in text:
        return 2000
    if 'LESS' in text:
        return 500
    if '-' in text:
        # Split on the bare hyphen so "$501-$1500" and "$501 - $1500" are
        # both accepted; int() tolerates the surrounding whitespace.
        try:
            low, high = (int(part) for part in text.replace('$', '').split('-'))
        except ValueError:
            # Wrong number of bounds or a non-numeric bound: treat the value
            # as unrecognised rather than aborting the whole column.
            return fallback
        return (low + high) / 2
    return fallback

# Element-wise conversion of each DAMAGE entry to its numeric estimate.
df['DAMAGE_VALUE'] = df['DAMAGE'].map(parse_damage)

# Create RISK_LEVEL label
def assign_risk(row):
    """Return the risk label ('High', 'Medium' or 'Low') for one crash row.

    `row` must support key lookup for 'INJURIES_TOTAL' and 'DAMAGE_VALUE'.
    Any injury makes the crash 'High'; otherwise a damage value of at least
    1500 makes it 'Medium', and everything else is 'Low'.
    """
    # Injuries take precedence over property damage.
    if row['INJURIES_TOTAL'] >= 1:
        return 'High'
    return 'Medium' if row['DAMAGE_VALUE'] >= 1500 else 'Low'

# Label every crash row with the rule implemented in assign_risk.
df['RISK_LEVEL'] = df.apply(assign_risk, axis=1)

# Select features and label
# NOTE(review): DAMAGE_VALUE is a model input and is also used by assign_risk
# to derive RISK_LEVEL, so the classifier can partly read the label off this
# feature — confirm that is intended before relying on the reported scores.
features = [
    'POSTED_SPEED_LIMIT', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
    'ROADWAY_SURFACE_COND', 'DAMAGE_VALUE', 'NUM_UNITS', 'CRASH_HOUR',
    'FIRST_CRASH_TYPE'
]
label = 'RISK_LEVEL'

# Work on an explicit copy under its own name. Rebinding `df` to the subset
# would throw away the raw columns (DAMAGE, INJURIES_TOTAL, ...) and replace
# the category strings with integer codes, which breaks any later cell that
# preprocesses `df` again. The copy also guarantees the in-place encoding
# below never writes into a slice of `df`.
model_df = df[features + [label]].copy()

# Encode categorical features; the fitted encoders are kept so the same
# string -> integer mapping can be applied at prediction time.
encoders = {}
for col in ['WEATHER_CONDITION', 'LIGHTING_CONDITION', 'ROADWAY_SURFACE_COND', 'FIRST_CRASH_TYPE']:
    le = LabelEncoder()
    model_df[col] = le.fit_transform(model_df[col])
    encoders[col] = le

# Encode target label
target_encoder = LabelEncoder()
model_df[label] = target_encoder.fit_transform(model_df[label])

# Train-test split (80/20, fixed seed for reproducibility)
X = model_df[features]
y = model_df[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model; class_weight='balanced' reweights classes inversely to their
# frequency in the training labels.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Evaluate model on the held-out 20%
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=target_encoder.classes_)
print("Classification Report:\n", report)

# Save model and encoders
os.makedirs("risk_model", exist_ok=True)
joblib.dump(clf, "risk_model/model.pkl")
joblib.dump(encoders, "risk_model/encoders.pkl")
joblib.dump(target_encoder, "risk_model/target_encoder.pkl")

print("Model training complete. Artifacts saved in 'risk_model' folder.")



  df = pd.read_csv('/content/drive/MyDrive/merged_dataset.csv')


Classification Report:
               precision    recall  f1-score   support

        High       0.55      0.57      0.56    130776
         Low       0.93      0.98      0.95    167254
      Medium       0.85      0.81      0.83    297394

    accuracy                           0.80    595424
   macro avg       0.78      0.79      0.78    595424
weighted avg       0.81      0.80      0.80    595424

Model training complete. Artifacts saved in 'risk_model' folder.


In [None]:
# --- Preprocessing ---
# Start from a fresh read of the raw file so this cell does not depend on
# whatever an earlier cell left in `df`: the columns used below (e.g.
# INJURIES_FATAL, INJURIES_TOTAL, DAMAGE) must be present, and the category
# columns must still hold their original string values, not integer codes.
df = pd.read_csv('/content/drive/MyDrive/merged_dataset.csv')

# Drop rows with essential missing values
df = df.dropna(subset=[
    'INJURIES_FATAL', 'INJURIES_TOTAL', 'DAMAGE',
    'WEATHER_CONDITION', 'ROADWAY_SURFACE_COND',
    'LIGHTING_CONDITION', 'POSTED_SPEED_LIMIT'
])

# Clean DAMAGE column into a numeric format
def parse_damage(val):
    """Map a DAMAGE category string to a representative dollar amount.

    Parameters
    ----------
    val : object
        Raw DAMAGE cell. Non-string values are treated as unrecognised.

    Returns
    -------
    int or float
        2000 if the text contains 'OVER', 500 if it contains 'LESS', the
        mean of the two bounds for a "low - high" range, otherwise 1000.
    """
    if not isinstance(val, str):
        # A non-string (e.g. NaN) cannot be searched with `in`; use the
        # fallback instead of raising TypeError.
        return 1000

    # Remove thousands separators first: int('1,500') raises ValueError, so
    # a range such as "$501 - $1,500" could not be parsed without this.
    cleaned = val.replace(',', '')
    if 'OVER' in cleaned:
        return 2000
    elif 'LESS' in cleaned:
        return 500
    elif '-' in cleaned:
        try:
            bounds = [int(piece) for piece in cleaned.replace('$', '').split('-')]
        except ValueError:
            # Non-numeric bound: treat the value as unrecognised.
            return 1000
        if len(bounds) != 2:
            return 1000
        return sum(bounds) / 2
    else:
        return 1000

# Derive the numeric damage estimate for every row from its DAMAGE text.
df['DAMAGE_VALUE'] = df['DAMAGE'].map(parse_damage)

# Create risk level label
def label_risk(row):
    """Return 'High', 'Medium' or 'Low' for a single crash row.

    `row` must support key lookup for 'INJURIES_FATAL', 'INJURIES_TOTAL'
    and 'DAMAGE_VALUE'. A fatality yields 'High'; otherwise any injury, or a
    damage value of at least 1500, yields 'Medium'; the rest are 'Low'.
    """
    # Fatal crashes outrank every other criterion.
    if row['INJURIES_FATAL'] > 0:
        return 'High'
    # Non-fatal crashes with an injury or substantial damage.
    if row['INJURIES_TOTAL'] > 0 or row['DAMAGE_VALUE'] >= 1500:
        return 'Medium'
    return 'Low'

# Label every crash row with the rule implemented in label_risk.
df['RISK_LEVEL'] = df.apply(label_risk, axis=1)

# --- Feature selection ---
# NOTE(review): DAMAGE_VALUE is a model input and is also used by label_risk
# to derive RISK_LEVEL, so part of the label is recoverable directly from
# this feature — confirm that is intended.
features = ['WEATHER_CONDITION', 'LIGHTING_CONDITION', 'ROADWAY_SURFACE_COND',
            'POSTED_SPEED_LIMIT', 'DAMAGE_VALUE']

# Encode and train on an explicit copy instead of rebinding `df`. Overwriting
# `df` with the encoded subset drops the raw columns this cell's own
# preprocessing needs (INJURIES_FATAL, INJURIES_TOTAL, DAMAGE), so the cell
# could not be re-run against the same frame.
model_df = df[features + ['RISK_LEVEL']].copy()

# --- Encode categorical variables ---
# The fitted encoders are kept so the same mapping can be reused at
# prediction time.
encoders = {}
for col in ['WEATHER_CONDITION', 'LIGHTING_CONDITION', 'ROADWAY_SURFACE_COND']:
    le = LabelEncoder()
    model_df[col] = le.fit_transform(model_df[col])
    encoders[col] = le

# Encode target
target_encoder = LabelEncoder()
model_df['RISK_LEVEL'] = target_encoder.fit_transform(model_df['RISK_LEVEL'])

# --- Train the model ---
# The classifier is fitted on every labelled row; this cell holds nothing
# out and reports no evaluation metrics.
X = model_df[features]
y = model_df['RISK_LEVEL']

clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X, y)

# --- Save model and encoders ---
# Artifacts are written to the current working directory.
joblib.dump(clf, 'model.pkl')
joblib.dump(encoders, 'encoder.pkl')
joblib.dump(target_encoder, 'target_encoder.pkl')

print("✅ Model and encoders saved successfully.")