In [1]:
# Install all the required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import joblib

In [3]:
# Merge the split CSV files
encoded_parts = ['output_chunks/clean_df_3_single_weapon_part1.csv', 'output_chunks/clean_df_3_single_weapon_part2.csv']
dataframes = [pd.read_csv(part) for part in encoded_parts]
df_encoded =  pd.concat(dataframes, ignore_index=True)

In [4]:
# Check data is correct
print("First 5 rows:")
df_encoded.head()

First 5 rows:


Unnamed: 0,Agency Type,Victim Sex,Victim Age,Victim Ethnicity,Perpetrator Sex,Perpetrator Ethnicity,Weapon Category,Relationship Category,Region,Season
0,Other Police,Female,26,Not Hispanic,Male,Not Hispanic,Non-Firearm,Lover,West,Autumn
1,Sheriff,Male,23,Not Hispanic,Male,Not Hispanic,Firearm,Acquaintance,South,Summer
2,Sheriff,Male,42,Not Hispanic,Female,Not Hispanic,Firearm,Lover,South,Summer
3,Sheriff,Male,33,Not Hispanic,Male,Not Hispanic,Firearm,Acquaintance,South,Summer
4,Sheriff,Male,46,Not Hispanic,Male,Not Hispanic,Firearm,Family,South,Autumn


In [6]:
## Feature Engineering
#1. Encoding categorical variables using Label Encoding
#2. Creating a binary target variable for weapon type
#3. Scaling numerical features


In [7]:
# Initialize encoders for categorical columns
categorical_columns = [
    'Agency Type', 'Victim Sex', 'Victim Age', 'Victim Ethnicity',
    'Perpetrator Sex', 'Perpetrator Ethnicity',
    'Relationship Category', 'Region', 'Season'
]

# Initialize label encoders
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Force binary mapping for weapon category
weapon_mapping = {'Non-Firearm': 0, 'Firearm': 1}
df_encoded['Weapon Category'] = df_encoded['Weapon Category'].map(weapon_mapping)

print("Weapon Category Mapping:")
for category, value in weapon_mapping.items():
    print(f"'{category}' -> {value}")

Weapon Category Mapping:
'Non-Firearm' -> 0
'Firearm' -> 1


In [9]:
# Split data for training

# Prepare features and target
X = df_encoded.drop(columns=['Weapon Category'])
y = df_encoded['Weapon Category']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)


In [10]:
# Apply SMOTE for handling imbalanced data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [12]:
# Initialize and train Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees in the forest
    max_depth=20,      # Maximum depth of each tree
    min_samples_split=6,  # Minimum samples required to split a node
    min_samples_leaf=3,   # Minimum samples required at each leaf node
    max_features=5,       # Maximum number of features to consider for splits
    random_state=42,
    class_weight='balanced'  # Handle class imbalance
)

print("Training Random Forest model...")
rf_model.fit(X_train_resampled, y_train_resampled)
print("done!")

Training Random Forest model...
done!


In [13]:
# Make predictions
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

# Calculate metrics
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
class_report = classification_report(y_test, y_pred)

# Print results
print("\nModel Performance Metrics:")
print("-------------------------")
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nConfusion Matrix:")
print("----------------")
print("                    Predicted: Non-Firearm    Predicted: Firearm")
print(f"Actual: Non-Firearm {conf_matrix[0][0]:<6}                    {conf_matrix[0][1]:<6}")
print(f"Actual: Firearm     {conf_matrix[1][0]:<6}                    {conf_matrix[1][1]:<6}")

print("\nDetailed Classification Report:")
print("-----------------------------")
print(class_report)


Model Performance Metrics:
-------------------------
Accuracy: 0.6441
Recall: 0.6441
Precision: 0.6512
F1 Score: 0.6473

Confusion Matrix:
----------------
                    Predicted: Non-Firearm    Predicted: Firearm
Actual: Non-Firearm 7543                      7614  
Actual: Firearm     8775                      22122 

Detailed Classification Report:
-----------------------------
              precision    recall  f1-score   support

           0       0.46      0.50      0.48     15157
           1       0.74      0.72      0.73     30897

    accuracy                           0.64     46054
   macro avg       0.60      0.61      0.60     46054
weighted avg       0.65      0.64      0.65     46054



In [16]:
# Run a sample prediction
def make_prediction(
    agency, victim_sex, victim_age, victim_ethnicity,
    perpetrator_sex, perpetrator_ethnicity, relationship,
    region, season
):
    # Encode input features
    input_features = [
        label_encoders['Agency Type'].transform([agency])[0],
        label_encoders['Victim Sex'].transform([victim_sex])[0],
        label_encoders['Victim Age'].transform([str(victim_age)])[0],
        label_encoders['Victim Ethnicity'].transform([victim_ethnicity])[0],
        label_encoders['Perpetrator Sex'].transform([perpetrator_sex])[0],
        label_encoders['Perpetrator Ethnicity'].transform([perpetrator_ethnicity])[0],
        label_encoders['Relationship Category'].transform([relationship])[0],
        label_encoders['Region'].transform([region])[0],
        label_encoders['Season'].transform([season])[0]
    ]
    
    # Scale features
    ##input_features_scaled = scaler.transform([input_features])
    input_df = pd.DataFrame([input_features], columns=X.columns)
    input_features_scaled = scaler.transform(input_df)

    # Make prediction
    pred = rf_model.predict(input_features_scaled)[0]
    pred_proba = rf_model.predict_proba(input_features_scaled)[0]
    
    # Convert prediction to label
    prediction_label = "Firearm" if pred == 1 else "Non-Firearm"
    
    print("\nPrediction for Single Case:")
    print("---------------------------")
    print(f"Predicted Category: {prediction_label}")
    print(f"Probability of Non-Firearm: {pred_proba[0]:.4f}")
    print(f"Probability of Firearm: {pred_proba[1]:.4f}")

# Example usage of prediction function
make_prediction(
    agency="Municipal Police",
    victim_sex="Male",
    victim_age="30",
    victim_ethnicity="Hispanic",
    perpetrator_sex="Male",
    perpetrator_ethnicity="Hispanic",
    relationship="Acquaintance",
    region="Midwest",
    season="Summer"
)


Prediction for Single Case:
---------------------------
Predicted Category: Firearm
Probability of Non-Firearm: 0.4832
Probability of Firearm: 0.5168
