In [13]:
import pandas as pd
import numpy as np

# Metrics
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    precision_recall_fscore_support
)

pd.set_option("display.max_columns", 120)

df = pd.read_csv('/Users/dhanyaelsajames/DTSC_3602/dtsc-3602_project/cleaned_sexual_assault_dataset.csv')
print("shape:",df.shape)
df.head(3)

shape: (25819, 22)


Unnamed: 0,DR_NO,DATE OCC,Date Rptd,TIME OCC,AREA,AREA NAME,Rpt Dist No,Crm Cd,Crm Cd Desc,Vict Age,Vict Sex,Vict Descent,Premis Cd,Premis Desc,LOCATION,LAT,LON,SexualAssault,Age_Group,DATE_OCC_dt,datetime_occurrence,Premise_Category
0,202013579,08/13/2020 12:00:00 AM,08/18/2020 12:00:00 AM,100,20,Olympic,2014,860,BATTERY WITH SEXUAL CONTACT,49,M,O,501.0,SINGLE FAMILY DWELLING,4000 W 2ND ST,34.0712,-118.3016,1,45+,2020-08-13,2020-08-13 01:00:00,Indoor
1,202100938,11/26/2020 12:00:00 AM,11/26/2020 12:00:00 AM,500,21,Topanga,2138,236,INTIMATE PARTNER - AGGRAVATED ASSAULT,40,M,B,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",20400 SHERMAN WY,34.2011,-118.5794,1,30-44,2020-11-26,2020-11-26 05:00:00,Indoor
2,201606018,02/18/2020 12:00:00 AM,02/19/2020 12:00:00 AM,1600,16,Foothill,1638,236,INTIMATE PARTNER - AGGRAVATED ASSAULT,23,F,O,501.0,SINGLE FAMILY DWELLING,10500 TUJUNGA CANYON BL,34.2596,-118.2927,1,18-29,2020-02-18,2020-02-18 16:00:00,Indoor


In [24]:
# ✅ Ensure datetime format
violent_df['datetime_occurrence'] = pd.to_datetime(violent_df['datetime_occurrence'], errors='coerce')

# ✅ Extract time components
violent_df['occ_hour'] = violent_df['datetime_occurrence'].dt.hour            # Hour of incident
violent_df['occ_dow'] = violent_df['datetime_occurrence'].dt.dayofweek        # Day of week (0 = Mon ... 6 = Sun)
violent_df['is_weekend'] = (violent_df['occ_dow'] >= 5).astype(int)           # Weekend flag

# ✅ Create time of day bins
def time_bin_func(hour):
    if pd.isna(hour): 
        return 'Unknown'
    hour = int(hour)
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

violent_df['time_bin'] = violent_df['occ_hour'].apply(time_bin_func)
violent_df['is_night'] = violent_df['time_bin'].isin(['Evening', 'Night']).astype(int)



# ✅ Display newly engineered datetime features in a clean table format
display(
    violent_df[['datetime_occurrence', 'occ_hour', 'occ_dow', 'is_weekend', 'time_bin', 'is_night']]
    .head(10)
    .style.set_table_styles(
        [{'selector': 'th', 'props': [('text-align', 'left')]},
         {'selector': 'td', 'props': [('text-align', 'left')]}]
    )
)

Unnamed: 0,datetime_occurrence,occ_hour,occ_dow,is_weekend,time_bin,is_night
0,2020-08-13 01:00:00,1,3,0,Night,1
1,2020-11-26 05:00:00,5,3,0,Morning,0
2,2020-02-18 16:00:00,16,1,0,Afternoon,0
3,2020-09-21 10:50:00,10,0,0,Morning,0
4,2020-02-22 00:01:00,0,5,1,Night,1
5,2020-05-08 12:30:00,12,4,0,Afternoon,0
6,2020-11-06 17:30:00,17,4,0,Evening,1
7,2020-07-28 09:30:00,9,1,0,Morning,0
8,2020-12-16 18:20:00,18,2,0,Evening,1
9,2020-12-27 19:30:00,19,6,1,Evening,1


In [25]:
# ======================================
# 📌 SECTION 3: Define Features & Target
# ======================================
target = 'Crm Cd Desc'
y = violent_df[target].copy()

features = [
    'Vict Age', 'Age_Group', 'Vict Sex', 'Vict Descent',
    'Premise_Category', 'AREA NAME',
    'occ_hour', 'occ_dow', 'is_weekend', 'time_bin', 'is_night',
    'LAT', 'LON'
]

# Filter to ensure only available features are used
features = [f for f in features if f in violent_df.columns]
X = violent_df[features].copy()

print("✅ Target and feature set ready.")
print("Target Variable:", target)
print("Number of Features:", len(features))
print("Features:", features)
X.head(3)


✅ Target and feature set ready.
Target Variable: Crm Cd Desc
Number of Features: 13
Features: ['Vict Age', 'Age_Group', 'Vict Sex', 'Vict Descent', 'Premise_Category', 'AREA NAME', 'occ_hour', 'occ_dow', 'is_weekend', 'time_bin', 'is_night', 'LAT', 'LON']


Unnamed: 0,Vict Age,Age_Group,Vict Sex,Vict Descent,Premise_Category,AREA NAME,occ_hour,occ_dow,is_weekend,time_bin,is_night,LAT,LON
0,49,45+,M,O,Indoor,Olympic,1,3,0,Night,1,34.0712,-118.3016
1,40,30-44,M,B,Indoor,Topanga,5,3,0,Morning,0,34.2011,-118.5794
2,23,18-29,F,O,Indoor,Foothill,16,1,0,Afternoon,0,34.2596,-118.2927


In [30]:
# ======================================
# 📌 SECTION 4: Train/Test Split
# ======================================

from sklearn.model_selection import train_test_split

# Define target and features (make sure these were already set in Section 3)
target = 'Crm Cd Desc'
y = violent_df[target].copy()

features = [
    'Vict Age', 'Age_Group', 'Vict Sex', 'Vict Descent',
    'Premise_Category', 'AREA NAME',
    'occ_hour', 'occ_dow', 'is_weekend', 'time_bin', 'is_night',
    'LAT', 'LON'
]

# Keep only available valid columns
features = [f for f in features if f in violent_df.columns]

# Define predictor matrix X
X = violent_df[features].copy()

# 🔹 Create stratified train-test split to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.3,        # 30% for testing
    random_state=42,      # ensures reproducibility
    stratify=y            # maintains the same class proportions in train/test
)

print("✅ Train/Test split created successfully")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True).round(3))
print("\nClass distribution in test set:")
print(y_test.value_counts(normalize=True).round(3))


✅ Train/Test split created successfully
Training set shape: (18073, 13)
Testing set shape: (7746, 13)

Class distribution in training set:
Crm Cd Desc
INTIMATE PARTNER - AGGRAVATED ASSAULT    0.490
OTHER ASSAULT                            0.163
BATTERY WITH SEXUAL CONTACT              0.161
RAPE, FORCIBLE                           0.145
ORAL COPULATION                          0.028
RAPE, ATTEMPTED                          0.012
Name: proportion, dtype: float64

Class distribution in test set:
Crm Cd Desc
INTIMATE PARTNER - AGGRAVATED ASSAULT    0.490
OTHER ASSAULT                            0.163
BATTERY WITH SEXUAL CONTACT              0.161
RAPE, FORCIBLE                           0.145
ORAL COPULATION                          0.028
RAPE, ATTEMPTED                          0.012
Name: proportion, dtype: float64


In [32]:
# ======================================
# 📌 Baseline: Majority Class Model
# ======================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# ---- Ensure we have X, y, and a split (uses existing split if present) ----
try:
    _ = X_train, X_test, y_train, y_test  # will raise if not defined
except NameError:
    # Fallback: define target/features and split here
    target = 'Crm Cd Desc'
    features = [
        'Vict Age','Age_Group','Vict Sex','Vict Descent','Premise_Category','AREA NAME',
        'occ_hour','occ_dow','is_weekend','time_bin','is_night','LAT','LON'
    ]
    features = [f for f in features if f in violent_df.columns]
    X = violent_df[features].copy()
    y = violent_df[target].copy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42, stratify=y
    )

# ---- Majority class from the TRAINING set only (to avoid leakage) ----
majority_class = y_train.value_counts().idxmax()
print(f"Majority class (from train): {majority_class!r}")

# ---- DummyClassifier: always predicts the most frequent class ----
dummy = DummyClassifier(strategy='most_frequent', random_state=42)
dummy.fit(X_train, y_train)
y_pred = dummy.predict(X_test)

# ---- Metrics (macro to be fair across classes) ----
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
rec  = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1   = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("\n=== Majority Class Baseline: Test Performance ===")
print(f"Accuracy       : {acc:.3f}")
print(f"Precision (Mac): {prec:.3f}")
print(f"Recall (Macro) : {rec:.3f}")
print(f"F1 Score (Mac) : {f1:.3f}")

# ---- Detailed per-class report ----
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# ---- Confusion matrix as a readable table ----
labels = sorted(y_test.unique().tolist())
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=pd.Index(labels, name="True"), columns=pd.Index(labels, name="Pred"))
print("\nConfusion Matrix (rows=True, cols=Pred):")
display(cm_df)

# ---- Simple fairness check by gender (accuracy per subgroup, if available) ----
if 'Vict Sex' in X_test.columns:
    print("\nFairness — subgroup accuracy by Vict Sex:")
    subgroup_acc = {}
    for g in ['F', 'M']:
        mask = X_test['Vict Sex'] == g
        if mask.any():
            subgroup_acc[g] = accuracy_score(y_test[mask], y_pred[mask])
            print(f"  {g}: {subgroup_acc[g]:.3f}")
    if {'F','M'}.issubset(subgroup_acc):
        gap = abs(subgroup_acc['F'] - subgroup_acc['M'])
        print(f"  Accuracy gap (|F - M|): {gap:.3f}")


Majority class (from train): 'INTIMATE PARTNER - AGGRAVATED ASSAULT'

=== Majority Class Baseline: Test Performance ===
Accuracy       : 0.490
Precision (Mac): 0.082
Recall (Macro) : 0.167
F1 Score (Mac) : 0.110

Classification Report:
                                       precision    recall  f1-score   support

          BATTERY WITH SEXUAL CONTACT       0.00      0.00      0.00      1244
INTIMATE PARTNER - AGGRAVATED ASSAULT       0.49      1.00      0.66      3796
                      ORAL COPULATION       0.00      0.00      0.00       218
                        OTHER ASSAULT       0.00      0.00      0.00      1266
                      RAPE, ATTEMPTED       0.00      0.00      0.00        96
                       RAPE, FORCIBLE       0.00      0.00      0.00      1126

                             accuracy                           0.49      7746
                            macro avg       0.08      0.17      0.11      7746
                         weighted avg       0.24   

Pred,BATTERY WITH SEXUAL CONTACT,INTIMATE PARTNER - AGGRAVATED ASSAULT,ORAL COPULATION,OTHER ASSAULT,"RAPE, ATTEMPTED","RAPE, FORCIBLE"
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BATTERY WITH SEXUAL CONTACT,0,1244,0,0,0,0
INTIMATE PARTNER - AGGRAVATED ASSAULT,0,3796,0,0,0,0
ORAL COPULATION,0,218,0,0,0,0
OTHER ASSAULT,0,1266,0,0,0,0
"RAPE, ATTEMPTED",0,96,0,0,0,0
"RAPE, FORCIBLE",0,1126,0,0,0,0



Fairness — subgroup accuracy by Vict Sex:
  F: 0.485
  M: 0.513
  Accuracy gap (|F - M|): 0.028


In [35]:
# ======================================
# 📌 LOGISTIC REGRESSION: Train, Test, Report
# ======================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# ---------------------------
# 0) Define target & features (uses existing objects if present)
# ---------------------------
try:
    X, y
except NameError:
    target = 'Crm Cd Desc'
    features = [
        'Vict Age','Age_Group','Vict Sex','Vict Descent',
        'Premise_Category','AREA NAME',
        'occ_hour','occ_dow','is_weekend','time_bin','is_night',
        'LAT','LON'
    ]
    # keep only available columns
    features = [f for f in features if f in violent_df.columns]
    X = violent_df[features].copy()
    y = violent_df[target].copy()

# ---------------------------
# 1) Train/Test split (stratified)
# ---------------------------
try:
    X_train, X_test, y_train, y_test
except NameError:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42, stratify=y
    )

# ---------------------------
# 2) Preprocessing
# ---------------------------
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_features     = X.select_dtypes(include=['int64','float64']).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
    ]
)

# ---------------------------
# 3) Model (Multinomial Logistic Regression)
# ---------------------------
logreg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=500,
    n_jobs=None # (set if using sklearn >=1.6 supports n_jobs here; otherwise leave None)
)

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', logreg)
])

# ---------------------------
# 4) Optional: 5-fold CV on training set (macro F1 for stability)
# ---------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(pipe, X_train, y_train, cv=cv,
                        scoring='f1_macro', n_jobs=-1)
print("CV (5-fold) Macro-F1: mean={:.3f}  std={:.3f}".format(cv_f1.mean(), cv_f1.std()))

# ---------------------------
# 5) Fit on train, evaluate on test
# ---------------------------
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
rec  = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1   = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("\n=== Logistic Regression — Test Performance ===")
print(f"Accuracy       : {acc:.3f}")
print(f"Precision (Mac): {prec:.3f}")
print(f"Recall (Macro) : {rec:.3f}")
print(f"F1 Score (Mac) : {f1:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix (readable table)
labels = sorted(y_test.unique().tolist())
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=pd.Index(labels, name="True"), columns=pd.Index(labels, name="Pred"))
print("\nConfusion Matrix (rows=True, cols=Pred):")
display(cm_df)

# ---------------------------
# 6) Simple fairness check: subgroup accuracy by Vict Sex
# ---------------------------
if 'Vict Sex' in X_test.columns:
    print("\nFairness — subgroup accuracy by Vict Sex:")
    subgroup = {}
    for g in ['F','M']:
        mask = (X_test['Vict Sex'] == g)
        if mask.any():
            subgroup[g] = accuracy_score(y_test[mask], y_pred[mask])
            print(f"  {g}: {subgroup[g]:.3f}")
    if {'F','M'}.issubset(subgroup):
        gap = abs(subgroup['F'] - subgroup['M'])
        print(f"  Accuracy gap (|F - M|): {gap:.3f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CV (5-fold) Macro-F1: mean=0.327  std=0.010


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression — Test Performance ===
Accuracy       : 0.598
Precision (Mac): 0.416
Recall (Macro) : 0.324
F1 Score (Mac) : 0.333

Classification Report:
                                       precision    recall  f1-score   support

          BATTERY WITH SEXUAL CONTACT       0.44      0.37      0.40      1244
INTIMATE PARTNER - AGGRAVATED ASSAULT       0.65      0.88      0.75      3796
                      ORAL COPULATION       0.36      0.02      0.03       218
                        OTHER ASSAULT       0.61      0.44      0.51      1266
                      RAPE, ATTEMPTED       0.00      0.00      0.00        96
                       RAPE, FORCIBLE       0.44      0.23      0.30      1126

                             accuracy                           0.60      7746
                            macro avg       0.42      0.32      0.33      7746
                         weighted avg       0.56      0.60      0.56      7746


Confusion Matrix (rows=True, cols=Pred):


Pred,BATTERY WITH SEXUAL CONTACT,INTIMATE PARTNER - AGGRAVATED ASSAULT,ORAL COPULATION,OTHER ASSAULT,"RAPE, ATTEMPTED","RAPE, FORCIBLE"
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BATTERY WITH SEXUAL CONTACT,460,520,1,121,0,142
INTIMATE PARTNER - AGGRAVATED ASSAULT,159,3344,3,185,0,105
ORAL COPULATION,39,103,4,30,0,42
OTHER ASSAULT,208,467,3,562,0,26
"RAPE, ATTEMPTED",16,62,0,2,0,16
"RAPE, FORCIBLE",170,674,0,22,0,260



Fairness — subgroup accuracy by Vict Sex:
  F: 0.568
  M: 0.707
  Accuracy gap (|F - M|): 0.139


In [36]:
# ======================================
# 📌 RANDOM FOREST: Train, Test, Report
# ======================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# ---------------------------
# 0) Define target & features (uses existing X/y if present)
# ---------------------------
try:
    X, y
except NameError:
    target = 'Crm Cd Desc'
    features = [
        'Vict Age','Age_Group','Vict Sex','Vict Descent',
        'Premise_Category','AREA NAME',
        'occ_hour','occ_dow','is_weekend','time_bin','is_night',
        'LAT','LON'
    ]
    features = [f for f in features if f in violent_df.columns]
    X = violent_df[features].copy()
    y = violent_df[target].copy()

# ---------------------------
# 1) Train/Test split (stratified)
# ---------------------------
try:
    X_train, X_test, y_train, y_test
except NameError:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42, stratify=y
    )

# ---------------------------
# 2) Preprocessing
# ---------------------------
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_features     = X.select_dtypes(include=['int64','float64']).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
    ]
)

# ---------------------------
# 3) Random Forest model (handles imbalance)
# ---------------------------
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    class_weight='balanced_subsample',  # helps with class imbalance
    random_state=42
)

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', rf)
])

# ---------------------------
# 4) Fit and evaluate on test
# ---------------------------
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
rec  = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1   = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("=== Random Forest — Test Performance ===")
print(f"Accuracy       : {acc:.3f}")
print(f"Precision (Mac): {prec:.3f}")
print(f"Recall (Macro) : {rec:.3f}")
print(f"F1 Score (Mac) : {f1:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix (readable table)
labels = sorted(y_test.unique().tolist())
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=pd.Index(labels, name="True"), columns=pd.Index(labels, name="Pred"))
print("\nConfusion Matrix (rows=True, cols=Pred):")
display(cm_df)

# ---------------------------
# 5) Fairness: subgroup accuracy by Vict Sex
# ---------------------------
if 'Vict Sex' in X_test.columns:
    print("\nFairness — subgroup accuracy by Vict Sex:")
    subgroup = {}
    for g in ['F','M']:
        mask = (X_test['Vict Sex'] == g)
        if mask.any():
            subgroup[g] = accuracy_score(y_test[mask], y_pred[mask])
            print(f"  {g}: {subgroup[g]:.3f}")
    if {'F','M'}.issubset(subgroup):
        gap = abs(subgroup['F'] - subgroup['M'])
        print(f"  Accuracy gap (|F - M|): {gap:.3f}")

# ---------------------------
# 6) Top feature importances (after OHE)
# ---------------------------
# Build human-readable feature names from the fitted preprocessor
pre = pipe.named_steps['preprocess']
feat_names = []

# OHE feature names
if categorical_features:
    ohe = pre.named_transformers_['cat']
    ohe_names = list(ohe.get_feature_names_out(categorical_features))
    feat_names.extend(ohe_names)

# Numeric passthrough names
feat_names.extend(numeric_features)

# Match with RF importances
importances = pipe.named_steps['clf'].feature_importances_
imp_df = (pd.DataFrame({'feature': feat_names, 'importance': importances})
            .sort_values('importance', ascending=False)
            .reset_index(drop=True))

print("\nTop 20 Feature Importances:")
display(imp_df.head(20))


=== Random Forest — Test Performance ===
Accuracy       : 0.564
Precision (Mac): 0.373
Recall (Macro) : 0.371
F1 Score (Mac) : 0.372

Classification Report:
                                       precision    recall  f1-score   support

          BATTERY WITH SEXUAL CONTACT       0.41      0.43      0.42      1244
INTIMATE PARTNER - AGGRAVATED ASSAULT       0.72      0.72      0.72      3796
                      ORAL COPULATION       0.23      0.19      0.21       218
                        OTHER ASSAULT       0.52      0.51      0.52      1266
                      RAPE, ATTEMPTED       0.00      0.00      0.00        96
                       RAPE, FORCIBLE       0.36      0.37      0.37      1126

                             accuracy                           0.56      7746
                            macro avg       0.37      0.37      0.37      7746
                         weighted avg       0.56      0.56      0.56      7746


Confusion Matrix (rows=True, cols=Pred):


Pred,BATTERY WITH SEXUAL CONTACT,INTIMATE PARTNER - AGGRAVATED ASSAULT,ORAL COPULATION,OTHER ASSAULT,"RAPE, ATTEMPTED","RAPE, FORCIBLE"
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BATTERY WITH SEXUAL CONTACT,538,265,41,182,11,207
INTIMATE PARTNER - AGGRAVATED ASSAULT,299,2720,29,335,22,391
ORAL COPULATION,35,62,42,32,0,47
OTHER ASSAULT,242,286,24,649,7,58
"RAPE, ATTEMPTED",17,42,1,6,0,30
"RAPE, FORCIBLE",190,423,45,38,13,417



Fairness — subgroup accuracy by Vict Sex:
  F: 0.526
  M: 0.700
  Accuracy gap (|F - M|): 0.174

Top 20 Feature Importances:


Unnamed: 0,feature,importance
0,LAT,0.150695
1,LON,0.1484
2,Vict Age,0.147505
3,Vict Sex_F,0.03799
4,Premise_Category_Indoor,0.034324
5,is_weekend,0.032344
6,Vict Sex_M,0.030967
7,Vict Descent_H,0.023306
8,Vict Descent_B,0.020724
9,Age_Group_<18,0.020035


In [39]:
# ======================================
# FIX: Force dense output for Gradient Boosting
# ======================================
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# ---- Target & features (reuse your setup) ----
target = 'Crm Cd Desc'
features = [
    'Vict Age','Age_Group','Vict Sex','Vict Descent',
    'Premise_Category','AREA NAME',
    'occ_hour','occ_dow','is_weekend','time_bin','is_night',
    'LAT','LON'
]
features = [f for f in features if f in violent_df.columns]
X = violent_df[features].copy()
y = violent_df[target].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_features     = X.select_dtypes(include=['int64','float64']).columns.tolist()

# ---- OneHotEncoder that always returns dense ----
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # sklearn >= 1.2
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)         # sklearn < 1.2

preprocess = ColumnTransformer(
    transformers=[
        ('cat', ohe, categorical_features),
        ('num', 'passthrough', numeric_features),
    ],
    # ensure ColumnTransformer outputs dense even if any transformer is sparse
    sparse_threshold=0.0
)

# ---- Class weights -> sample_weight for imbalance ----
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
cw_map = {c: w for c, w in zip(classes, class_weights)}
sample_weight_train = y_train.map(cw_map).values

# ---- Gradient Boosting model ----
gb = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_depth=None,
    max_bins=255,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42
)

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', gb)
])

# ---- Fit (dense now) & evaluate ----
pipe.fit(X_train, y_train, clf__sample_weight=sample_weight_train)
y_pred = pipe.predict(X_test)

acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
rec  = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1   = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("=== Gradient Boosting — Test Performance ===")
print(f"Accuracy       : {acc:.3f}")
print(f"Precision (Mac): {prec:.3f}")
print(f"Recall (Macro) : {rec:.3f}")
print(f"F1 Score (Mac) : {f1:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

labels = sorted(y_test.unique().tolist())
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=pd.Index(labels, name="True"), columns=pd.Index(labels, name="Pred"))
print("\nConfusion Matrix (rows=True, cols=Pred):")
display(cm_df)


=== Gradient Boosting — Test Performance ===
Accuracy       : 0.498
Precision (Mac): 0.360
Recall (Macro) : 0.379
F1 Score (Mac) : 0.359

Classification Report:
                                       precision    recall  f1-score   support

          BATTERY WITH SEXUAL CONTACT       0.41      0.40      0.40      1244
INTIMATE PARTNER - AGGRAVATED ASSAULT       0.75      0.57      0.65      3796
                      ORAL COPULATION       0.15      0.30      0.20       218
                        OTHER ASSAULT       0.50      0.53      0.51      1266
                      RAPE, ATTEMPTED       0.01      0.06      0.02        96
                       RAPE, FORCIBLE       0.33      0.42      0.37      1126

                             accuracy                           0.50      7746
                            macro avg       0.36      0.38      0.36      7746
                         weighted avg       0.57      0.50      0.53      7746


Confusion Matrix (rows=True, cols=Pred):


Pred,BATTERY WITH SEXUAL CONTACT,INTIMATE PARTNER - AGGRAVATED ASSAULT,ORAL COPULATION,OTHER ASSAULT,"RAPE, ATTEMPTED","RAPE, FORCIBLE"
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BATTERY WITH SEXUAL CONTACT,492,163,124,160,80,225
INTIMATE PARTNER - AGGRAVATED ASSAULT,277,2147,79,446,274,573
ORAL COPULATION,21,40,65,27,12,53
OTHER ASSAULT,233,198,57,676,43,59
"RAPE, ATTEMPTED",14,30,6,8,6,32
"RAPE, FORCIBLE",153,272,106,44,81,470


In [41]:
# ======================================
# 📌 XGBOOST (with LabelEncoding for y)
# ======================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

# ---------------------------
# 0) Target & features
# ---------------------------
target = 'Crm Cd Desc'
features = [
    'Vict Age','Age_Group','Vict Sex','Vict Descent',
    'Premise_Category','AREA NAME',
    'occ_hour','occ_dow','is_weekend','time_bin','is_night',
    'LAT','LON'
]
features = [f for f in features if f in violent_df.columns]
X = violent_df[features].copy()
y = violent_df[target].copy()

# ---------------------------
# 1) Train/Test split (stratified)
# ---------------------------
X_train, X_test, y_train_str, y_test_str = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# ---------------------------
# 2) Label-encode y (required by XGBoost multi:softprob)
# ---------------------------
le = LabelEncoder()
y_train = le.fit_transform(y_train_str)   # integers 0..K-1
y_test  = le.transform(y_test_str)
classes = le.classes_                     # original class names
num_class = len(classes)

# ---------------------------
# 3) Preprocessing (keep OHE sparse; XGB handles CSR)
# ---------------------------
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_features     = X.select_dtypes(include=['int64','float64']).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
    ]
)

# ---------------------------
# 4) Class imbalance -> per-sample weights (on encoded y)
# ---------------------------
class_weights = compute_class_weight(class_weight='balanced', classes=np.arange(num_class), y=y_train)
cw_map = {i: w for i, w in enumerate(class_weights)}
sample_weight_train = np.array([cw_map[i] for i in y_train], dtype=float)

# ---------------------------
# 5) XGBoost model
# ---------------------------
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=num_class,
    eval_metric='mlogloss',
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', xgb)
])

# ---------------------------
# 6) Fit & evaluate
# ---------------------------
pipe.fit(X_train, y_train, clf__sample_weight=sample_weight_train)
y_pred_enc = pipe.predict(X_test)
y_pred_str = le.inverse_transform(y_pred_enc)

# Metrics (macro to value minority classes)
acc  = accuracy_score(y_test_str, y_pred_str)
prec = precision_score(y_test_str, y_pred_str, average='macro', zero_division=0)
rec  = recall_score(y_test_str, y_pred_str, average='macro', zero_division=0)
f1   = f1_score(y_test_str, y_pred_str, average='macro', zero_division=0)

print("=== XGBoost — Test Performance ===")
print(f"Accuracy       : {acc:.3f}")
print(f"Precision (Mac): {prec:.3f}")
print(f"Recall (Macro) : {rec:.3f}")
print(f"F1 Score (Mac) : {f1:.3f}")

print("\nClassification Report:")
print(classification_report(y_test_str, y_pred_str, zero_division=0))

# Confusion matrix with original class names
labels = list(classes)
cm = confusion_matrix(y_test_str, y_pred_str, labels=labels)
cm_df = pd.DataFrame(cm, index=pd.Index(labels, name="True"), columns=pd.Index(labels, name="Pred"))
print("\nConfusion Matrix (rows=True, cols=Pred):")
display(cm_df)

# ---------------------------
# 7) Fairness: subgroup accuracy by Vict Sex (using original labels)
# ---------------------------
if 'Vict Sex' in X_test.columns:
    print("\nFairness — subgroup accuracy by Vict Sex:")
    subgroup = {}
    for g in ['F','M']:
        mask = (X_test['Vict Sex'] == g)
        if mask.any():
            subgroup[g] = accuracy_score(y_test_str[mask], y_pred_str[mask])
            print(f"  {g}: {subgroup[g]:.3f}")
    if {'F','M'}.issubset(subgroup):
        gap = abs(subgroup['F'] - subgroup['M'])
        print(f"  Accuracy gap (|F - M|): {gap:.3f}")

# ---------------------------
# 8) Feature importances (gain) — top 20
# ---------------------------
pre = pipe.named_steps['preprocess']
feat_names = []
if categorical_features:
    ohe = pre.named_transformers_['cat']
    ohe_names = list(ohe.get_feature_names_out(categorical_features))
    feat_names.extend(ohe_names)
feat_names.extend(numeric_features)

importances = pipe.named_steps['clf'].feature_importances_
imp_df = (pd.DataFrame({'feature': feat_names, 'importance': importances})
            .sort_values('importance', ascending=False)
            .reset_index(drop=True))

print("\nTop 20 Feature Importances (XGBoost):")
display(imp_df.head(20))


=== XGBoost — Test Performance ===
Accuracy       : 0.521
Precision (Mac): 0.361
Recall (Macro) : 0.376
F1 Score (Mac) : 0.364

Classification Report:
                                       precision    recall  f1-score   support

          BATTERY WITH SEXUAL CONTACT       0.41      0.42      0.41      1244
INTIMATE PARTNER - AGGRAVATED ASSAULT       0.74      0.60      0.67      3796
                      ORAL COPULATION       0.15      0.22      0.18       218
                        OTHER ASSAULT       0.50      0.54      0.52      1266
                      RAPE, ATTEMPTED       0.01      0.03      0.02        96
                       RAPE, FORCIBLE       0.35      0.44      0.39      1126

                             accuracy                           0.52      7746
                            macro avg       0.36      0.38      0.36      7746
                         weighted avg       0.57      0.52      0.54      7746


Confusion Matrix (rows=True, cols=Pred):


Pred,BATTERY WITH SEXUAL CONTACT,INTIMATE PARTNER - AGGRAVATED ASSAULT,ORAL COPULATION,OTHER ASSAULT,"RAPE, ATTEMPTED","RAPE, FORCIBLE"
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BATTERY WITH SEXUAL CONTACT,521,197,77,182,43,224
INTIMATE PARTNER - AGGRAVATED ASSAULT,318,2288,73,412,152,553
ORAL COPULATION,28,44,49,34,8,55
OTHER ASSAULT,225,228,43,687,30,53
"RAPE, ATTEMPTED",15,31,4,11,3,32
"RAPE, FORCIBLE",169,291,74,45,56,491



Fairness — subgroup accuracy by Vict Sex:
  F: 0.482
  M: 0.664
  Accuracy gap (|F - M|): 0.182

Top 20 Feature Importances (XGBoost):


Unnamed: 0,feature,importance
0,Vict Sex_F,0.088462
1,Vict Sex_M,0.058671
2,Age_Group_<18,0.039677
3,Premise_Category_Indoor,0.038197
4,Vict Descent_Unknown,0.028887
5,time_bin_Night,0.023944
6,Premise_Category_Institution,0.021557
7,AREA NAME_Central,0.021475
8,Vict Descent_A,0.021135
9,Vict Age,0.020853


In [42]:
# --- Prior-incident concentration per AREA NAME ---
df = violent_df.copy()
df = df.sort_values(['AREA NAME', 'datetime_occurrence']).set_index('datetime_occurrence')

def prior_counts(group, window):
    # time-based rolling count within window; exclude current row (-1)
    return group['DR_NO'].rolling(window=window).count().sub(1).clip(lower=0).astype('Int64')

# Compute per area
features_prior = []
for win in ['30D','90D','180D']:
    name = f'prior_{win.lower()}_area'
    df[name] = (df.groupby('AREA NAME', group_keys=False)
                  .apply(lambda g: prior_counts(g, win)))
    features_prior.append(name)

# bring back to original index order
df = df.reset_index()

# attach back to violent_df
violent_df[features_prior] = df[features_prior]

print("Added prior features:", features_prior)
violent_df[features_prior].describe()


Added prior features: ['prior_30d_area', 'prior_90d_area', 'prior_180d_area']


  .apply(lambda g: prior_counts(g, win)))
  .apply(lambda g: prior_counts(g, win)))
  .apply(lambda g: prior_counts(g, win)))


Unnamed: 0,prior_30d_area,prior_90d_area,prior_180d_area
count,25819.0,25819.0,25819.0
mean,26.377745,77.224525,149.813045
std,11.569928,32.395294,64.777851
min,0.0,0.0,0.0
25%,18.0,54.0,106.0
50%,24.0,71.0,137.0
75%,33.0,95.0,187.0
max,73.0,169.0,310.0


In [43]:
# Add the new features to your set
extra = [c for c in ['prior_30d_area','prior_90d_area','prior_180d_area'] if c in violent_df.columns]
features = [
    'Vict Age','Age_Group','Vict Sex','Vict Descent',
    'Premise_Category','AREA NAME',
    'occ_hour','occ_dow','is_weekend','time_bin','is_night',
    'LAT','LON'
] + extra

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

X = violent_df[features].copy()
y = violent_df['Crm Cd Desc'].copy()

X_train, X_test, y_train_str, y_test_str = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Label-encode target
le = LabelEncoder()
y_train = le.fit_transform(y_train_str)
y_test  = le.transform(y_test_str)
classes = le.classes_
num_class = len(classes)

# One-hot for categoricals; passthrough numerics (XGB handles sparse)
cat = X.select_dtypes(include=['object','category']).columns.tolist()
num = X.select_dtypes(include=['int64','float64','Int64']).columns.tolist()

preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat),
    ('num', 'passthrough', num),
])

# Class weights for imbalance -> per-sample weights
cls_w = compute_class_weight('balanced', classes=np.arange(num_class), y=y_train)
w_map = {i:w for i,w in enumerate(cls_w)}
sw_train = np.array([w_map[i] for i in y_train], dtype=float)

xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=num_class,
    eval_metric='mlogloss',
    n_estimators=450,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline([
    ('preprocess', preprocess),
    ('clf', xgb)
])

pipe.fit(X_train, y_train, clf__sample_weight=sw_train)
y_pred_enc = pipe.predict(X_test)
y_pred_str = le.inverse_transform(y_pred_enc)

acc  = accuracy_score(y_test_str, y_pred_str)
prec = precision_score(y_test_str, y_pred_str, average='macro', zero_division=0)
rec  = recall_score(y_test_str, y_pred_str, average='macro', zero_division=0)
f1   = f1_score(y_test_str, y_pred_str, average='macro', zero_division=0)

print("=== XGBoost (+ prior-incident features) — Test ===")
print(f"Accuracy       : {acc:.3f}")
print(f"Precision(Mac) : {prec:.3f}")
print(f"Recall (Mac)   : {rec:.3f}")
print(f"F1 Score (Mac) : {f1:.3f}")

print("\nClassification Report:")
print(classification_report(y_test_str, y_pred_str, zero_division=0))

labels = list(classes)
cm = confusion_matrix(y_test_str, y_pred_str, labels=labels)
display(pd.DataFrame(cm, index=pd.Index(labels, name="True"), columns=pd.Index(labels, name="Pred")))


=== XGBoost (+ prior-incident features) — Test ===
Accuracy       : 0.533
Precision(Mac) : 0.365
Recall (Mac)   : 0.374
F1 Score (Mac) : 0.367

Classification Report:
                                       precision    recall  f1-score   support

          BATTERY WITH SEXUAL CONTACT       0.40      0.43      0.41      1244
INTIMATE PARTNER - AGGRAVATED ASSAULT       0.74      0.63      0.68      3796
                      ORAL COPULATION       0.22      0.21      0.21       218
                        OTHER ASSAULT       0.49      0.55      0.52      1266
                      RAPE, ATTEMPTED       0.01      0.01      0.01        96
                       RAPE, FORCIBLE       0.33      0.43      0.37      1126

                             accuracy                           0.53      7746
                            macro avg       0.37      0.37      0.37      7746
                         weighted avg       0.56      0.53      0.54      7746



Pred,BATTERY WITH SEXUAL CONTACT,INTIMATE PARTNER - AGGRAVATED ASSAULT,ORAL COPULATION,OTHER ASSAULT,"RAPE, ATTEMPTED","RAPE, FORCIBLE"
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BATTERY WITH SEXUAL CONTACT,532,201,53,198,19,241
INTIMATE PARTNER - AGGRAVATED ASSAULT,308,2378,34,419,75,582
ORAL COPULATION,30,44,45,36,5,58
OTHER ASSAULT,241,233,24,690,13,65
"RAPE, ATTEMPTED",16,33,2,10,1,34
"RAPE, FORCIBLE",205,310,46,51,31,483
