## Random Forest (ChiaHui)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Part A - Model Variety Training

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#### Import Dataframe

In [None]:
# Load the raw incident export.
# - The path is a raw string so the backslashes are never interpreted as escape
#   sequences (the resulting path is the same as before).
# - low_memory is the boolean False; the string "False" is truthy, so the
#   original call did not actually switch low-memory parsing off.
df_raw = pd.read_csv(r"C:\Datasets\Crime_Data_from_2020_to_Present.csv", low_memory=False)
df_raw

In [None]:
# Print column names, dtypes and non-null counts of the raw frame.
df_raw.info()

### Data Preparation
#### Map each crime committed to a matching criminal offense category

In [None]:
# Keyword rules used to auto-label each crime description.
# Categories are checked in the order written here, so an earlier category wins
# whenever a description contains keywords from more than one of them.
mapping_rules = {
    "Violent Crime": [
        "ASSAULT", "BATTERY", "HOMICIDE", "MANSLAUGHTER", "RAPE",
        "SEXUAL", "SODOMY", "ORAL COPULATION", "KIDNAPPING",
        "LYNCHING", "STALKING", "THREATS", "INTIMATE PARTNER",
    ],
    "Property Crime": [
        "THEFT", "BURGLARY", "VANDALISM", "ARSON", "SHOPLIFTING",
        "BIKE - STOLEN", "COIN MACHINE",
    ],
    "Vehicle Crime": ["VEHICLE", "DRIVING WITHOUT OWNER CONSENT", "DWOC"],
    "Fraud / Financial Crime": [
        "FRAUD", "EMBEZZLEMENT", "COUNTERFEIT", "BUNCO",
        "CREDIT CARD", "DOCUMENT WORTHLESS", "INSURANCE",
    ],
    "Weapons / Public Safety": [
        "FIREARM", "WEAPON", "SHOTS FIRED", "BOMB", "BRANDISH",
    ],
    "Sex Crime": [
        "LEWD", "INDECENT EXPOSURE", "CHILD PORNOGRAPHY",
        "PANDERING", "PIMPING", "HUMAN TRAFFICKING",
    ],
    "Child-Related Crime": ["CHILD", "CONTRIBUTING", "CHILD NEGLECT"],
    "Court / Restraining Order / Legal": [
        "COURT", "RESTRAINING", "CONTEMPT", "FAILURE TO APPEAR",
        "VIOLATION",
    ],
    "Public Disturbance / Disorder": [
        "DISTURBANCE", "PEACE", "TRESPASS", "DISRUPT",
        "RIOT", "DISOBEY",
    ],
    "Other Crime": [],  # fallback: has no keywords, so it never matches by itself
}


def classify(description: str):
    """Return the crime category for a free-text crime description.

    The description is upper-cased and compared against ``mapping_rules`` using
    plain substring matching; the first category (in dictionary order) with a
    matching keyword is returned.

    Parameters
    ----------
    description : str
        Crime description text. Any non-string value (e.g. NaN or None) is
        accepted and mapped to the fallback category.

    Returns
    -------
    str
        A key of ``mapping_rules``; "Other Crime" when nothing matches.
    """
    if not isinstance(description, str):
        return "Other Crime"

    upper_desc = description.upper()
    matching_categories = (
        category
        for category, keywords in mapping_rules.items()
        if any(keyword in upper_desc for keyword in keywords)
    )
    # next() stops at the first hit, mirroring an early return from nested loops.
    return next(matching_categories, "Other Crime")

# Label every incident with its coarse crime category.
crime_descriptions = df_raw["Crm Cd Desc"]
df_raw["Crime_Class"] = crime_descriptions.map(classify)

# Preview the first 30 description -> category pairs.
preview = df_raw.loc[:, ["Crm Cd Desc", "Crime_Class"]].head(30)
preview

In [None]:

# Number of incidents per derived crime category, as a two-column table.
df_raw['Crime_Class'].value_counts().reset_index(name='Count')

### Category: Machine Learning Models

#### Tree Based: Random Forest Tree (RAW)

In [None]:
print("=== Random Forest Crime Classification ===")

# Drop the crime description and the code/premise columns listed below so the
# label (which was derived from "Crm Cd Desc") is not leaked through a feature.
# NOTE(review): any remaining identifier-like or code-derived columns in df_raw
# would also leak or add noise — confirm the remaining feature set is intended.
df_model = df_raw.drop(columns=[
    "Crm Cd Desc", "Crm Cd", "Premis Cd", "Premis Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4"
])

# Prepare training data. `class_names[i]` is the category encoded as integer i.
X = df_model.drop(columns=["Crime_Class"])
y, class_names = pd.factorize(df_model["Crime_Class"])
label_ids = np.arange(len(class_names))

# Convert datetime columns to int timestamps
# (astype replaces the deprecated Series.view).
for col in X.select_dtypes(include=['datetime', 'datetimetz']).columns:
    X[col] = X[col].astype('int64')

# Factorize object columns (missing values become -1)
X = X.apply(lambda col: pd.factorize(col)[0] if col.dtype == "object" else col)

# Fill remaining numeric NaN the same way the processed pipeline does, so both
# models see the same missing-value encoding and the fit does not depend on the
# estimator's own NaN support.
X = X.fillna(-1)

# Train-test split (stratify for class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline Random Forest with default hyperparameters; only the seed (for
# reproducible results) and n_jobs (for speed) are set.
# Parameter tuning is saved for PART B.
rf_model_1 = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_model_1.fit(X_train, y_train)

# Predictions
y_pred_test_1 = rf_model_1.predict(X_test)
y_pred_train_1 = rf_model_1.predict(X_train)

# Accuracy
train_accuracy = accuracy_score(y_train, y_pred_train_1)
test_accuracy = accuracy_score(y_test, y_pred_test_1)

print("\n=== Performance ===")
print(f"Training Set Accuracy: {train_accuracy}")
print(f"Testing Set Accuracy:  {test_accuracy}")
print(f"Overfit Gap:          {train_accuracy - test_accuracy:.4f}")

# Evaluation reports, labelled with the category names instead of integer codes
report_test = pd.DataFrame.from_dict(
    classification_report(
        y_test, y_pred_test_1,
        labels=label_ids, target_names=list(class_names), output_dict=True
    )
).transpose()

report_train = pd.DataFrame.from_dict(
    classification_report(
        y_train, y_pred_train_1,
        labels=label_ids, target_names=list(class_names), output_dict=True
    )
).transpose()

print("------------------------------------------------------------------------------------")
print("Training Set Report")
print("------------------------------------------------------------------------------------")
print(report_train)
print("------------------------------------------------------------------------------------")
print("Testing Set Report")
print("------------------------------------------------------------------------------------")
print(report_test)
print("------------------------------------------------------------------------------------")

### Part A - Feature Engineering and Transformation

#### Data Cleaning (Check for duplicate)

In [None]:
# Remove rows that are exact duplicates across all columns; df_raw itself is
# left unchanged because drop_duplicates returns a new frame.
df_new = df_raw.drop_duplicates()
df_new

In [None]:
# Same description/code/premise columns that the raw model removes before training.
code_and_premise_cols = [
    "Crm Cd Desc", "Crm Cd",
    "Premis Cd", "Premis Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4",
]
df_new = df_new.drop(columns=code_and_premise_cols)
df_new

### Standardization

In [None]:
# 1. Clean DATE OCC (mixed formats); values that cannot be parsed become NaT
df_new['DATE OCC'] = pd.to_datetime(df_new['DATE OCC'], format='mixed', errors='coerce')

# 2. Clean TIME OCC (force numeric → Int64 → 4-character zero-padded HHMM text)
df_new['TIME OCC'] = pd.to_numeric(df_new['TIME OCC'], errors='coerce').astype('Int64')
time_str = df_new['TIME OCC'].astype(str).str.zfill(4)

# 3. Combine DATE OCC + TIME OCC into a single datetime.
#    The format is spelled out so "YYYY-MM-DD HHMM" is parsed deterministically
#    instead of relying on format inference for the unseparated HHMM part.
#    Rows with a missing date or time, or an impossible time, become NaT.
df_new['DateTime OCC'] = pd.to_datetime(
    df_new['DATE OCC'].dt.strftime('%Y-%m-%d') + ' ' + time_str,
    format='%Y-%m-%d %H%M',
    errors='coerce'
)

# 4. Drop the original columns used for merging
df_new = df_new.drop(columns=['DATE OCC', 'TIME OCC'])

df_new

### Check Null Value 


In [None]:
# Count of missing values per column.
df_new.isna().sum()


Remove the Weapon Used Cd column, and change the Weapon Desc column to binary.

In [None]:
# Step 1 – remove the weapon code column; errors='ignore' makes this a no-op
# when the column is already gone.
df_new = df_new.drop(columns=['Weapon Used Cd'], errors='ignore')

# Step 2 – flag rows that carry a non-blank weapon description.
weapon_text = df_new['Weapon Desc']
has_weapon = weapon_text.notna() & (weapon_text.astype(str).str.strip() != '')
df_new['Weapon_Present'] = has_weapon.map({True: 'Present', False: 'Absent'})

# Step 3 – the free-text description is no longer needed once the flag exists.
df_new = df_new.drop(columns=['Weapon Desc'], errors='ignore')

df_new

Visualise the missing values to identify columns that provide little meaningful information and can be dropped:

In [None]:
# One heatmap cell per (row, column): highlighted where the value is missing.
missing_mask = df_new.isna()

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(missing_mask, cbar=False, yticklabels=False, ax=ax)
ax.set_title("Missing Value Heatmap")
plt.show()

### Test Pearson Correlation (Numeric Features)


In [None]:
# Choose the DataFrame to analyze; use the most recent processed one if available
try:
    df_corr_source = df_new.copy()
except NameError:
    df_corr_source = df_raw.copy()

# Encode the target as integer category codes so it shows up in the matrix.
# NOTE(review): Crime_Class is nominal, so these codes have no natural order and
# Pearson correlations against them are only a rough indication.
if 'Crime_Class' in df_corr_source.columns:
    df_corr_source['Crime_Class_numeric'] = df_corr_source['Crime_Class'].astype('category').cat.codes

# Select only numeric columns
num_df = df_corr_source.select_dtypes(include=['number'])

# Pearson correlation matrix
corr = num_df.corr()

# Keep only the strict upper triangle so every feature pair appears once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
corr_report = upper.stack().reset_index()
# Assign the names positionally; this does not depend on the default
# 'level_0' / 'level_1' / 0 labels produced by reset_index.
corr_report.columns = ['Feature 1', 'Feature 2', 'Correlation']

# Sort by absolute correlation strength. Sorting with a key avoids feeding
# index labels into the positional .iloc indexer.
corr_report = corr_report.sort_values(
    'Correlation', key=lambda s: s.abs(), ascending=False
)

# Show top pairs
print("Top 25 strongest Pearson correlations (absolute):")
print(corr_report.head(25))

# Optional: heatmap for a quick visual
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title('Pearson Correlation (Numeric Features)')
plt.tight_layout()
plt.show()

In [None]:
# Select only numeric columns
num_df_new = df_new.select_dtypes(include=['number'])

# Calculate correlation matrix
corr = num_df_new.corr()

# Turn it into a pairwise report: keep the strict upper triangle so each
# feature pair is listed once, then flatten to (feature, feature, value) rows.
corr_report = (
    corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
        .stack()
        .reset_index()
)
corr_report.columns = ["Feature 1", "Feature 2", "Correlation"]

# Sort by absolute correlation strength. Sorting with a key avoids feeding
# index labels into the positional .iloc indexer.
corr_report = corr_report.sort_values(
    "Correlation", key=lambda s: s.abs(), ascending=False
)

corr_report.head(20)   # view top 20 strongest relationships

In [None]:
# Work on a copy so the encoded helper column never reaches df_new.
df_corr = df_new.copy()

# Integer category codes stand in for the categorical target.
class_codes = df_corr['Crime_Class'].astype('category').cat.codes
df_corr['Crime_Class_numeric'] = class_codes

# Numeric columns only (this includes the encoded target added above).
num_df = df_corr.select_dtypes(include=['number'])

# Column of the correlation matrix that belongs to the target, minus the
# target's correlation with itself.
full_corr = num_df.corr(numeric_only=True)
target_corr = full_corr['Crime_Class_numeric'].drop(labels=['Crime_Class_numeric'])

# Rank features by absolute correlation (the sign is discarded).
abs_corr = target_corr.abs().sort_values(ascending=False)
target_corr_report = abs_corr.rename("Correlation_with_Crime_Class").to_frame()

target_corr_report.head(20)

In [None]:
# Print column names, dtypes and non-null counts of the frame at this stage.
df_new.info()

### Mocodes

In [None]:
# --- Step 1: Clean & explode the MO Codes column ---
# Missing entries become empty strings so the string methods below apply to every row.
mocodes_text = df_new['Mocodes'].fillna('')
df_new['Mocodes'] = mocodes_text

# Whitespace-separated codes → list of codes per row (empty string → empty list)
df_new['MOCODES_LIST'] = mocodes_text.str.strip().str.split()

# One row per (incident, code) pair; every other column is repeated alongside.
exploded = df_new.explode(column='MOCODES_LIST')

In [None]:
# Collect every distinct MO code that appears in any row, in sorted order.
unique_codes = set()
for code_list in df_new['MOCODES_LIST']:
    unique_codes.update(code_list)
all_codes = sorted(unique_codes)
print(len(all_codes), "unique MO codes found")

In [None]:
# --- Step 2: Count MO code frequencies ---
# value_counts returns the codes ordered from most to least frequent.
code_column = exploded['MOCODES_LIST']
mo_counts = code_column.value_counts()

In [None]:
# --- Step 3: Select the Top 100 codes ---
# mo_counts is already sorted by frequency, so its first 100 labels are the
# most common codes; a set gives fast membership tests later on.
top_100 = set(mo_counts.index[:100])

In [None]:
# --- Step 4: Create one-hot columns for each top code ---
# Iterate in sorted order: `top_100` is a set, and the iteration order of a set
# of strings can differ between interpreter sessions. That would reorder the
# feature columns (and change the fitted model) from one run to the next.
# The indicators are collected first and attached in a single concat, because
# inserting ~100 columns one at a time fragments the DataFrame.
mo_flags = {
    f"MO_{code}": df_new['MOCODES_LIST'].apply(lambda lst, code=code: code in lst)
    for code in sorted(top_100)
}

# --- Step 5: Create the OTHERS column ---
# OTHERS = true if the row contains any MO code NOT in the top 100
mo_flags['MO_OTHERS'] = df_new['MOCODES_LIST'].apply(
    lambda lst: any(code not in top_100 for code in lst)
)

mo_features = pd.DataFrame(mo_flags, index=df_new.index)

# Dropping any existing MO_ columns first keeps the cell safe to re-run
# (re-running replaces the indicators instead of duplicating them).
df_new = pd.concat(
    [df_new.drop(columns=mo_features.columns, errors='ignore'), mo_features],
    axis=1,
)


In [None]:
# --- Step 6: Convert booleans to integers (0/1) ---
# Every indicator column created above carries the "MO_" prefix.
mo_cols = list(filter(lambda name: name.startswith("MO_"), df_new.columns))
df_new[mo_cols] = df_new.loc[:, mo_cols].astype(int)

# --- Step 7: Clean up temporary column ---
# The per-row code lists are no longer needed once the indicators exist.
df_new_1 = df_new.drop(columns=["MOCODES_LIST"])

# --- Done ---
print(f"Created {len(mo_cols)} MO Code features (100 Top + OTHERS).")
print(mo_cols[:10])

In [None]:
# Drop the original MO code text column; errors='ignore' makes this a no-op
# if the column is already gone.
df_new_1 = df_new_1.drop(columns=['Mocodes'], errors='ignore')
df_new_1

### Location-Based Columns

In [None]:
# Remove the address text columns and the date columns; errors='ignore' skips
# any of them that are not present.
location_and_date_cols = ['LOCATION', 'Cross Street', 'DateTime OCC', 'Date Rptd']
df_new_1 = df_new_1.drop(columns=location_and_date_cols, errors='ignore')
df_new_1

In [None]:
# Remove the AREA column (raises KeyError if it is missing).
df_new_1 = df_new_1.drop(columns='AREA')

In [None]:
# Remove the AREA NAME column (raises KeyError if it is missing).
df_new_1 = df_new_1.drop(columns='AREA NAME')
df_new_1

### LON & LAT

In [None]:
from sklearn.cluster import KMeans

# Cluster only the rows that have both coordinates.
# NOTE(review): the clusters are fitted on the full dataset before the
# train/test split — confirm this is acceptable for the evaluation.
coords = df_new_1[['LAT', 'LON']].dropna()

kmeans = KMeans(n_clusters=100, random_state=42)

# Attach the labels by index rather than by position: if dropna() removed any
# row, a bare array would no longer match the frame's length and the assignment
# would fail. Rows without coordinates get NaN as their cluster.
cluster_labels = pd.Series(kmeans.fit_predict(coords), index=coords.index)
df_new_1['Location_Cluster'] = cluster_labels

In [None]:
# Display the frame to inspect the new Location_Cluster column.
df_new_1

In [None]:
# Discretise each coordinate into 50 equal-width bins; labels=False stores the
# integer bin index instead of an interval label.
# NOTE(review): equal-width bins span the full min–max range, so outlier or
# placeholder coordinates would squeeze most rows into a few bins — verify.
for source_col, bin_col in (('LAT', 'Lat_bin'), ('LON', 'Lon_bin')):
    df_new_1[bin_col] = pd.cut(df_new_1[source_col], bins=50, labels=False)

In [None]:

# Display the frame to inspect the new Lat_bin / Lon_bin columns.
df_new_1

## Remodelling with Processing 

In [None]:
print("=== Random Forest Crime Classification (PROCESSED) ===")

# Work on a copy so the engineered frame df_new_1 is left untouched.
df_model_2 = df_new_1.copy()

# Prepare training data. `class_names_2[i]` is the category encoded as integer i;
# it is kept so the reports below can show names instead of codes.
X = df_model_2.drop(columns=["Crime_Class"])
y, class_names_2 = pd.factorize(df_model_2["Crime_Class"])
label_ids = np.arange(len(class_names_2))

# Convert datetime columns to int64 timestamps
# (astype replaces the deprecated Series.view).
for col in X.select_dtypes(include=["datetime", "datetimetz"]).columns:
    X[col] = X[col].astype('int64')

# Convert list columns to strings so they can be factorized.
# Only object columns can hold Python lists, so the numeric indicator columns
# are not scanned.
for col in X.select_dtypes(include="object").columns:
    if X[col].apply(lambda x: isinstance(x, list)).any():
        X[col] = X[col].astype(str)

# Factorize object columns (missing values become -1)
X = X.apply(lambda col: pd.factorize(col)[0] if col.dtype == "object" else col)

# Fill any remaining NaN values before training
X = X.fillna(-1)

# Train-test split (stratify for class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train the processed-data model; later cells refer to it as rf_model_3.
rf_model_3 = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model_3.fit(X_train, y_train)

# Predict
y_pred = rf_model_3.predict(X_test)
y_pred_train = rf_model_3.predict(X_train)

# Evaluation reports, labelled with the category names instead of integer codes
report_test = pd.DataFrame.from_dict(
    classification_report(
        y_test, y_pred,
        labels=label_ids, target_names=list(class_names_2), output_dict=True
    )
).transpose()

report_train = pd.DataFrame.from_dict(
    classification_report(
        y_train, y_pred_train,
        labels=label_ids, target_names=list(class_names_2), output_dict=True
    )
).transpose()

# Accuracy scores
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print("Training Set Accuracy:", train_accuracy)
print("Testing Set Accuracy:", test_accuracy)
print("------------------------------------------------------------------------------------")
print("Training Set Report")
print("------------------------------------------------------------------------------------")
print(report_train)
print("------------------------------------------------------------------------------------")
print("Testing Set Report")
print("------------------------------------------------------------------------------------")
print(report_test)
print("------------------------------------------------------------------------------------")

### Build a Confusion Matrix of Errors

In [None]:
# -------------------------------------------
# Confusion Matrix (held-out test split)
# -------------------------------------------
# Rows are the true classes and columns the predicted classes, so off-diagonal
# cells show which categories the processed model confuses with each other.
# The names are recovered with the same factorize call that produced `y`, so
# position i corresponds to integer label i.
cm_class_names = pd.factorize(df_model_2["Crime_Class"])[1]
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(cm_class_names)))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues",
    xticklabels=cm_class_names, yticklabels=cm_class_names, ax=ax
)
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title("Confusion Matrix (Random Forest, test set)")
plt.tight_layout()
plt.show()

# -------------------------------------------
# Feature Importance
# -------------------------------------------
# Impurity-based importances of the fitted forest, one value per column of X.
importance = rf_model_3.feature_importances_
features = X.columns

fi = pd.DataFrame({"feature": features, "importance": importance})
fi.sort_values(by="importance", ascending=False).head(20)

### Error VS Complexity Curve

In [None]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Note: X, y, X_train, X_test, y_train, y_test, and rf_model_3 are defined
# in previous cells before running this section

# ================================
# FAST STRATIFIED SUBSAMPLING (1%)
# ================================
# Use a tiny subset to make the curve fast while informative
sample_ratio = 0.01
X_small, _, y_small, _ = train_test_split(
    X, y,
    train_size=sample_ratio,
    stratify=y,
    random_state=42
)

# Split the small subset into train/test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_small, y_small,
    test_size=0.3,
    stratify=y_small,
    random_state=42
)

# ==================================
# MODEL COMPLEXITY VS ERROR (n_estimators)
# ==================================
# Vary number of trees instead of max_depth; generally faster and clearer
n_estimators_range = [10, 20, 40, 80, 120]
train_losses_curve = []
test_losses_curve = []
train_acc_curve = []
test_acc_curve = []

for n in n_estimators_range:
    model = RandomForestClassifier(
        n_estimators=n,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    )

    model.fit(X_train_s, y_train_s)

    # Probabilities (columns follow model.classes_)
    train_proba = model.predict_proba(X_train_s)
    test_proba  = model.predict_proba(X_test_s)

    # Loss. The label set is passed explicitly so the probability columns are
    # matched to the right classes even when a split does not contain every
    # class the model was trained on.
    train_losses_curve.append(log_loss(y_train_s, train_proba, labels=model.classes_))
    test_losses_curve.append(log_loss(y_test_s, test_proba, labels=model.classes_))

    # Accuracy. The predicted class is the arg-max of the probabilities already
    # computed, which avoids a second pass through the forest.
    y_train_pred = model.classes_[train_proba.argmax(axis=1)]
    y_test_pred  = model.classes_[test_proba.argmax(axis=1)]

    train_acc_curve.append(accuracy_score(y_train_s, y_train_pred))
    test_acc_curve.append(accuracy_score(y_test_s, y_test_pred))

# ================================
# PLOT
# ================================
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(11, 5))

ax_loss.plot(n_estimators_range, train_losses_curve, marker='o', label="Training Loss")
ax_loss.plot(n_estimators_range, test_losses_curve, marker='o', label="Validation Loss")
ax_loss.set_xlabel("Model Complexity (n_estimators)")
ax_loss.set_ylabel("Log Loss")
ax_loss.set_title("Complexity vs Error (Random Forest)")
ax_loss.legend()

ax_acc.plot(n_estimators_range, train_acc_curve, marker='o', label="Training Accuracy")
ax_acc.plot(n_estimators_range, test_acc_curve, marker='o', label="Validation Accuracy")
ax_acc.set_xlabel("Model Complexity (n_estimators)")
ax_acc.set_ylabel("Accuracy")
ax_acc.set_title("Complexity vs Accuracy (Random Forest)")
ax_acc.legend()

plt.tight_layout()
plt.show()

In [None]:
# Model complexity curve over max_depth, fitted on the full processed split
# (X_train / X_test). log_loss is imported in the previous cell.
# NOTE: this fits one forest per depth value on the full training data, so it
# is by far the slowest cell in the notebook.
depths = range(2, 41, 2)
train_losses_curve = []
test_losses_curve = []
train_acc_curve = []
test_acc_curve = []


for d in depths:
    # Fixed seed so the curve is reproducible and depth is the only thing that
    # varies between fits; n_jobs=-1 only affects speed, not results.
    model = RandomForestClassifier(
        max_depth=d,
        n_jobs=-1,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Loss (label set passed explicitly so probability columns match the classes)
    train_proba = model.predict_proba(X_train)
    test_proba  = model.predict_proba(X_test)
    train_losses_curve.append(log_loss(y_train, train_proba, labels=model.classes_))
    test_losses_curve.append(log_loss(y_test, test_proba, labels=model.classes_))

    # Accuracy (arg-max of the probabilities computed above)
    y_train_pred = model.classes_[train_proba.argmax(axis=1)]
    y_test_pred  = model.classes_[test_proba.argmax(axis=1)]
    train_acc_curve.append(accuracy_score(y_train, y_train_pred))
    test_acc_curve.append(accuracy_score(y_test, y_test_pred))

# The model is a Random Forest, so the titles say so; the accuracy curves that
# were collected above are plotted next to the loss curves.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(11, 5))

ax_loss.plot(depths, train_losses_curve, label="Training Loss")
ax_loss.plot(depths, test_losses_curve, label="Validation Loss")
ax_loss.set_xlabel("Model Complexity (max_depth)")
ax_loss.set_ylabel("Log Loss")
ax_loss.set_title("Model Complexity vs Error (Random Forest)")
ax_loss.legend()

ax_acc.plot(depths, train_acc_curve, label="Training Accuracy")
ax_acc.plot(depths, test_acc_curve, label="Validation Accuracy")
ax_acc.set_xlabel("Model Complexity (max_depth)")
ax_acc.set_ylabel("Accuracy")
ax_acc.set_title("Model Complexity vs Accuracy (Random Forest)")
ax_acc.legend()

plt.tight_layout()
plt.show()

### Evaluate Feature Importance (Random Forest)

In [None]:
# Pair each column of X with its impurity-based importance from the fitted
# forest, then list the 20 highest-ranked features.
features = X.columns
importance = rf_model_3.feature_importances_

fi = pd.DataFrame(dict(feature=features, importance=importance))
ranked_features = fi.sort_values(by="importance", ascending=False)
ranked_features.head(20)

### ShuffleSplit Cross-Validation 

In [None]:
from sklearn.model_selection import ShuffleSplit, cross_val_score
import numpy as np

print("=== ShuffleSplit Cross-Validation (Random Forest) ===")

# Use the same data as the processed model (X and y from the cell above)

# Create a new instance of the Random Forest Classifier with the same parameters
cv_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Configure ShuffleSplit
# n_splits: Number of re-shuffling & splitting iterations.
# test_size: Proportion of the dataset to include in the test split.
# random_state: Ensures reproducible splits.
# NOTE(review): ShuffleSplit is not stratified, unlike the train/test split used
# for rf_model_3 — confirm this difference is intended.
shuffle_split_cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

# Perform cross-validation
# This will train the model 5 times on different 70/30 splits of the data.
# The splits run one after another: the forest already uses every core
# (n_jobs=-1 on the estimator), so also parallelising the splits would
# oversubscribe the CPU and keep several working copies in memory at once.
# The scores are the same either way.
print("Starting cross-validation... (This may take some time depending on your machine)")
cv_scores = cross_val_score(cv_model, X, y, cv=shuffle_split_cv, scoring='accuracy')
print("Cross-validation finished.")

# Print the results
print("\n--- Cross-Validation Scores ---")
print("Scores for each split:", cv_scores)
print(f"Mean Accuracy: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation: {np.std(cv_scores):.4f}")