In [63]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression

In [27]:
# run after your split (or on df_raw) to see if column exists
print("Crime_Class in train_validate_df:", "Crime_Class" in train_validate_df.columns)
print("Crime_Class in analysis_df:", "Crime_Class" in analysis_df.columns)

Crime_Class in train_validate_df: False
Crime_Class in analysis_df: False


In [64]:
# Ensure mapping and classifier exist (use your mapping)
mapping_rules = {
    "Violent Crime": ["ASSAULT", "BATTERY", "HOMICIDE", "MANSLAUGHTER", "RAPE",
                      "SEXUAL", "SODOMY", "ORAL COPULATION", "KIDNAPPING",
                      "LYNCHING", "STALKING", "THREATS", "INTIMATE PARTNER"],
    "Property Crime": ["THEFT", "BURGLARY", "VANDALISM", "ARSON", "SHOPLIFTING", "BIKE - STOLEN", "COIN MACHINE"],
    "Vehicle Crime": ["VEHICLE", "DRIVING WITHOUT OWNER CONSENT", "DWOC"],
    "Fraud / Financial Crime": ["FRAUD", "EMBEZZLEMENT", "COUNTERFEIT", "BUNCO", "CREDIT CARD", "DOCUMENT WORTHLESS", "INSURANCE"],
    "Weapons / Public Safety": ["FIREARM", "WEAPON", "SHOTS FIRED", "BOMB", "BRANDISH"],
    "Sex Crime": ["LEWD", "INDECENT EXPOSURE", "CHILD PORNOGRAPHY", "PANDERING", "PIMPING", "HUMAN TRAFFICKING"],
    "Child-Related Crime": ["CHILD", "CONTRIBUTING", "CHILD NEGLECT"],
    "Court / Restraining Order / Legal": ["COURT", "RESTRAINING", "CONTEMPT", "FAILURE TO APPEAR", "VIOLATION"],
    "Public Disturbance / Disorder": ["DISTURBANCE", "PEACE", "TRESPASS", "DISRUPT", "RIOT", "DISOBEY"],
    "Other Crime": []
}

def classify(desc):
    if not isinstance(desc, str):
        return "Other Crime"
    d = desc.upper()
    for category, keywords in mapping_rules.items():
        for kw in keywords:
            if kw in d:
                return category
    return "Other Crime"

# If df_raw not in memory, load it (uncomment if needed)
# df_raw = pd.read_csv("Crime_Data_from_2020_to_Present.csv", low_memory=False)

# Create Crime_Class (idempotent)
if "Crime_Class" not in df_raw.columns:
    df_raw["Crime_Class"] = df_raw["Crm Cd Desc"].apply(classify)
else:
    # If column exists but you want to re-create it with the mapping, uncomment:
    # df_raw["Crime_Class"] = df_raw["Crm Cd Desc"].apply(classify)
    pass

# Convert date and get YEAR (idempotent)
df_raw["DATE OCC"] = pd.to_datetime(df_raw["DATE OCC"], errors="coerce")
df_raw["YEAR"] = df_raw["DATE OCC"].dt.year

# Filter to 2024 & 2025
df_24_25 = df_raw[df_raw["YEAR"].isin([2024, 2025])].copy()
print("Total rows for 2024 & 2025:", len(df_24_25))

# Overall Crime_Class counts and percentages
counts = df_24_25["Crime_Class"].value_counts(dropna=False)
pct = df_24_25["Crime_Class"].value_counts(normalize=True, dropna=False) * 100
summary = pd.concat([counts, pct.round(2)], axis=1)
summary.columns = ["count", "percent"]
print("\nCrime_Class distribution (2024 & 2025):")
print(summary)

# Counts by year
print("\nCrime_Class counts by YEAR (2024 vs 2025):")
print(pd.crosstab(df_24_25["YEAR"], df_24_25["Crime_Class"]))


Total rows for 2024 & 2025: 52834

Crime_Class distribution (2024 & 2025):
                                   count  percent
Crime_Class                                      
Property Crime                     29901    56.59
Vehicle Crime                       9358    17.71
Violent Crime                       6686    12.65
Other Crime                         2503     4.74
Public Disturbance / Disorder       2369     4.48
Sex Crime                            709     1.34
Court / Restraining Order / Legal    597     1.13
Weapons / Public Safety              593     1.12
Child-Related Crime                   97     0.18
Fraud / Financial Crime               21     0.04

Crime_Class counts by YEAR (2024 vs 2025):
Crime_Class  Child-Related Crime  Court / Restraining Order / Legal  \
YEAR                                                                  
2024.0                        93                                595   
2025.0                         4                                  2 

In [30]:
# ----------------------------
# 1. Load dataset
# ----------------------------
df_raw = pd.read_csv("Crime_Data_from_2020_to_Present.csv")

# ----------------------------
# 2. Convert dates
# ----------------------------
df_raw["DATE OCC"] = pd.to_datetime(df_raw["DATE OCC"], errors="coerce")
df_raw["YEAR"] = df_raw["DATE OCC"].dt.year

# ----------------------------
# 3. Filter only year 2024 and 2025
# ----------------------------
df = df_raw[df_raw["YEAR"].isin([2024, 2025])].copy()

# Remove YEAR column
df = df.drop(columns=["YEAR"])

print("Total rows (2024 & 2025):", len(df))

# ----------------------------
# 4. Manually split 70% / 30%
# ----------------------------
df = df.sample(frac=1, random_state=42)     # shuffle
split_point = int(len(df) * 0.7)

train_validate_df = df.iloc[:split_point]
analysis_df = df.iloc[split_point:]

# ----------------------------
# 5. Show results
# ----------------------------
print("\n===== SPLIT RESULT =====")
print(f"Train/Validate rows : {len(train_validate_df)}  ({len(train_validate_df)/len(df):.1%})")
print(f"Analysis rows       : {len(analysis_df)}  ({len(analysis_df)/len(df):.1%})")

# ----------------------------
# 6. View samples
# ----------------------------
print("\n===== TRAIN/VALIDATE SAMPLE =====")
display(train_validate_df.head())

print("\n===== ANALYSIS SAMPLE =====")
display(analysis_df.head())

Total rows (2024 & 2025): 52834

===== SPLIT RESULT =====
Train/Validate rows : 36983  (70.0%)
Analysis rows       : 15851  (30.0%)

===== TRAIN/VALIDATE SAMPLE =====


Unnamed: 0,Unnamed: 1,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
956125,242005815,2/12/2024 0:00,2024-02-12,1205,20,Olympic,2041,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,,,,900 5TH AV,,34.0555,-118.3185
924438,240705608,2/7/2024 0:00,2024-02-07,730,7,Wilshire,723,1,310,BURGLARY,...,IC,Invest Cont,310.0,998.0,,,7900 W 4TH ST,,34.0699,-118.3614
957665,241405754,2/10/2024 0:00,2024-02-10,1500,14,Pacific,1444,1,310,BURGLARY,...,AA,Adult Arrest,310.0,,,,900 HARBOR CROSS LN,,33.9876,-118.4489
884970,240804273,1/2/2024 0:00,2024-01-02,100,8,West LA,859,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,,,,8500 SATURN ST,,34.0515,-118.3762
921835,241809137,4/11/2024 0:00,2024-04-10,2200,18,Southeast,1841,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",...,IC,Invest Cont,740.0,,,,800 W 109TH ST,,33.9373,-118.2894



===== ANALYSIS SAMPLE =====


Unnamed: 0,Unnamed: 1,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
997710,240905574,02/13/2024 12:00:00 AM,2024-02-10,2000,9,Van Nuys,941,1,510,VEHICLE - STOLEN,...,IC,Invest Cont,510.0,,,,14800 OXNARD ST,,34.1794,-118.4556
901224,240805274,2/4/2024 0:00,2024-02-04,1030,8,West LA,817,1,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",...,AA,Adult Arrest,341.0,,,,10900 WEYBURN AV,,34.0626,-118.4454
974692,241204505,1/7/2024 0:00,2024-01-07,1530,12,77th Street,1256,2,888,TRESPASSING,...,AO,Adult Other,888.0,,,,7200 S FIGUEROA ST,,33.9747,-118.2827
965801,241908446,5/1/2024 0:00,2024-05-01,930,19,Mission,1956,1,320,"BURGLARY, ATTEMPTED",...,IC,Invest Cont,320.0,998.0,,,10200 BEVIS AV,,34.255,-118.4554
940086,240110014,4/6/2024 0:00,2024-04-06,1940,1,Central,143,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",...,IC,Invest Cont,740.0,,,,7TH,HILL,34.0488,-118.25


## XGBoost

In [44]:
# ============================
# 1. Remove leakage columns
# ============================
df_model = df_raw.drop(columns=[
    "Crm Cd Desc", "Crm Cd", "Premis Cd", "Premis Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4"
])

# ============================
# 3. Prepare features and labels
# ============================
X_train_validate = train_validate_df.drop(columns=["Crime_Class"]).copy()
y_train_validate, _ = pd.factorize(train_validate_df["Crime_Class"])

X_analysis = analysis_df.drop(columns=["Crime_Class"]).copy()
y_analysis, _ = pd.factorize(analysis_df["Crime_Class"])

# ============================
# 4. Handle datetime columns
# ============================
def process_datetime(df):
    for col in df.select_dtypes(include=["datetime64[ns]"]).columns:
        df[col + "_year"] = df[col].dt.year
        df[col + "_month"] = df[col].dt.month
        df[col + "_day"] = df[col].dt.day
        df[col + "_weekday"] = df[col].dt.weekday
        df[col + "_hour"] = df[col].dt.hour
        df.drop(columns=[col], inplace=True)
    return df

X_train_validate = process_datetime(X_train_validate)
X_analysis = process_datetime(X_analysis)

# ============================
# 5. Factorize object columns
# ============================
def factorize_objects(df):
    return df.apply(lambda col: pd.factorize(col)[0] if col.dtype == "object" else col)

X_train_validate = factorize_objects(X_train_validate)
X_analysis = factorize_objects(X_analysis)

# ============================
# 6. Train XGBoost model
# ============================
xgb_model_1 = XGBClassifier(
    # Example tuned params (uncomment if needed)
    # n_estimators=300,
    # learning_rate=0.1,
    # max_depth=6,
    # subsample=0.8,
    # colsample_bytree=0.8,
    # eval_metric="mlogloss",
    # random_state=42
)

xgb_model_1.fit(X_train_validate, y_train_validate)

# ============================
# 7. Predictions
# ============================
y_pred_train = xgb_model_1.predict(X_train_validate)
y_pred_analysis = xgb_model_1.predict(X_analysis)

# ============================
# 8. Evaluation reports
# ============================
report_train = pd.DataFrame.from_dict(
    classification_report(y_train_validate, y_pred_train, output_dict=True)
).transpose()

report_analysis = pd.DataFrame.from_dict(
    classification_report(y_analysis, y_pred_analysis, output_dict=True)
).transpose()

# ============================
# 9. Accuracy scores
# ============================
train_accuracy = accuracy_score(y_train_validate, y_pred_train)
analysis_accuracy = accuracy_score(y_analysis, y_pred_analysis)

print("Training/Validation Set Accuracy:", train_accuracy)
print("Analysis Set Accuracy:", analysis_accuracy)
print("------------------------------------------------------------------------------------")
print("Training/Validation Set Report")
print("------------------------------------------------------------------------------------")
print(report_train)
print("------------------------------------------------------------------------------------")
print("Analysis Set Report")
print("------------------------------------------------------------------------------------")
print(report_analysis)
print("------------------------------------------------------------------------------------")

Training/Validation Set Accuracy: 0.952762079874537
Analysis Set Accuracy: 0.5739487690184378
------------------------------------------------------------------------------------
Training/Validation Set Report
------------------------------------------------------------------------------------
              precision    recall  f1-score       support
0              0.968524  0.954489  0.961455  18633.000000
1              0.953857  0.995721  0.974340   9114.000000
2              0.962254  0.765651  0.852768   2364.000000
3              0.868999  0.976270  0.919517   4172.000000
4              0.988473  0.982808  0.985632    698.000000
5              1.000000  0.862069  0.925926    435.000000
6              0.971951  0.926744  0.948810    860.000000
7              0.971487  0.868852  0.917308    549.000000
8              1.000000  1.000000  1.000000     37.000000
9              1.000000  1.000000  1.000000    121.000000
accuracy       0.952762  0.952762  0.952762      0.952762
macro avg

## ANN

In [62]:
# ============================
# 1. Remove leakage columns
# ============================
df_model = df.drop(columns=[
    "Crm Cd Desc", "Crm Cd", "Premis Cd", "Premis Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4"
])

# ============================
# 2. Prepare training data
# ============================
X = df_model.drop(columns=["Crime_Class"])
y, _ = pd.factorize(df_model["Crime_Class"])

# Factorize object columns
X = X.apply(lambda col: pd.factorize(col)[0] if col.dtype == "object" else col)

# Handle datetime columns
for col in X.select_dtypes(include=["datetime64[ns]"]).columns:
    X[col + "_year"] = X[col].dt.year
    X[col + "_month"] = X[col].dt.month
    X[col + "_day"] = X[col].dt.day
    X[col + "_weekday"] = X[col].dt.weekday
    X[col + "_hour"] = X[col].dt.hour
    X.drop(columns=[col], inplace=True)

# Handle missing values
X = X.fillna(X.median())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ============================
# 3. Scale features
# ============================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ============================
# 4. One-hot encode labels
# ============================
num_classes = len(np.unique(y))
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)

# ============================
# 5. Build ANN model
# ============================
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# ============================
# 6. Train ANN (epochs=10)
# ============================
history = model.fit(
    X_train_scaled, y_train_cat,
    epochs=50,             
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# ============================
# 7. Predictions
# ============================
y_pred_train = model.predict(X_train_scaled).argmax(axis=1)
y_pred_test = model.predict(X_test_scaled).argmax(axis=1)

# ============================
# 8. Evaluation
# ============================
report_train = pd.DataFrame.from_dict(
    classification_report(y_train, y_pred_train, output_dict=True)
).transpose()

report_test = pd.DataFrame.from_dict(
    classification_report(y_test, y_pred_test, output_dict=True)
).transpose()

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Set Accuracy:", train_accuracy)
print("Testing Set Accuracy:", test_accuracy)
print("------------------------------------------------------------------------------------")
print("Training Set Report")
print("------------------------------------------------------------------------------------")
print(report_train)
print("------------------------------------------------------------------------------------")
print("Testing Set Report")
print("------------------------------------------------------------------------------------")
print(report_test)
print("------------------------------------------------------------------------------------")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.7389 - loss: 0.8154 - val_accuracy: 0.7853 - val_loss: 0.6579
Epoch 2/50
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7814 - loss: 0.6676 - val_accuracy: 0.7967 - val_loss: 0.6266
Epoch 3/50
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7991 - loss: 0.6133 - val_accuracy: 0.8149 - val_loss: 0.5533
Epoch 4/50
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8163 - loss: 0.5530 - val_accuracy: 0.8222 - val_loss: 0.5119
Epoch 5/50
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8243 - loss: 0.5266 - val_accuracy: 0.8257 - val_loss: 0.5012
Epoch 6/50
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8255 - loss: 0.5150 - val_accuracy: 0.8280 - val_loss: 0.4989
Epoch 7/50
[1m925/925[0m [32m━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [65]:
# ============================
# 1. Remove leakage columns
# ============================
df_model = df.drop(columns=[
    "Crm Cd Desc", "Crm Cd", "Premis Cd", "Premis Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4"
])

# ============================
# 2. Prepare training data
# ============================
X = df_model.drop(columns=["Crime_Class"])
y, class_names = pd.factorize(df_model["Crime_Class"])

# Factorize object columns
X = X.apply(lambda col: pd.factorize(col)[0] if col.dtype == "object" else col)

# Handle datetime columns
for col in X.select_dtypes(include=["datetime64[ns]"]).columns:
    X[col + "_year"] = X[col].dt.year
    X[col + "_month"] = X[col].dt.month
    X[col + "_day"] = X[col].dt.day
    X[col + "_weekday"] = X[col].dt.weekday
    X[col + "_hour"] = X[col].dt.hour
    X.drop(columns=[col], inplace=True)

# Handle missing values
X = X.fillna(X.median())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ============================
# 3. Scale features
# ============================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ============================
# 4. Build Logistic Regression model
# ============================
log_reg = LogisticRegression(
    max_iter=1000,       # increase iterations for convergence
    multi_class='multinomial',
    solver='lbfgs',
    random_state=42
)

# ============================
# 5. Train
# ============================
log_reg.fit(X_train_scaled, y_train)

# ============================
# 6. Predictions
# ============================
y_pred_train = log_reg.predict(X_train_scaled)
y_pred_test = log_reg.predict(X_test_scaled)

# ============================
# 7. Evaluation
# ============================
report_train = pd.DataFrame.from_dict(
    classification_report(y_train, y_pred_train, output_dict=True, target_names=class_names)
).transpose()

report_test = pd.DataFrame.from_dict(
    classification_report(y_test, y_pred_test, output_dict=True, target_names=class_names)
).transpose()

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Set Accuracy:", train_accuracy)
print("Testing Set Accuracy:", test_accuracy)
print("------------------------------------------------------------------------------------")
print("Training Set Report")
print("------------------------------------------------------------------------------------")
print(report_train)
print("------------------------------------------------------------------------------------")
print("Testing Set Report")
print("------------------------------------------------------------------------------------")
print(report_test)
print("------------------------------------------------------------------------------------")



Training Set Accuracy: 0.7812778844333883
Testing Set Accuracy: 0.7793830042268627
------------------------------------------------------------------------------------
Training Set Report
------------------------------------------------------------------------------------
                                   precision    recall  f1-score       support
Property Crime                      0.830438  0.866877  0.848266  20898.000000
Vehicle Crime                       0.771561  0.962227  0.856410   6592.000000
Violent Crime                       0.637872  0.669996  0.653539   4706.000000
Other Crime                         0.304636  0.052302  0.089277   1759.000000
Public Disturbance / Disorder       0.731210  0.702141  0.716381   1635.000000
Court / Restraining Order / Legal   0.418605  0.044118  0.079823    408.000000
Weapons / Public Safety             0.285714  0.058252  0.096774    412.000000
Sex Crime                           0.000000  0.000000  0.000000    485.000000
Child-Related Cr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
