Luis Manuel Gallegos Pérez A01659884

Damián Calderón Capallera A01661093

Daniela Martínez Xolalpa A01657901

Fernando Vázquez Rivera A01658933

José de Jesús Rodríguez Rocha A01664806



In [7]:
# Core libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

# Sklearn processing
from sklearn.feature_extraction.text import TfidfVectorizer

# Sklearn classification algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Sklearn classification model evaluation metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Matplotlib and seaborn for charting
import matplotlib.pyplot as plt # to plot
import seaborn as sns # to plot

In [2]:
# Custom analysis modules
import sys
sys.path.append('../models')
from analysis_modules import *

In [3]:
# Load data
# Resolve the CSV location portably instead of relying on one machine's layout:
#   1. an explicit override through the CRIME_DATA_FILE environment variable,
#   2. the repository-relative location (assumes the notebook's working directory
#      is a sibling of `data/`, the same way `../models` is used for the custom
#      modules -- TODO confirm the project layout),
#   3. the original absolute path, kept as a last-resort fallback.
import os
from pathlib import Path

_LEGACY_DATA_FILE = "C:/Users/dark_/OneDrive/Documentos/01_Escuela/09_Septimo semestre/llamenadios/data/processed/cleaned_crime_data.csv"
_RELATIVE_DATA_FILE = Path("..") / "data" / "processed" / "cleaned_crime_data.csv"

data_file = os.environ.get("CRIME_DATA_FILE")
if not data_file:
    if _RELATIVE_DATA_FILE.exists():
        data_file = str(_RELATIVE_DATA_FILE)
    else:
        data_file = _LEGACY_DATA_FILE

# The first row of the CSV holds the column names.
datos = pd.read_csv(data_file, header=0)

In [4]:
# Summarize the freshly loaded frame. describeData is not defined in this notebook;
# it arrives through the star import from analysis_modules, so its exact output
# (presumably a preview/summary of the data) should be confirmed in that module.
describeData(datos)

   anio_inicio  mes_inicio fecha_inicio hora_inicio  anio_hecho  mes_hecho  \
0         2016           1   2016-01-01    00:00:00        2015         12   
1         2016           1   2016-01-01    00:00:00        2015         12   
2         2016           1   2016-01-01    00:00:00        2016          1   
3         2016           1   2016-01-01    00:00:00        2015         12   
4         2016           1   2016-01-01    00:00:00        2015         12   

  fecha_hecho hora_hecho                                             delito  \
0  2015-12-31   16:30:00  LESIONES CULPOSAS POR TRANSITO VEHICULAR EN CO...   
1  2015-12-31   22:40:00      ROBO A PASAJERO A BORDO DE TAXI CON VIOLENCIA   
2  2016-01-01   00:20:00     ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA   
3  2015-12-31   22:00:00  ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...   
4  2015-12-31   22:30:00                   HOMICIDIOS INTENCIONALES (OTROS)   

                                    categoria_delito  ..

### 2.2 Preprocessing

In [5]:
# --------------------
def preprocesar(dataSet=None):
    """
    Preprocess crime data.

    Steps, in order:
      1. Drop every row that has a null in ANY column of the input frame.
      2. Keep only the modelling columns: 'hour_hecho', 'violence_type',
         'alcaldia_hecho', 'colonia_hecho' and 'crime_classification'.
      3. One-hot encode 'crime_classification' into 'crime_<class>' columns.
      4. Convert boolean columns to integers (0 or 1).

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Raw crime data containing at least the five columns listed above.
        The frame is NOT modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The four feature columns followed by the one-hot 'crime_*' columns.

    Raises
    ------
    TypeError
        If `dataSet` is not a pandas DataFrame (including when it is omitted).
    KeyError
        If any of the required columns is missing (raised by pandas).
    """
    if not isinstance(dataSet, pd.DataFrame):
        raise TypeError(
            f"preprocesar expects a pandas DataFrame, got {type(dataSet).__name__}"
        )

    # Eliminate null values on a new frame so the caller's data is left untouched.
    # Rows are dropped based on ALL input columns (before the column selection),
    # so the surviving row set matches the previous in-place behaviour.
    cleaned = dataSet.dropna()

    # Keep only relevant columns
    columnas = ['hour_hecho', 'violence_type', 'alcaldia_hecho', 'colonia_hecho', 'crime_classification']
    selected = cleaned[columnas]

    # Perform one-hot encoding on 'crime_classification'
    encoded = pd.get_dummies(selected, columns=['crime_classification'], prefix='crime')

    # Convert boolean columns to binary (0 or 1). Depending on the pandas version,
    # get_dummies may already return integer indicators, in which case nothing changes.
    bool_cols = encoded.select_dtypes(include='bool').columns
    return encoded.astype({col: int for col in bool_cols})

In [6]:
# Run the preprocessing step on the loaded frame and preview the first rows:
# the four selected feature columns plus one 'crime_*' indicator column per class.
preprocessed_data = preprocesar(datos)
display(preprocessed_data.head())

Unnamed: 0,hour_hecho,violence_type,alcaldia_hecho,colonia_hecho,crime_Family,crime_Freedom and Sexual Segurity,crime_Life and Integrity,crime_Others,crime_Patrimony,crime_Personal Freedom,crime_Society
0,16,Violent,TLALPAN,JARDINES EN LA MONTAÑA,0,0,1,0,0,0,0
1,22,Violent,TLALPAN,LOMAS DE PADIERNA,0,0,0,0,1,0,0
2,0,Violent,IZTAPALAPA,SAN ANTONIO CULHUACAN,0,0,0,0,1,0,0
3,22,Violent,GUSTAVO A. MADERO,SAN JUAN DE ARAGON II SECCION,0,0,0,0,1,0,0
4,22,Violent,BENITO JUAREZ,NATIVITAS,0,0,1,0,0,0,0


### 2.3 Split Data

In [8]:
# Pull out the one-hot label block: every column whose name carries the 'crime_' prefix
crime_columns = list(filter(lambda name: name.startswith('crime_'), preprocessed_data.columns))
crime_data = preprocessed_data.loc[:, crime_columns]
display(crime_data.head())

Unnamed: 0,crime_Family,crime_Freedom and Sexual Segurity,crime_Life and Integrity,crime_Others,crime_Patrimony,crime_Personal Freedom,crime_Society
0,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0


In [9]:
def splitDataSet(dataSet, test_size=0.25, randSplit=True, stratify=None, n_labels=7):
    """
    Split a frame into train/test features and labels.

    The frame must be laid out with the feature columns first and the one-hot
    encoded label columns last; the trailing `n_labels` columns are the labels.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Features followed by the one-hot label columns.
    test_size : float or int, default 0.25
        Forwarded to sklearn's train_test_split.
    randSplit : bool, default True
        True fixes random_state to 42 (reproducible split). False passes
        random_state=None: the rows are still shuffled, just not reproducibly.
    stratify : array-like or None, default None
        Forwarded to train_test_split.
    n_labels : int, default 7
        Number of trailing columns that hold the labels.

    Returns
    -------
    tuple
        (trainSet, testSet, trainLabels, testLabels)

    Raises
    ------
    ValueError
        If `n_labels` does not leave at least one feature and one label column.
    """
    n_columns = dataSet.shape[1]
    if not 0 < n_labels < n_columns:
        raise ValueError(
            f"n_labels must be between 1 and {n_columns - 1} for a frame with "
            f"{n_columns} columns, got {n_labels}"
        )

    # The trailing n_labels columns are the one-hot encoded labels
    labels = dataSet.iloc[:, -n_labels:]
    features = dataSet.iloc[:, :-n_labels]

    random_state = 42 if randSplit else None
    trainSet, testSet, trainLabels, testLabels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, stratify=stratify
    )

    return trainSet, testSet, trainLabels, testLabels

In [16]:
# ---- Compare XGBoost with and without PCA (sampled run)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from IPython.display import HTML, display

# Tunable settings for this comparison run
RANDOM_SEED = 42        # seed for sampling and the XGBoost models
TEST_SIZE = 0.25        # fraction of rows held out for testing
PCA_VARIANCE = 0.95     # fraction of variance PCA must retain
N_ESTIMATORS = 50       # boosting rounds per XGBoost model
EXPECTED_LABEL_COLS = 7 # splitDataSet treats the last 7 columns as labels by default


def _fit_and_evaluate(X_train, y_train, X_test, y_test):
    """Train one XGBoost classifier and score it on the test split.

    Returns (fitted classifier, test predictions, accuracy, classification report dict).
    """
    # `use_label_encoder` is deprecated in XGBoost and is not passed any more;
    # the labels are already contiguous integers produced by argmax.
    clf = XGBClassifier(eval_metric='mlogloss', n_estimators=N_ESTIMATORS, random_state=RANDOM_SEED)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 makes explicit that classes which are never predicted score 0
    # (the same values the default produces, without relying on suppressed warnings).
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    return clf, y_pred, acc, report


def _report_panel(title, acc, report_df):
    """Render one pipeline's accuracy and per-class report as an HTML flex panel."""
    return (
        f"<div style='flex:1'><h3>{title}</h3>"
        f"<p><b>Accuracy:</b> {acc:.5f}</p>"
        f"{report_df.to_html()}</div>"
    )


# Build feature and label columns
crime_cols = [c for c in preprocessed_data.columns if c.startswith('crime_')]
feature_cols = [c for c in preprocessed_data.columns if c not in crime_cols]

# splitDataSet slices a fixed number of trailing columns as labels; fail loudly if the
# data does not produce exactly that many classes, instead of silently treating a
# feature column as a label.
if len(crime_cols) != EXPECTED_LABEL_COLS:
    raise ValueError(
        f"Expected {EXPECTED_LABEL_COLS} 'crime_' label columns, found {len(crime_cols)}: {crime_cols}"
    )

# Create DataFrame with features first, labels last
df_for_split = pd.concat([
    preprocessed_data[feature_cols].reset_index(drop=True),
    preprocessed_data[crime_cols].reset_index(drop=True)
], axis=1)

# Sample to keep runtime reasonable (set to None to use all rows)
sample_n = 50000
if sample_n is not None and len(df_for_split) > sample_n:
    df_for_split = df_for_split.sample(n=sample_n, random_state=RANDOM_SEED).reset_index(drop=True)

# Now split (splitDataSet expects labels as the last n columns)
trainX, testX, trainY, testY = splitDataSet(df_for_split, test_size=TEST_SIZE, randSplit=True, stratify=None)

# Work on copies
trainX_copy = trainX.copy()
testX_copy = testX.copy()
trainY_copy = trainY.copy()
testY_copy = testY.copy()

print(f"trainX shape: {trainX_copy.shape}, testX shape: {testX_copy.shape}")

# Convert one-hot labels to single-class integer labels (argmax)
if trainY_copy.shape[1] > 1:
    y_train = trainY_copy.values.argmax(axis=1)
    y_test = testY_copy.values.argmax(axis=1)
else:
    y_train = trainY_copy.values.ravel()
    y_test = testY_copy.values.ravel()

# Encode categorical features by applying get_dummies on the combined set to ensure consistent columns
combined = pd.concat([trainX_copy.reset_index(drop=True), testX_copy.reset_index(drop=True)], axis=0)
combined_ohe = pd.get_dummies(combined, drop_first=True)
train_len = len(trainX_copy)
trainX_ohe = combined_ohe.iloc[:train_len, :].reset_index(drop=True)
testX_ohe = combined_ohe.iloc[train_len:, :].reset_index(drop=True)

print(f"After encoding: train {trainX_ohe.shape}, test {testX_ohe.shape}")

# Scale features (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(trainX_ohe)
X_test_scaled = scaler.transform(testX_ohe)

# ------ Pipeline A: With PCA (95% var) ------
pca = PCA(n_components=PCA_VARIANCE, svd_solver='full')
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"After PCA: train {X_train_pca.shape}, test {X_test_pca.shape}, components={pca.n_components_}")

clf_pca, y_pred_pca, acc_pca, report_pca = _fit_and_evaluate(X_train_pca, y_train, X_test_pca, y_test)
rep_pca_df = pd.DataFrame(report_pca).T

# ------ Pipeline B: Without PCA (use scaled features directly) ------
clf_nopca, y_pred_nopca, acc_nopca, report_nopca = _fit_and_evaluate(X_train_scaled, y_train, X_test_scaled, y_test)
rep_nopca_df = pd.DataFrame(report_nopca).T

# Prepare side-by-side HTML tables
summary_html = (
    "<div style='display:flex;gap:30px'>"
    + _report_panel(f"With PCA (n_components={pca.n_components_})", acc_pca, rep_pca_df)
    + _report_panel("Without PCA (scaled features)", acc_nopca, rep_nopca_df)
    + "</div>"
)

display(HTML(summary_html))

# Also show a compact table comparing accuracies
compare_df = pd.DataFrame({
    'Pipeline': ['With PCA', 'Without PCA'],
    'Accuracy': [acc_pca, acc_nopca],
    'n_features_after': [X_train_pca.shape[1], X_train_scaled.shape[1]]
})

display(compare_df.style.set_table_attributes("style='display:inline-block;margin-right:30px'"))


trainX shape: (37500, 4), testX shape: (12500, 4)
After encoding: train (37500, 1526), test (12500, 1526)
After encoding: train (37500, 1526), test (12500, 1526)
After PCA: train (37500, 1409), test (12500, 1409), components=1409
After PCA: train (37500, 1409), test (12500, 1409), components=1409


Unnamed: 0,precision,recall,f1-score,support
0,0.407615,0.229798,0.293904,1584.0
1,0.064516,0.005602,0.010309,357.0
2,0.186916,0.026178,0.045924,764.0
3,0.464357,0.269008,0.340665,2591.0
4,0.600161,0.835429,0.698517,7158.0
5,0.0,0.0,0.0,40.0
6,0.0,0.0,0.0,6.0
accuracy,0.56504,0.56504,0.56504,0.56504
macro avg,0.246223,0.195145,0.198474,12500.0
weighted avg,0.504848,0.56504,0.510956,12500.0

Unnamed: 0,precision,recall,f1-score,support
0,0.429799,0.094697,0.155199,1584.0
1,0.0,0.0,0.0,357.0
2,0.3125,0.006545,0.012821,764.0
3,0.518881,0.143188,0.22444,2591.0
4,0.586741,0.936016,0.721322,7158.0
5,0.0,0.0,0.0,40.0
6,0.0,0.0,0.0,6.0
accuracy,0.57808,0.57808,0.57808,0.57808
macro avg,0.263989,0.168635,0.159112,12500.0
weighted avg,0.517109,0.57808,0.48003,12500.0


Unnamed: 0,Pipeline,Accuracy,n_features_after
0,With PCA,0.56504,1409
1,Without PCA,0.57808,1526


## 3. ML methods