In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef, log_loss
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read 'clean_df_1Mar2024.csv' (path relative to the working directory) into `data`.
data = pd.read_csv(filepath_or_buffer='clean_df_1Mar2024.csv')

In [3]:
# Parse the occurrence date so calendar features can be derived from it.
data['DATE_OCC'] = pd.to_datetime(data['DATE_OCC'], format='%Y-%m-%d')
data['Day_of_Week'] = data['DATE_OCC'].dt.dayofweek.astype('category')  # pandas: 0 = Monday ... 6 = Sunday
data['Month'] = data['DATE_OCC'].dt.month.astype('category')
data['Year'] = data['DATE_OCC'].dt.year.astype('category')

# Columns removed before encoding:
#   - 'Date_Rptd': no feature is derived from it in this cell, so it is dropped
#     as-is instead of being parsed first.
#   - 'DATE_OCC': replaced by the calendar features above.
#   - 'LOCATION': not encoded because of its high cardinality.
#   - 'Vict_Descent', 'DR_NO': dropped without encoding.
data = data.drop(columns=['Date_Rptd', 'DATE_OCC', 'LOCATION', 'Vict_Descent', 'DR_NO'])

# One-hot encode the victim sex, ethnic-origin region, area and calendar
# columns in a single pass. `drop_first=True` omits the first level of each
# variable; `sparse=True` stores the dummy columns as sparse arrays.
# The resulting column order is: untouched columns first, then the dummy
# columns in the order listed here.
categorical_to_convert = ['Vict_Sex', 'Region_Ethnic_Origin', 'AREA']
data = pd.get_dummies(
    data,
    columns=categorical_to_convert + ['Day_of_Week', 'Month', 'Year'],
    drop_first=True,
    sparse=True,
)

In [4]:
# Remove columns that are not used as model inputs.
# The previous call also passed `axis=0`, which contradicts `columns=` and is
# ignored by pandas when `columns=` is given; it is omitted here.
# NOTE(review): presumably the crime code/description/category columns are
# removed because they restate the target ('Crime_Category_Code') - confirm.
data = data.drop(columns=[
    'Crm_Cd_Desc', 'Crime_Category', 'Crm_Cd',
    'AREA_NAME', 'Rpt_Dist_No', 'LAT', 'LON',
])

In [5]:
# Display the column index of `data` as it stands after the encoding and
# column-removal cells above.
data.columns

Index(['TIME_OCC', 'Part_1-2', 'Vict_Age', 'Avg_Temp', 'Avg_Dewpoint',
       'Avg_Humidity', 'Avg_Windspeed', 'Avg_Pressure', 'Total_Precipitation',
       'Crime_Category_Code', 'Weapon_Reported', 'Vict_Sex_M', 'Vict_Sex_X',
       'Region_Ethnic_Origin_Black',
       'Region_Ethnic_Origin_Hispanic/Latin/Mexican',
       'Region_Ethnic_Origin_Other', 'Region_Ethnic_Origin_Unknown',
       'Region_Ethnic_Origin_White', 'AREA_2', 'AREA_3', 'AREA_4', 'AREA_5',
       'AREA_6', 'AREA_7', 'AREA_8', 'AREA_9', 'AREA_10', 'AREA_11', 'AREA_12',
       'AREA_13', 'AREA_14', 'AREA_15', 'AREA_16', 'AREA_17', 'AREA_18',
       'AREA_19', 'AREA_20', 'AREA_21', 'Day_of_Week_1', 'Day_of_Week_2',
       'Day_of_Week_3', 'Day_of_Week_4', 'Day_of_Week_5', 'Day_of_Week_6',
       'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Year_2011',
       'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015', 'Year_2016',
       'Yea

In [6]:
# Target vector: the crime category code.
y_lr_full = data['Crime_Category_Code']

# Feature matrix: every other column of `data`.
# NOTE(review): 'Part_1-2' remains among the features; if it is derived from
# the crime code it would leak the target - confirm.
X_lr_full = data.drop(columns=['Crime_Category_Code'])

In [7]:
# 80/20 train/test partition of the full feature set, seeded for repeatability.
(
    X_lr_full_train,
    X_lr_full_test,
    y_lr_full_train,
    y_lr_full_test,
) = train_test_split(
    X_lr_full,
    y_lr_full,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Multinomial logistic regression for the full feature set, using the SAGA
# solver with an iteration cap of 1000, a fixed seed and all available cores.
lr_full_model = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Train the full-feature model on the training partition.
# NOTE(review): no feature scaling is visible before this fit; scikit-learn
# documents that SAGA converges quickly only on similarly scaled features, and
# warnings are silenced in the import cell - confirm convergence.
lr_full_model.fit(
    X=X_lr_full_train,
    y=y_lr_full_train,
)

In [None]:
# Score the full model on the held-out partition: overall accuracy followed by
# the per-class precision / recall / F1 table.
y_pred_lr_full = lr_full_model.predict(X_lr_full_test)

accuracy = accuracy_score(y_true=y_lr_full_test, y_pred=y_pred_lr_full)
print(f"Accuracy: {accuracy:.4f}")

print(
    classification_report(
        y_true=y_lr_full_test,
        y_pred=y_pred_lr_full,
    )
)

# The confusion matrix for this model is plotted in the following cell.


In [None]:
# Confusion matrix of the full model, drawn as an annotated heatmap with
# integer cell labels (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_true=y_lr_full_test, y_pred=y_pred_lr_full)
sns.heatmap(cm, annot=True, fmt="d").set(
    title='Confusion Matrix',
    ylabel='Actual label',
    xlabel='Predicted label',
)
plt.show()

In [None]:
# Target vector: the crime category code.
y_lr_wo_victim = data['Crime_Category_Code']

# Feature matrix without the victim attributes: the sex dummies, the
# ethnic-origin dummies and the age column are removed along with the target.
X_lr_wo_victim = data.drop(
    columns=[
        'Crime_Category_Code',
        'Vict_Sex_M',
        'Vict_Sex_X',
        'Region_Ethnic_Origin_Black',
        'Region_Ethnic_Origin_Hispanic/Latin/Mexican',
        'Region_Ethnic_Origin_Other',
        'Region_Ethnic_Origin_Unknown',
        'Region_Ethnic_Origin_White',
        'Vict_Age',
    ]
)

In [None]:
# 80/20 train/test partition of the victim-free feature set, same seed as the
# other splits in this notebook.
(
    X_lr_wo_victim_train,
    X_lr_wo_victim_test,
    y_lr_wo_victim_train,
    y_lr_wo_victim_test,
) = train_test_split(
    X_lr_wo_victim,
    y_lr_wo_victim,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Multinomial logistic regression (SAGA, up to 1000 iterations, fixed seed,
# all cores) for the feature set without victim attributes.
lr_wo_victim_model = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Train the victim-free model on its training partition.
lr_wo_victim_model.fit(
    X=X_lr_wo_victim_train,
    y=y_lr_wo_victim_train,
)

In [None]:
# Score the victim-free model on its held-out partition.
y_pred_wo_victim = lr_wo_victim_model.predict(X_lr_wo_victim_test)

# Overall accuracy.
accuracy = accuracy_score(y_true=y_lr_wo_victim_test, y_pred=y_pred_wo_victim)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
print(
    classification_report(
        y_true=y_lr_wo_victim_test,
        y_pred=y_pred_wo_victim,
    )
)

# Confusion matrix as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_true=y_lr_wo_victim_test, y_pred=y_pred_wo_victim)
sns.heatmap(cm, annot=True, fmt="d").set(
    title='Confusion Matrix',
    ylabel='Actual label',
    xlabel='Predicted label',
)
plt.show()

In [None]:
# Target vector: the crime category code.
y_lr_wo_weather = data['Crime_Category_Code']

# Feature matrix without the weather measurements: temperature, dewpoint,
# humidity, windspeed, pressure and precipitation are removed along with the
# target.
X_lr_wo_weather = data.drop(
    columns=[
        'Crime_Category_Code',
        'Avg_Temp',
        'Avg_Dewpoint',
        'Avg_Humidity',
        'Avg_Windspeed',
        'Avg_Pressure',
        'Total_Precipitation',
    ]
)

In [None]:
# 80/20 train/test partition of the weather-free feature set, same seed as the
# other splits in this notebook.
(
    X_lr_wo_weather_train,
    X_lr_wo_weather_test,
    y_lr_wo_weather_train,
    y_lr_wo_weather_test,
) = train_test_split(
    X_lr_wo_weather,
    y_lr_wo_weather,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Multinomial logistic regression (SAGA, fixed seed, all cores) for the
# feature set without weather measurements.
# `max_iter` is 1000 here (it was 100) so that this model gets the same
# iteration budget as the full, without-victim and without-weather-or-victim
# models; otherwise the comparison between them is not like-for-like.
lr_wo_weather_model = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Train the weather-free model on its training partition.
lr_wo_weather_model.fit(
    X=X_lr_wo_weather_train,
    y=y_lr_wo_weather_train,
)

In [None]:
# Score the weather-free model on its held-out partition.
y_pred_wo_weather = lr_wo_weather_model.predict(X_lr_wo_weather_test)

# Overall accuracy.
accuracy = accuracy_score(y_true=y_lr_wo_weather_test, y_pred=y_pred_wo_weather)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
print(
    classification_report(
        y_true=y_lr_wo_weather_test,
        y_pred=y_pred_wo_weather,
    )
)

# Confusion matrix as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_true=y_lr_wo_weather_test, y_pred=y_pred_wo_weather)
sns.heatmap(cm, annot=True, fmt="d").set(
    title='Confusion Matrix',
    ylabel='Actual label',
    xlabel='Predicted label',
)
plt.show()

In [None]:
# Feature matrix without victim attributes and without weather measurements.
# The target label is listed once (it previously appeared twice in this list).
X_lr_wo_weathervictim = data.drop(
    columns=[
        # target
        'Crime_Category_Code',
        # victim attributes
        'Vict_Sex_M',
        'Vict_Sex_X',
        'Region_Ethnic_Origin_Black',
        'Region_Ethnic_Origin_Hispanic/Latin/Mexican',
        'Region_Ethnic_Origin_Other',
        'Region_Ethnic_Origin_Unknown',
        'Region_Ethnic_Origin_White',
        'Vict_Age',
        # weather measurements
        'Avg_Temp',
        'Avg_Dewpoint',
        'Avg_Humidity',
        'Avg_Windspeed',
        'Avg_Pressure',
        'Total_Precipitation',
    ]
)

# Target vector: the crime category code.
y_lr_wo_weathervictim = data['Crime_Category_Code']

In [None]:
# 80/20 train/test partition of the feature set without weather or victim
# columns, same seed as the other splits in this notebook.
(
    X_lr_wo_weathervictim_train,
    X_lr_wo_weathervictim_test,
    y_lr_wo_weathervictim_train,
    y_lr_wo_weathervictim_test,
) = train_test_split(
    X_lr_wo_weathervictim,
    y_lr_wo_weathervictim,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Multinomial logistic regression (SAGA, up to 1000 iterations, fixed seed,
# all cores) for the feature set without weather or victim columns.
lr_wo_weathervictim_model = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Train the model without weather or victim columns on its training partition.
lr_wo_weathervictim_model.fit(
    X=X_lr_wo_weathervictim_train,
    y=y_lr_wo_weathervictim_train,
)

In [None]:
# Score the model without weather or victim columns on its held-out partition.
y_pred_wo_weathervictim = lr_wo_weathervictim_model.predict(X_lr_wo_weathervictim_test)

# Overall accuracy.
accuracy = accuracy_score(
    y_true=y_lr_wo_weathervictim_test,
    y_pred=y_pred_wo_weathervictim,
)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
print(
    classification_report(
        y_true=y_lr_wo_weathervictim_test,
        y_pred=y_pred_wo_weathervictim,
    )
)

# Confusion matrix as an annotated heatmap with integer cell labels.
cm = confusion_matrix(
    y_true=y_lr_wo_weathervictim_test,
    y_pred=y_pred_wo_weathervictim,
)
sns.heatmap(cm, annot=True, fmt="d").set(
    title='Confusion Matrix',
    ylabel='Actual label',
    xlabel='Predicted label',
)
plt.show()

In [None]:
# Coefficient-magnitude ranking for the full model.
# `coef_` holds one row of coefficients per class for a multinomial fit, so the
# absolute values are averaged over all rows. Reading only row 0 would rank the
# features for a single class. With a single coefficient row the result is
# unchanged.
# NOTE(review): magnitudes are comparable across features only when the
# features share a scale; no scaling step is visible in these cells - confirm.
coefs = np.abs(lr_full_model.coef_).mean(axis=0)
features = np.array(X_lr_full.columns)  # feature names, in the column order used for fitting
full_importance = (
    pd.DataFrame(coefs, index=features, columns=['Importance'])
    .sort_values(by='Importance', ascending=False)
)

print(full_importance)


In [None]:
# Let pandas print up to 61 rows so the importance tables are not truncated.
pd.set_option('display.max_rows', 61)

# Coefficient-magnitude ranking for the model without weather measurements.
# `coef_` holds one row of coefficients per class for a multinomial fit, so the
# absolute values are averaged over all rows. Reading only row 0 would rank the
# features for a single class. With a single coefficient row the result is
# unchanged.
coefs = np.abs(lr_wo_weather_model.coef_).mean(axis=0)
features = np.array(X_lr_wo_weather.columns)  # feature names, in the column order used for fitting
wo_weather_importance = (
    pd.DataFrame(coefs, index=features, columns=['Importance'])
    .sort_values(by='Importance', ascending=False)
)

print(wo_weather_importance)


In [None]:
# Coefficient-magnitude ranking for the model without victim attributes.
# `coef_` holds one row of coefficients per class for a multinomial fit, so the
# absolute values are averaged over all rows. Reading only row 0 would rank the
# features for a single class. With a single coefficient row the result is
# unchanged.
coefs = np.abs(lr_wo_victim_model.coef_).mean(axis=0)
features = np.array(X_lr_wo_victim.columns)  # feature names, in the column order used for fitting
wo_victim_importance = (
    pd.DataFrame(coefs, index=features, columns=['Importance'])
    .sort_values(by='Importance', ascending=False)
)

print(wo_victim_importance)


In [None]:
# Coefficient-magnitude ranking for the model without weather or victim columns.
# `coef_` holds one row of coefficients per class for a multinomial fit, so the
# absolute values are averaged over all rows. Reading only row 0 would rank the
# features for a single class. With a single coefficient row the result is
# unchanged.
coefs = np.abs(lr_wo_weathervictim_model.coef_).mean(axis=0)
features = np.array(X_lr_wo_weathervictim.columns)  # feature names, in the column order used for fitting
wo_weathervictim_importance = (
    pd.DataFrame(coefs, index=features, columns=['Importance'])
    .sort_values(by='Importance', ascending=False)
)

print(wo_weathervictim_importance)


In [None]:
# Horizontal bar chart of the importance table for the model without weather
# or victim columns. The table is already sorted in descending order, so the
# y-axis is flipped to put the largest value at the top.
plt.figure(figsize=(10, 8))
plt.barh(
    y=wo_weathervictim_importance.index,
    width=wo_weathervictim_importance['Importance'],
)
plt.gca().set(
    xlabel='Importance',
    ylabel='Features',
    title='Feature Importances in Logistic Regression Model',
)
plt.gca().invert_yaxis()
plt.show()

In [None]:
from sklearn.calibration import CalibratedClassifierCV


def _calibrate_and_score(model, X_train, y_train, X_test, y_test, label):
    """Calibrate ``model`` and print its held-out MCC, Cohen's kappa and log loss.

    The estimator is wrapped in ``CalibratedClassifierCV`` (sigmoid method,
    5-fold CV), fitted on the training partition and scored on the test
    partition.

    Parameters
    ----------
    model : estimator
        Classifier to wrap for calibration.
    X_train, y_train
        Training features and labels used to fit the calibrated wrapper.
    X_test, y_test
        Held-out features and labels used for scoring.
    label : str
        Prefix of the printed summary line.

    Returns
    -------
    tuple
        ``(calibrated_model, y_pred, y_pred_proba, mcc, cohen_kappa, logloss)``
    """
    calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv=5)
    calibrated_model.fit(X_train, y_train)

    y_pred = calibrated_model.predict(X_test)
    y_pred_proba = calibrated_model.predict_proba(X_test)

    mcc = matthews_corrcoef(y_test, y_pred)
    cohen_kappa = cohen_kappa_score(y_test, y_pred)
    # Pass the label set explicitly so the probability columns line up with the
    # classes even when some class does not occur in `y_test`.
    logloss = log_loss(y_test, y_pred_proba, labels=calibrated_model.classes_)

    print(f"{label} - Matthews Correlation Coefficient: {mcc:.4f}, Cohen's Kappa: {cohen_kappa:.4f}, Log Loss: {logloss:.4f}")
    return calibrated_model, y_pred, y_pred_proba, mcc, cohen_kappa, logloss


# Each call fits one calibrated model and prints its summary line.
# The `y_pred_*` names are re-bound to the calibrated predictions, and
# `mcc` / `cohen_kappa` / `logloss` end up holding the values of the last
# model scored, as before.

# Full model
(
    calibrated_lr_full_model,
    y_pred_lr_full,
    y_pred_proba,
    mcc,
    cohen_kappa,
    logloss,
) = _calibrate_and_score(
    lr_full_model,
    X_lr_full_train, y_lr_full_train,
    X_lr_full_test, y_lr_full_test,
    "Full Model",
)

# Model without weather
(
    calibrated_lr_wo_weather_model,
    y_pred_wo_weather,
    y_pred_proba_wo_weather,
    mcc,
    cohen_kappa,
    logloss,
) = _calibrate_and_score(
    lr_wo_weather_model,
    X_lr_wo_weather_train, y_lr_wo_weather_train,
    X_lr_wo_weather_test, y_lr_wo_weather_test,
    "Without Weather",
)

# Model without victim
(
    calibrated_lr_wo_victim_model,
    y_pred_wo_victim,
    y_pred_proba_wo_victim,
    mcc,
    cohen_kappa,
    logloss,
) = _calibrate_and_score(
    lr_wo_victim_model,
    X_lr_wo_victim_train, y_lr_wo_victim_train,
    X_lr_wo_victim_test, y_lr_wo_victim_test,
    "Without Victim",
)

# Model without weather or victim
(
    calibrated_lr_wo_weathervictim_model,
    y_pred_wo_weathervictim,
    y_pred_proba_wo_weathervictim,
    mcc,
    cohen_kappa,
    logloss,
) = _calibrate_and_score(
    lr_wo_weathervictim_model,
    X_lr_wo_weathervictim_train, y_lr_wo_weathervictim_train,
    X_lr_wo_weathervictim_test, y_lr_wo_weathervictim_test,
    "Without Weather or Victim",
)