In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
hex_scheduled = _hex_json.loads("false")

In [None]:
hex_user_email = _hex_json.loads("\"example-user@example.com\"")

In [None]:
hex_run_context = _hex_json.loads("\"logic\"")

In [None]:
hex_timezone = _hex_json.loads("\"US/Eastern\"")

In [None]:
hex_project_id = _hex_json.loads("\"d4d16f54-4573-453e-a8de-ecb82a17c382\"")

In [None]:
hex_project_name = _hex_json.loads("\"FinalProjectP2\"")

In [None]:
hex_status = _hex_json.loads("\"\"")

In [None]:
hex_categories = _hex_json.loads("[]")

In [None]:
hex_color_palette = _hex_json.loads("[\"#4C78A8\",\"#F58518\",\"#E45756\",\"#72B7B2\",\"#54A24B\",\"#EECA3B\",\"#B279A2\",\"#FF9DA6\",\"#9D755D\",\"#BAB0AC\"]")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE


In [None]:
data = pd.read_csv("finalData.csv")

vehicle_crimes = ["Theft From Motor Vehicle", "Motor Vehicle Theft", "Theft of Motor Vehicle Parts from Vehicle"]

data['Crime_Type'] = data['Crime_Type'].apply(lambda x: 0 if x in vehicle_crimes else 1)

socioeconomic_features = ['TotalPopulationEstimate', 'HousingUnitsOwnerOccupied',
                          'HousingUnitsRenterOccupied', 'UnemploymentRateAge16to19',
                          'UnemploymentRateAge20to24', 'UnemploymentRateAge25to44',
                          'UnemploymentRateAge55to64', 'UnemploymentRateAge65to74',
                          'UnemploymentRateAge75AndOver']

features = data[socioeconomic_features]
target = data['Crime_Type']

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.30, random_state=42)


In [None]:
vehicle_crime_count = (data['Crime_Type'] == 0).sum()
other_crime_count = (data['Crime_Type'] == 1).sum()

print("Number of Vehicle-Related Crimes:", vehicle_crime_count)
print("Number of Other Crimes:", other_crime_count)

Number of Vehicle-Related Crimes: 45798
Number of Other Crimes: 120583


In [None]:
lr_model_unbalanced = LogisticRegression(max_iter=1000)
lr_model_unbalanced.fit(X_train, y_train)
lr_predictions_unbalanced = lr_model_unbalanced.predict(X_test)

print("Unbalanced Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions_unbalanced))
print("Unbalanced Logistic Regression Report:\n", classification_report(y_test, lr_predictions_unbalanced))

rf_model_unbalanced = RandomForestClassifier(n_estimators=100)
rf_model_unbalanced.fit(X_train, y_train)
rf_predictions_unbalanced = rf_model_unbalanced.predict(X_test)

print("Unbalanced Random Forest Accuracy:", accuracy_score(y_test, rf_predictions_unbalanced))
print("Unbalanced Random Forest Report:\n", classification_report(y_test, rf_predictions_unbalanced))

dt_model_unbalanced = DecisionTreeClassifier()
dt_model_unbalanced.fit(X_train, y_train)
dt_predictions_unbalanced = dt_model_unbalanced.predict(X_test)

print("Unbalanced Decision Tree Accuracy:", accuracy_score(y_test, dt_predictions_unbalanced))
print("Unbalanced Decision Tree Report:\n", classification_report(y_test, dt_predictions_unbalanced))

nb_model_unbalanced = GaussianNB()
nb_model_unbalanced.fit(X_train, y_train)
nb_predictions_unbalanced = nb_model_unbalanced.predict(X_test)

print("Unbalanced Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions_unbalanced))
print("Unbalanced Naive Bayes Report:\n", classification_report(y_test, nb_predictions_unbalanced))


Unbalanced Logistic Regression Accuracy: 0.723630171291195
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Unbalanced Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     13795
           1       0.72      1.00      0.84     36120

    accuracy                           0.72     49915
   macro avg       0.36      0.50      0.42     49915
weighted avg       0.52      0.72      0.61     49915

Unbalanced Random Forest Accuracy: 0.7235901031753982
Unbalanced Random Forest Report:
               precision    recall  f1-score   support

           0       0.33      0.00      0.00     13795
           1       0.72      1.00      0.84     36120

    accuracy                           0.72     49915
   macro avg       0.53      0.50      0.42     49915
weighted avg       0.62      0.72      0.61     4

In [None]:
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

print("Balanced Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))
print("Balanced Logistic Regression Report:\n", classification_report(y_test, lr_predictions))


Balanced Logistic Regression Accuracy: 0.5917459681458479
Balanced Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.31      0.37      0.34     13795
           1       0.74      0.67      0.71     36120

    accuracy                           0.59     49915
   macro avg       0.52      0.52      0.52     49915
weighted avg       0.62      0.59      0.60     49915



In [None]:
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest Report:\n", classification_report(y_test, rf_predictions))


Random Forest Accuracy: 0.5641190023039167
Random Forest Report:
               precision    recall  f1-score   support

           0       0.31      0.47      0.37     13795
           1       0.75      0.60      0.67     36120

    accuracy                           0.56     49915
   macro avg       0.53      0.53      0.52     49915
weighted avg       0.63      0.56      0.58     49915



In [None]:
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_predictions))
print("Gradient Boosting Report:\n", classification_report(y_test, gb_predictions))


Gradient Boosting Accuracy: 0.7235901031753982
Gradient Boosting Report:
               precision    recall  f1-score   support

           0       0.33      0.00      0.00     13795
           1       0.72      1.00      0.84     36120

    accuracy                           0.72     49915
   macro avg       0.53      0.50      0.42     49915
weighted avg       0.62      0.72      0.61     49915



In [None]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, dt_predictions))
print("Decision Tree Report:\n", classification_report(y_test, dt_predictions))


Decision Tree Accuracy: 0.7236101372332966
Decision Tree Report:
               precision    recall  f1-score   support

           0       0.43      0.00      0.00     13795
           1       0.72      1.00      0.84     36120

    accuracy                           0.72     49915
   macro avg       0.58      0.50      0.42     49915
weighted avg       0.64      0.72      0.61     49915



In [None]:
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_predictions))
print("XGBoost Report:\n", classification_report(y_test, xgb_predictions))


XGBoost Accuracy: 0.7235901031753982
XGBoost Report:
               precision    recall  f1-score   support

           0       0.33      0.00      0.00     13795
           1       0.72      1.00      0.84     36120

    accuracy                           0.72     49915
   macro avg       0.53      0.50      0.42     49915
weighted avg       0.62      0.72      0.61     49915



In [None]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions))
print("Naive Bayes Report:\n", classification_report(y_test, nb_predictions))


Naive Bayes Accuracy: 0.697185214865271
Naive Bayes Report:
               precision    recall  f1-score   support

           0       0.32      0.08      0.13     13795
           1       0.73      0.93      0.82     36120

    accuracy                           0.70     49915
   macro avg       0.52      0.51      0.47     49915
weighted avg       0.61      0.70      0.63     49915



In [None]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

skf = StratifiedKFold(n_splits=5)

for name, model in [("Logistic Regression", LogisticRegression(max_iter=1000, class_weight='balanced')),
                    ("Random Forest", RandomForestClassifier(n_estimators=100, class_weight='balanced')),
                    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
                    ("Naive Bayes", GaussianNB())]:
    print(f"{name} - Cross-validation Results for Vehicle-Related Crime Prediction:")
    for score_name, score_func in [('Precision (class 0)', precision_score), 
                                   ('Recall (class 0)', recall_score), 
                                   ('F1 Score (class 0)', f1_score)]:
        cv_results = cross_val_score(model, X_train_smote, y_train_smote, cv=skf, 
                                     scoring=make_scorer(score_func, zero_division=0, average=None, labels=[0]))
        print(f"  {score_name}: {np.mean(cv_results)}")
    print()


Logistic Regression - Cross-validation Results for Vehicle-Related Crime Prediction:
  Precision (class 0): 0.5392532095749848
  Recall (class 0): 0.38029667557383895
  F1 Score (class 0): 0.4460331005617083

Random Forest - Cross-validation Results for Vehicle-Related Crime Prediction:
  Precision (class 0): 0.5385176269723595
  Recall (class 0): 0.46284169689796784
  F1 Score (class 0): 0.49781950525399415

Decision Tree - Cross-validation Results for Vehicle-Related Crime Prediction:
  Precision (class 0): 0.5385112826584828
  Recall (class 0): 0.4628298576746209
  F1 Score (class 0): 0.4978099416774707

Naive Bayes - Cross-validation Results for Vehicle-Related Crime Prediction:
  Precision (class 0): 0.5285984503969341
  Recall (class 0): 0.39213619258847515
  F1 Score (class 0): 0.4502529357967221



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'max_iter': [100, 500, 1000]
}

lr_model = LogisticRegression(class_weight='balanced')

grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_lr.fit(X_train, y_train)

print("Best Parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best Score for Logistic Regression:", grid_search_lr.best_score_)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END ............C=0.001, max_iter=100, solver=newton-cg; total time=  18.0s
[CV] END ............C=0.001, max_iter=100, solver=newton-cg; total time=  20.3s
[CV] END ............C=0.001, max_iter=100, solver=newton-cg; total time=  19.9s
[CV] END ............C=0.001, max_iter=100, solver=newton-cg; total time=  19.0s
[CV] END ............C=0.001, max_iter=100, solver=newton-cg; total time=  20.3s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=   4.4s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=   4.5s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=   4.7s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=   4.4s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=   4.5s
[CV] END ............C=0.001, max_iter=100, solver=liblinear; total time=   0.4s
[CV] END ............C=0.001, max_iter=100, sol