In [None]:
import pandas as pd
import numpy as np
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
# Read the two pre-feature-selection CSVs into DataFrames.
area_pre_feature_selection = pd.read_csv(
    filepath_or_buffer='../../data/pre_training/area_pre_feature_selection.csv'
)
district_pre_feature_selection = pd.read_csv(
    filepath_or_buffer='../../data/pre_training/district_pre_feature_selection.csv'
)

In [None]:
# Targets keep `year` next to the label column so they can be split by year.
area_target = area_pre_feature_selection[['year', 'area_crimes_this_hour']]
district_target = district_pre_feature_selection[['year', 'district_crimes_this_hour']]

# Features are every column except the label.
area_features = area_pre_feature_selection.drop(columns='area_crimes_this_hour')
district_features = district_pre_feature_selection.drop(columns='district_crimes_this_hour')

In [None]:
# Area split by year: rows before 2020 train the model, rows from 2020 test it.
# Rows with any other year end up in neither set.
area_feature_training_data = area_features.loc[area_features['year'].lt(2020)].reset_index(drop=True)
area_feature_testing_data = area_features.loc[area_features['year'].eq(2020)].reset_index(drop=True)

area_target_training_data = area_target.loc[area_target['year'].lt(2020)].reset_index(drop=True)
area_target_testing_data = area_target.loc[area_target['year'].eq(2020)].reset_index(drop=True)

In [None]:
# District split by year: rows before 2020 train the model, rows from 2020 test it.
# Rows with any other year end up in neither set.
district_feature_training_data = district_features.loc[district_features['year'].lt(2020)].reset_index(drop=True)
district_feature_testing_data = district_features.loc[district_features['year'].eq(2020)].reset_index(drop=True)

district_target_training_data = district_target.loc[district_target['year'].lt(2020)].reset_index(drop=True)
district_target_testing_data = district_target.loc[district_target['year'].eq(2020)].reset_index(drop=True)

In [None]:
# `year` was only needed for the split; leave just the label column in each target frame.
area_target_training_data = area_target_training_data.drop(columns='year')
area_target_testing_data = area_target_testing_data.drop(columns='year')
district_target_training_data = district_target_training_data.drop(columns='year')
district_target_testing_data = district_target_testing_data.drop(columns='year')

In [None]:
# Model inputs: the split feature frames without the `date_hour` column.
xgb_area_feature_training_data = area_feature_training_data.drop(columns='date_hour')
xgb_area_feature_testing_data = area_feature_testing_data.drop(columns='date_hour')

xgb_district_feature_training_data = district_feature_training_data.drop(columns='date_hour')
xgb_district_feature_testing_data = district_feature_testing_data.drop(columns='date_hour')

In [None]:
def patch_datatypes(df):
    """Return a copy of ``df`` with narrower numeric dtypes and categorical non-numerics.

    Conversions applied per column:
      * ``float64`` -> ``float32``
      * ``int64``   -> ``int32``
      * every non-numeric column -> ``category``
    Columns of any other numeric dtype are left unchanged.

    The input frame is not modified; a new DataFrame is returned, so calling
    this on a slice of another frame cannot write through to (or warn about)
    the parent, and re-running the calling cell is safe.

    Note: the int64 -> int32 cast performs no range check here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes should be patched.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, order and index as ``df``.
    """
    # Collect every conversion first so the frame is cast in a single pass.
    dtype_map = {}
    dtype_map.update({col: np.float32 for col in df.select_dtypes(include=['float64']).columns})
    dtype_map.update({col: np.int32 for col in df.select_dtypes(include=['int64']).columns})
    dtype_map.update({col: 'category' for col in df.select_dtypes(exclude=np.number).columns})

    if not dtype_map:
        # Nothing to convert (e.g. an empty frame); still hand back a fresh object.
        return df.copy()

    return df.astype(dtype_map)

In [None]:
# Narrow numeric dtypes / categorise non-numerics for the area training features.
xgb_area_feature_training_data = xgb_area_feature_training_data.pipe(patch_datatypes)

In [None]:
# Narrow numeric dtypes / categorise non-numerics for the area testing features.
xgb_area_feature_testing_data = xgb_area_feature_testing_data.pipe(patch_datatypes)

In [None]:
# Narrow numeric dtypes / categorise non-numerics for the district training features.
xgb_district_feature_training_data = xgb_district_feature_training_data.pipe(patch_datatypes)

In [None]:
# Narrow numeric dtypes / categorise non-numerics for the district testing features.
xgb_district_feature_testing_data = xgb_district_feature_testing_data.pipe(patch_datatypes)

In [None]:
# Wrap the area frames as DMatrix objects (features + label) with categorical columns enabled.
area_dtrain_reg = xgb.DMatrix(
    data=xgb_area_feature_training_data,
    label=area_target_training_data,
    enable_categorical=True,
)
area_dtest_reg = xgb.DMatrix(
    data=xgb_area_feature_testing_data,
    label=area_target_testing_data,
    enable_categorical=True,
)

In [None]:
# Wrap the district frames as DMatrix objects (features + label) with categorical columns enabled.
district_dtrain_reg = xgb.DMatrix(
    data=xgb_district_feature_training_data,
    label=district_target_training_data,
    enable_categorical=True,
)
district_dtest_reg = xgb.DMatrix(
    data=xgb_district_feature_testing_data,
    label=district_target_testing_data,
    enable_categorical=True,
)

##### Training the Area XGBoost Model

In [1]:
# Booster settings used by the xgb.train / xgb.cv calls in this notebook.
# NOTE(review): "gpu_hist" is deprecated in XGBoost 2.x in favour of
# tree_method="hist" with device="cuda" - confirm against the installed version.
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist", "eta": 0.01}

# xgb.train applies early stopping to the LAST entry of `evals`, so the
# held-out matrix has to come last. With the training matrix last, the run is
# stopped on the training metric instead of the validation metric.
evals = [(area_dtrain_reg, "train"), (area_dtest_reg, "validation")]

In [None]:
# Fit the area booster. Up to 100000 rounds are allowed; training halts once
# the metric monitored through `evals` has not improved for 50 rounds.
# Progress is logged every 50 rounds.
area_model = xgb.train(
    params,
    area_dtrain_reg,
    num_boost_round=100000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50,
)

In [None]:
# The shared `evals` list holds the AREA matrices, so the district booster must
# be monitored on its own data. The validation matrix goes last because
# xgb.train applies early stopping to the last entry of `evals`.
district_evals = [(district_dtrain_reg, "train"), (district_dtest_reg, "validation")]

# Fit the district booster: at most 100000 rounds, stop after 50 rounds without
# improvement on the district validation metric, log every 50 rounds.
district_model = xgb.train(
   params=params,
   dtrain=district_dtrain_reg,
   num_boost_round=100000,
   evals=district_evals,
   verbose_eval=50,
   early_stopping_rounds=50
)

In [None]:
# 10-fold cross-validation on the area training matrix.
# xgb.cv names its fold-count keyword `nfold`; `nfolds` is rejected with a TypeError.
area_cv_results = xgb.cv(
   params=params,
   dtrain=area_dtrain_reg,
   num_boost_round=100000,
   verbose_eval=50,
   early_stopping_rounds=50,
   nfold=10
)

In [None]:
# 10-fold cross-validation on the district training matrix.
# xgb.cv names its fold-count keyword `nfold`; `nfolds` is rejected with a TypeError.
district_cv_results = xgb.cv(
   params=params,
   dtrain=district_dtrain_reg,
   num_boost_round=100000,
   verbose_eval=50,
   early_stopping_rounds=50,
   nfold=10
)

In [None]:
# Calculate initial accuracy
# `area_model` comes from xgb.train, i.e. it is a native Booster, and
# Booster.predict only accepts a DMatrix - so score the prepared test matrix
# rather than the raw DataFrame.
area_y_pred = area_model.predict(area_dtest_reg)

# The reg:squarederror objective yields real-valued predictions, which
# accuracy_score cannot compare with discrete labels. Round to the nearest
# non-negative integer first.
# NOTE(review): assumes the target holds integer counts - confirm.
area_y_pred = np.rint(area_y_pred).clip(min=0).astype(int)
area_accuracy = accuracy_score(area_target_testing_data.values.ravel(), area_y_pred)

print(f"Initial Accuracy: {area_accuracy:.4f}")

In [None]:
# A native Booster has no `feature_importances_` attribute; importances come
# from get_score(), which returns {feature name: score} and omits features
# that were never used in a split. Rebuild one value per input column
# (0 for unused features) and normalise so the values sum to 1.
area_gain_by_feature = area_model.get_score(importance_type="gain")
area_importances = np.array(
    [area_gain_by_feature.get(name, 0.0) for name in xgb_area_feature_training_data.columns],
    dtype=float,
)
area_importance_total = area_importances.sum()
if area_importance_total > 0:
    area_importances = area_importances / area_importance_total

# Every importance value is a candidate cut-off, smallest first.
area_thresholds = sort(area_importances)
# Initialize variables to store the best results
area_best_accuracy = area_accuracy
area_best_thresh = None
area_best_features = None

In [None]:
# Iterate over thresholds to find the best feature set.
# `area_model` is a native Booster, which SelectFromModel cannot read
# importances from, so rebuild the per-column importance vector (normalised
# gain, 0 for unused features) and apply the same ">= threshold" rule by hand.
area_gain_by_feature = area_model.get_score(importance_type="gain")
area_feature_importances = np.array(
    [area_gain_by_feature.get(name, 0.0) for name in xgb_area_feature_training_data.columns],
    dtype=float,
)
if area_feature_importances.sum() > 0:
    area_feature_importances = area_feature_importances / area_feature_importances.sum()

# 1-D label vectors, as used for the final model fits.
area_y_train = area_target_training_data.values.ravel()
area_y_test = area_target_testing_data.values.ravel()

for thresh in area_thresholds:
    # Keep every feature whose importance reaches the threshold
    area_keep_mask = area_feature_importances >= thresh
    if not area_keep_mask.any():
        # Nothing survives this cut-off; there is no model to train.
        continue
    area_select_X_train = xgb_area_feature_training_data.loc[:, area_keep_mask]

    # Train the new model with selected features.
    # `XGBClassifier` is not imported by name in this notebook; reach it through `xgb`.
    # enable_categorical mirrors how the DMatrix for these frames is built.
    area_selection_model = xgb.XGBClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1, enable_categorical=True
    )
    area_selection_model.fit(area_select_X_train, area_y_train)

    # Evaluate the new model
    area_select_X_test = xgb_area_feature_testing_data.loc[:, area_keep_mask]
    area_predictions = area_selection_model.predict(area_select_X_test)
    area_accuracy = accuracy_score(area_y_test, area_predictions)

    # Print the results for the current threshold
    print(f"Thresh={thresh:.3f}, n={area_select_X_train.shape[1]}, Accuracy: {area_accuracy*100.0:.2f}%")

    # Update the best accuracy and corresponding features if improved
    if area_accuracy > area_best_accuracy:
        area_best_accuracy = area_accuracy
        area_best_thresh = thresh
        area_best_features = np.flatnonzero(area_keep_mask)

# Print the best threshold and corresponding accuracy.
# If no threshold beat the starting accuracy, `area_best_thresh` is still None
# and cannot be formatted with ':.3f'; fall back to keeping every column.
if area_best_thresh is None:
    area_best_features = np.arange(xgb_area_feature_training_data.shape[1])
    print(f"No threshold improved on the initial accuracy ({area_best_accuracy*100.0:.2f}%); keeping all features.")
else:
    print(f"Best Thresh={area_best_thresh:.3f}, Best Accuracy: {area_best_accuracy*100.0:.2f}%")
print(f"Best Features: {area_best_features}")

In [None]:
# Restrict both area frames to the winning feature columns.
# `area_best_features` is None when the search found no improving threshold;
# None is not a valid column selection, so keep the full feature set then.
if area_best_features is not None:
    xgb_area_feature_training_data = xgb_area_feature_training_data.iloc[:, area_best_features]
    xgb_area_feature_testing_data = xgb_area_feature_testing_data.iloc[:, area_best_features]

##### Training the District XGBoost Model

In [None]:
# Baseline district classifier trained on all features.
# Only `import xgboost as xgb` is in scope, so the bare name `XGBClassifier`
# raises NameError; reference the class through the module alias.
district_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1)
district_model.fit(xgb_district_feature_training_data, district_target_training_data.values.ravel())

In [None]:
# Score the all-features district classifier on the held-out rows.
district_y_pred = district_model.predict(xgb_district_feature_testing_data)
district_accuracy = accuracy_score(
    y_true=district_target_testing_data,
    y_pred=district_y_pred,
)
print(f"Accuracy: {district_accuracy:.4f}")

In [None]:
# Each importance value of the fitted district model is tried as a cut-off, smallest first.
district_thresholds = sort(district_model.feature_importances_)
# Start the search from the all-features accuracy; no threshold or subset chosen yet.
district_best_thresh = district_best_features = None
district_best_accuracy = district_accuracy

In [None]:
# 1-D label vectors, as used for the other classifier fits in this notebook.
district_y_train = district_target_training_data.values.ravel()
district_y_test = district_target_testing_data.values.ravel()

for thresh in district_thresholds:
    # Select features using the threshold
    district_selection = SelectFromModel(district_model, threshold=thresh, prefit=True)
    district_select_X_train = district_selection.transform(xgb_district_feature_training_data)

    # Train the new model with selected features.
    # `XGBClassifier` is not imported by name in this notebook; reach it through `xgb`.
    district_selection_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1)
    district_selection_model.fit(district_select_X_train, district_y_train)

    # Evaluate the new model
    district_select_X_test = district_selection.transform(xgb_district_feature_testing_data)
    district_predictions = district_selection_model.predict(district_select_X_test)
    district_accuracy = accuracy_score(district_y_test, district_predictions)

    # Print the results for the current threshold
    print(f"Thresh={thresh:.3f}, n={district_select_X_train.shape[1]}, Accuracy: {district_accuracy*100.0:.2f}%")

    # Update the best accuracy and corresponding features if improved
    if district_accuracy > district_best_accuracy:
        district_best_accuracy = district_accuracy
        district_best_thresh = thresh
        district_best_features = district_selection.get_support(indices=True)

# Print the best threshold and corresponding accuracy.
# If no threshold beat the starting accuracy, `district_best_thresh` is still
# None and cannot be formatted with ':.3f'; fall back to keeping every column.
if district_best_thresh is None:
    district_best_features = np.arange(xgb_district_feature_training_data.shape[1])
    print(f"No threshold improved on the initial accuracy ({district_best_accuracy*100.0:.2f}%); keeping all features.")
else:
    print(f"Best Thresh={district_best_thresh:.3f}, Best Accuracy: {district_best_accuracy*100.0:.2f}%")
print(f"Best Features: {district_best_features}")

In [None]:
# Restrict both district frames to the winning feature columns.
# `district_best_features` is None when the search found no improving
# threshold; None is not a valid column selection, so keep the full set then.
if district_best_features is not None:
    xgb_district_feature_training_data = xgb_district_feature_training_data.iloc[:, district_best_features]
    xgb_district_feature_testing_data = xgb_district_feature_testing_data.iloc[:, district_best_features]

##### Hyperparameter Tuning

In [None]:
# Candidate values for the grid search: 3 depths x 3 learning rates x 3 subsample ratios.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

In [None]:
# Create the XGBoost model object
area_xgb_model = xgb.XGBClassifier()

# Create the GridSearchCV object (5-fold CV, scored by accuracy)
area_grid_search = GridSearchCV(area_xgb_model, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data.
# The labels must be the area TRAINING targets; passing the flattened test
# feature matrix as `y` is neither the right data nor the right length.
area_grid_search.fit(xgb_area_feature_training_data, area_target_training_data.values.ravel())

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", area_grid_search.best_params_)
print("Best score: ", area_grid_search.best_score_)

In [None]:
# Create the XGBoost model object
district_xgb_model = xgb.XGBClassifier()

# Create the GridSearchCV object (5-fold CV, scored by accuracy).
# Tune the district estimator created above, not the area one.
district_grid_search = GridSearchCV(district_xgb_model, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data.
# The labels must be the district TRAINING targets; passing the flattened test
# feature matrix as `y` is neither the right data nor the right length.
district_grid_search.fit(xgb_district_feature_training_data, district_target_training_data.values.ravel())

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", district_grid_search.best_params_)
print("Best score: ", district_grid_search.best_score_)

##### Training of Final XGBoost Models

In [None]:
# Extract the best parameters from the grid search
area_best_params = area_grid_search.best_params_

# Create the final model with the best parameters.
# Only `import xgboost as xgb` is in scope, so the bare name `XGBClassifier`
# raises NameError; reference the class through the module alias.
area_final_xgb_model = xgb.XGBClassifier(**area_best_params)

In [None]:
# Fit the tuned area model on the current training feature frame; labels are flattened to 1-D.
area_final_xgb_model.fit(
    xgb_area_feature_training_data,
    area_target_training_data.to_numpy().ravel(),
)

In [None]:
# Score the tuned area model on the held-out rows.
area_final_predictions = area_final_xgb_model.predict(xgb_area_feature_testing_data)
area_final_accuracy = accuracy_score(
    y_true=area_target_testing_data,
    y_pred=area_final_predictions,
)
print(f"Final Model Accuracy: {area_final_accuracy:.4f}")

In [None]:
# Extract the best parameters from the grid search
district_best_params = district_grid_search.best_params_

# Create the final model with the best parameters.
# Only `import xgboost as xgb` is in scope, so the bare name `XGBClassifier`
# raises NameError; reference the class through the module alias.
district_final_xgb_model = xgb.XGBClassifier(**district_best_params)

In [None]:
# Fit the tuned district model on the current training feature frame; labels are flattened to 1-D.
district_final_xgb_model.fit(
    xgb_district_feature_training_data,
    district_target_training_data.to_numpy().ravel(),
)

In [None]:
# Score the tuned district model on the held-out rows.
district_final_predictions = district_final_xgb_model.predict(xgb_district_feature_testing_data)
district_final_accuracy = accuracy_score(
    y_true=district_target_testing_data,
    y_pred=district_final_predictions,
)
print(f"Final Model Accuracy: {district_final_accuracy:.4f}")