In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Load the area-level and district-level pre-feature-selection CSVs, in that order.
area_pre_feature_selection, district_pre_feature_selection = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/pre_training/area_pre_feature_selection.csv',
        '../../data/pre_training/district_pre_feature_selection.csv',
    )
)

In [None]:
# Predictors: every column except the hourly crime count that is being predicted.
area_features = area_pre_feature_selection.drop(columns='area_crimes_this_hour')
district_features = district_pre_feature_selection.drop(columns='district_crimes_this_hour')

# Targets: 'year' is kept next to the crime count so the targets can be split by year
# in the same way as the features.
area_target = area_pre_feature_selection.loc[:, ['year', 'area_crimes_this_hour']]
district_target = district_pre_feature_selection.loc[:, ['year', 'district_crimes_this_hour']]

In [None]:
# Split the area dataset by year: rows before 2020 train the model, rows from 2020 test it.
# Indexes are reset so features and targets line up positionally after filtering.
area_feature_training_data = (
    area_features.loc[lambda frame: frame['year'] < 2020].reset_index(drop=True)
)
area_feature_testing_data = (
    area_features.loc[lambda frame: frame['year'] == 2020].reset_index(drop=True)
)

area_target_training_data = (
    area_target.loc[lambda frame: frame['year'] < 2020].reset_index(drop=True)
)
area_target_testing_data = (
    area_target.loc[lambda frame: frame['year'] == 2020].reset_index(drop=True)
)

In [None]:
# Split the district dataset by year: rows before 2020 train the model, rows from 2020 test it.
# Indexes are reset so features and targets line up positionally after filtering.
district_feature_training_data = (
    district_features.loc[lambda frame: frame['year'] < 2020].reset_index(drop=True)
)
district_feature_testing_data = (
    district_features.loc[lambda frame: frame['year'] == 2020].reset_index(drop=True)
)

district_target_training_data = (
    district_target.loc[lambda frame: frame['year'] < 2020].reset_index(drop=True)
)
district_target_testing_data = (
    district_target.loc[lambda frame: frame['year'] == 2020].reset_index(drop=True)
)

In [None]:
# 'year' was only carried along to split the targets; remove it so each target frame
# contains just the crime-count column.
area_target_training_data = area_target_training_data.drop(columns='year')
area_target_testing_data = area_target_testing_data.drop(columns='year')
district_target_training_data = district_target_training_data.drop(columns='year')
district_target_testing_data = district_target_testing_data.drop(columns='year')

##### Final Feature Engineering

In [None]:
# Build the linear-regression feature frames by removing the 'date_hour' column
# from each train/test feature set.
lr_area_feature_training_data = area_feature_training_data.drop(columns='date_hour')
lr_area_feature_testing_data = area_feature_testing_data.drop(columns='date_hour')

lr_district_feature_training_data = district_feature_training_data.drop(columns='date_hour')
lr_district_feature_testing_data = district_feature_testing_data.drop(columns='date_hour')

In [None]:
# target encoding of district/area columns
# The per-category target means are fitted on the training years only (year < 2020).
# Fitting them on the full dataset would leak the 2020 test targets into the features
# and make the test metrics optimistic.
area_encoding_rows = area_pre_feature_selection[area_pre_feature_selection['year'] < 2020]
district_encoding_rows = district_pre_feature_selection[district_pre_feature_selection['year'] < 2020]

area_means = area_encoding_rows.groupby('area_id')['area_crimes_this_hour'].mean()
district_means = district_encoding_rows.groupby('district')['district_crimes_this_hour'].mean()

# A category that only appears in the test year has no fitted mean; fall back to the
# overall training mean instead of leaving NaN in the test features.
area_fallback_mean = area_encoding_rows['area_crimes_this_hour'].mean()
district_fallback_mean = district_encoding_rows['district_crimes_this_hour'].mean()

lr_area_feature_training_data['area_id_target_encoded'] = lr_area_feature_training_data['area_id'].map(area_means)
lr_area_feature_testing_data['area_id_target_encoded'] = (
    lr_area_feature_testing_data['area_id'].map(area_means).fillna(area_fallback_mean)
)

lr_district_feature_training_data['district_target_encoded'] = lr_district_feature_training_data['district'].map(district_means)
lr_district_feature_testing_data['district_target_encoded'] = (
    lr_district_feature_testing_data['district'].map(district_means).fillna(district_fallback_mean)
)

In [None]:
# frequency encoding of district/area columns
# The frequencies are fitted on the training years only (year < 2020) so that no
# information about the held-out 2020 rows is used to build the features.
area_freq_rows = area_pre_feature_selection[area_pre_feature_selection['year'] < 2020]
district_freq_rows = district_pre_feature_selection[district_pre_feature_selection['year'] < 2020]

area_freq = area_freq_rows['area_id'].value_counts() / len(area_freq_rows)
district_freq = district_freq_rows['district'].value_counts() / len(district_freq_rows)

lr_area_feature_training_data['area_id_freq_encoded'] = lr_area_feature_training_data['area_id'].map(area_freq)
# A category never seen in training has a training frequency of 0.
lr_area_feature_testing_data['area_id_freq_encoded'] = (
    lr_area_feature_testing_data['area_id'].map(area_freq).fillna(0)
)

lr_district_feature_training_data['district_freq_encoded'] = lr_district_feature_training_data['district'].map(district_freq)
lr_district_feature_testing_data['district_freq_encoded'] = (
    lr_district_feature_testing_data['district'].map(district_freq).fillna(0)
)

In [None]:
# The raw identifier columns now have target- and frequency-encoded counterparts,
# so remove the originals from the model inputs.
lr_area_feature_training_data = lr_area_feature_training_data.drop(columns='area_id')
lr_area_feature_testing_data = lr_area_feature_testing_data.drop(columns='area_id')

lr_district_feature_training_data = lr_district_feature_training_data.drop(columns='district')
lr_district_feature_testing_data = lr_district_feature_testing_data.drop(columns='district')

In [None]:
def patch_datatypes(df):
    """Return a copy of ``df`` with 64-bit numeric columns downcast to 32-bit.

    float64 columns become float32. int64 columns become int32 only when every
    value fits in the int32 range; columns holding larger values are left as
    int64 so they are not silently wrapped. Columns of any other dtype are
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    int32_limits = np.iinfo(np.int32)
    dtype_map = {}

    for col in df.select_dtypes(include=['float64']).columns:
        dtype_map[col] = np.float32

    for col in df.select_dtypes(include=['int64']).columns:
        values = df[col]
        # An empty column has no values that could overflow, so it is always safe.
        if values.empty or (
            values.min() >= int32_limits.min and values.max() <= int32_limits.max
        ):
            dtype_map[col] = np.int32

    if not dtype_map:
        return df.copy()

    # astype returns a new frame, so the caller's frame is left unchanged.
    return df.astype(dtype_map)

In [None]:
# Apply patch_datatypes to shrink the 64-bit numeric columns of the area training features.
lr_area_feature_training_data = patch_datatypes(lr_area_feature_training_data)

In [None]:
# Apply patch_datatypes to shrink the 64-bit numeric columns of the area testing features.
lr_area_feature_testing_data = patch_datatypes(lr_area_feature_testing_data)

In [None]:
# Apply patch_datatypes to shrink the 64-bit numeric columns of the district training features.
lr_district_feature_training_data = patch_datatypes(lr_district_feature_training_data)

In [None]:
# Apply patch_datatypes to shrink the 64-bit numeric columns of the district testing features.
lr_district_feature_testing_data = patch_datatypes(lr_district_feature_testing_data)

In [None]:
def generate_correlation_heatmap(df, title='Correlation Coefficient Of Area Crime Predictors'):
    """Plot the lower triangle of the pairwise correlation matrix of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.
    title : str, optional
        Title drawn above the heatmap. Defaults to the area-level title; pass
        a different title when plotting another dataset.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Compute the correlation matrix once and reuse it for both the mask and the plot.
    corr = df.corr()

    # Generate a mask to only show the bottom triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # generate heatmap
    plt.figure(figsize=(70,70))
    sns.heatmap(corr, annot=True, mask=mask, vmin=-1, vmax=1)
    plt.title(title)
    plt.show()

In [None]:
# Plot the pairwise correlations between the area training features.
generate_correlation_heatmap(lr_area_feature_training_data)

In [None]:
# Plot the pairwise correlations between the district training features.
# NOTE(review): no title is passed here and the title generate_correlation_heatmap uses
# names 'Area Crime Predictors', so this district plot is labelled as an area plot.
generate_correlation_heatmap(lr_district_feature_training_data)

##### Using VIF to Remove Multicollinearity

In [None]:
# Function to compute VIF for all features
def compute_vif(feature_df):
    """Compute the variance inflation factor (VIF) of every column in ``feature_df``.

    A constant 'intercept' column is added to a copy of the frame before the
    VIFs are computed; no row is reported for that column.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Feature columns to evaluate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' (column name) and 'vif' (its VIF), indexed by the
        column's position in the design matrix.
    """
    print(f"{datetime.now()} - Starting VIF computation")
    X = feature_df.copy()
    # The calculation of variance inflation requires a constant
    X['intercept'] = 1

    # Build the design matrix once; calling X.values per feature would
    # re-materialise the whole array on every iteration.
    design_matrix = X.values
    columns = list(X.columns)

    # The intercept's own VIF is never reported, so it is not computed.
    feature_positions = [pos for pos, name in enumerate(columns) if name != 'intercept']

    # Create dataframe to store VIF values
    vif = pd.DataFrame(
        {
            "feature": [columns[pos] for pos in feature_positions],
            "vif": np.asarray(
                [variance_inflation_factor(design_matrix, pos) for pos in feature_positions],
                dtype=float,
            ),
        },
        index=feature_positions,
    )

    print(f"{datetime.now()} - Completed VIF computation")
    return vif

In [None]:
# Function to optimize VIF by dropping features with high VIF values
def optimize_vif(feature_df, vif_threshold):
    """Repeatedly drop the highest-VIF feature until all VIFs are below the threshold.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Candidate feature columns. It is not modified.
    vif_threshold : float
        Features are dropped while any VIF is greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        The result of ``compute_vif`` for the features that remain.
    """
    print(f"{datetime.now()} - Starting VIF optimization")
    remaining_features = feature_df.copy()

    vif_df = compute_vif(feature_df)

    while True:
        # Stop as soon as no feature reaches the threshold.
        if not (vif_df['vif'] >= vif_threshold).any():
            break

        print(f"{datetime.now()} - Current VIF values:\n{vif_df}")
        worst_row = vif_df['vif'].idxmax()
        largest_vif_feature = vif_df.loc[worst_row, 'feature']
        print(f"{datetime.now()} - Dropping feature: {largest_vif_feature} with VIF score of: {vif_df['vif'].max()}")

        # VIFs depend on the other columns, so recompute them after every removal.
        remaining_features = remaining_features.drop(columns=[largest_vif_feature])
        vif_df = compute_vif(remaining_features)

    print(f"{datetime.now()} - Completed VIF optimization")
    return vif_df

In [None]:
# Drop area features one at a time until every remaining VIF is below 10.
# NOTE(review): lr_area_selected_features_ten is reassigned to a hard-coded feature list
# in a later cell, so the value computed here is overwritten before it is used.
lr_area_selected_features_ten = optimize_vif(lr_area_feature_training_data, 10)

In [None]:
# Drop district features one at a time until every remaining VIF is below 10.
# NOTE(review): lr_district_selected_features_ten is reassigned to a hard-coded feature list
# in a later cell, so the value computed here is overwritten before it is used.
lr_district_selected_features_ten = optimize_vif(lr_district_feature_training_data, 10)

In [None]:
# Hard-coded area feature list, stored in a single 'feature' column.
# NOTE(review): presumably a saved result of the VIF optimisation above - confirm it is
# still in sync with what optimize_vif returns.
lr_area_selected_features_ten = pd.DataFrame({
    'feature': [
        # calendar
        "day",
        "hour",
        "year",
        "month",
        "day_of_week",
        # area-level columns
        "area_unemployment",
        "area_per_capita_income",
        "area_no_hs_dip",
        "area_gov_depend",
        "area_crowded_housing",
        "area_below_pov",
        "district",
        # police stations
        "police_stations_distance_0.1",
        "police_stations_distance_0.3",
        "police_stations_distance_0.5",
        "police_stations_distance_1",
        "police_stations_distance_3",
        # bike stations, bus stops, train stations
        "bike_stations_distance_0.1",
        "bike_stations_distance_0.3",
        "bike_stations_distance_0.5",
        "bus_stops_distance_0.1",
        "bus_stops_distance_0.3",
        "train_stations_distance_0.1",
        "train_stations_distance_0.3",
        "train_stations_distance_0.5",
        "train_stations_distance_1",
        "train_stations_distance_5",
        # alley lights and street lights
        "alleylights_distance_0.1",
        "alleylights_distance_0.3",
        "alleylights_distance_0.5",
        "alleylights_distance_3",
        "streetlights_allout_distance_0.1",
        "streetlights_allout_distance_0.3",
        "streetlights_allout_distance_0.5",
        "streetlights_allout_distance_1",
        "streetlights_allout_distance_5",
        "streetlights_oneout_distance_0.1",
        "streetlights_oneout_distance_0.3",
        "streetlights_oneout_distance_0.5",
        "streetlights_oneout_distance_1",
        # bike rides
        "bike_rides_within_0.1_and_5_min",
        "bike_rides_within_0.3_and_5_min",
        "bike_rides_within_0.5_and_5_min",
        "bike_rides_within_0.1_and_15_min",
        "hourly_bike_rides",
        # district-level columns
        "district_unemployment",
        "district_per_capita_income",
        "district_no_hs_dip",
        "rides",
        # previous-hours crime counts
        "district_crimes_1_hours_prev",
        "district_crimes_3_hours_prev",
        "area_crimes_1_hours_prev",
        "area_crimes_3_hours_prev",
        # encoded area identifier
        "area_id_target_encoded",
        "area_id_freq_encoded",
    ]
})

In [None]:
# Hard-coded district feature list, stored in a single 'feature' column.
# NOTE(review): presumably a saved result of the VIF optimisation above - confirm it is
# still in sync with what optimize_vif returns.
lr_district_selected_features_ten = pd.DataFrame({
    'feature': [
        # calendar
        "day",
        "hour",
        "year",
        "month",
        "day_of_week",
        # area-level columns
        "area_per_capita_income",
        "area_gov_depend",
        "area_crowded_housing",
        "area_below_pov",
        # police stations
        "police_stations_distance_0.1",
        "police_stations_distance_0.3",
        "police_stations_distance_0.5",
        "police_stations_distance_1",
        "police_stations_distance_3",
        # bike stations, bus stops, train stations
        "bike_stations_distance_0.1",
        "bike_stations_distance_0.3",
        "bus_stops_distance_0.1",
        "bus_stops_distance_0.3",
        "train_stations_distance_0.1",
        "train_stations_distance_0.3",
        "train_stations_distance_1",
        "train_stations_distance_5",
        # alley lights and street lights
        "alleylights_distance_0.1",
        "alleylights_distance_0.3",
        "alleylights_distance_1",
        "alleylights_distance_5",
        "streetlights_allout_distance_0.1",
        "streetlights_allout_distance_0.3",
        "streetlights_allout_distance_0.5",
        "streetlights_allout_distance_1",
        "streetlights_allout_distance_5",
        "streetlights_oneout_distance_0.1",
        "streetlights_oneout_distance_0.3",
        "streetlights_oneout_distance_0.5",
        "streetlights_oneout_distance_3",
        # bike rides
        "bike_rides_within_0.1_and_5_min",
        "bike_rides_within_0.3_and_5_min",
        "bike_rides_within_0.5_and_5_min",
        "bike_rides_within_0.1_and_15_min",
        "hourly_bike_rides",
        # district-level columns
        "district_per_capita_income",
        "district_crowded_housing",
        "disadvantaged_score",
        "rides",
        # previous-hours crime counts
        "district_crimes_1_hours_prev",
        "district_crimes_3_hours_prev",
        "district_crimes_6_hours_prev",
        "area_crimes_1_hours_prev",
        "area_crimes_3_hours_prev",
        # encoded district identifier
        "district_target_encoded",
        "district_freq_encoded",
    ]
})

##### Using SFS for Feature Selection

In [None]:
# Forward (non-floating) sequential feature selection for the area model, scored by
# 5-fold cross-validated negative MSE; k_features='best' lets the selector pick the subset size.
area_model = LinearRegression()
area_sfs = SFS(
    area_model,
    k_features='best',
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

In [None]:
# Restrict the area training features to the columns named in lr_area_selected_features_ten.
lr_area_feature_training_data = lr_area_feature_training_data.loc[
    :, lr_area_selected_features_ten['feature'].tolist()
]

In [None]:
# Fit the area selector on the training features and target.
# NOTE(review): area_sfs is re-created with k_features=(9, 19) and fitted again in the
# following cells, so the result of this fit is replaced.
area_sfs.fit(lr_area_feature_training_data, area_target_training_data)

In [None]:
# Rebuild the area selector so the chosen subset size is limited to between 9 and 19 features.
area_sfs = SFS(
    area_model,
    k_features=(9, 19),
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

In [None]:
# Fit the range-limited area selector; this is the fit whose result the later cells read.
area_sfs.fit(lr_area_feature_training_data, area_target_training_data)

In [None]:
# Forward (non-floating) sequential feature selection for the district model, scored by
# 5-fold cross-validated negative MSE; k_features='best' lets the selector pick the subset size.
district_model = LinearRegression()
district_sfs = SFS(
    district_model,
    k_features='best',
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

In [None]:
# Restrict the district training features to the columns named in lr_district_selected_features_ten.
lr_district_feature_training_data = lr_district_feature_training_data.loc[
    :, lr_district_selected_features_ten['feature'].tolist()
]

In [None]:
# Fit the district selector on the training features and target.
district_sfs.fit(lr_district_feature_training_data, district_target_training_data)

##### Model Training

In [None]:
# Define the final area model
# mlxtend documents `k_feature_names_` as the attribute of a fitted selector that holds
# the names of the selected features; use it so the final model is trained on the
# subset chosen by SFS.
area_selected_features = area_sfs.k_feature_names_
lr_area_feature_training_data = lr_area_feature_training_data[list(area_selected_features)]
lr_area_feature_testing_data = lr_area_feature_testing_data[list(area_selected_features)]

In [None]:
# Define the final district model
# mlxtend documents `k_feature_names_` as the attribute of a fitted selector that holds
# the names of the selected features; use it so the final model is trained on the
# subset chosen by SFS.
district_selected_features = district_sfs.k_feature_names_
lr_district_feature_training_data = lr_district_feature_training_data[list(district_selected_features)]
lr_district_feature_testing_data = lr_district_feature_testing_data[list(district_selected_features)]

In [None]:
# Fit an ordinary least-squares model on the selected area training features.
area_final_lr_model = LinearRegression()
area_final_lr_model.fit(
    X=lr_area_feature_training_data,
    y=area_target_training_data,
)

In [None]:
# Fit an ordinary least-squares model on the selected district training features.
district_final_lr_model = LinearRegression()
district_final_lr_model.fit(
    X=lr_district_feature_training_data,
    y=district_target_training_data,
)

##### Model Testing

In [None]:
# Predict using the area model
# The inputs are the held-out (year == 2020) area features.
area_predictions = area_final_lr_model.predict(lr_area_feature_testing_data)

In [None]:
# Score the area predictions against the held-out area targets.
area_mse = mean_squared_error(y_true=area_target_testing_data, y_pred=area_predictions)
area_mae = mean_absolute_error(y_true=area_target_testing_data, y_pred=area_predictions)
area_r2 = r2_score(y_true=area_target_testing_data, y_pred=area_predictions)
# RMSE is the square root of the MSE computed above.
area_rmse = np.sqrt(area_mse)

# Report the area metrics.
print("Area Model Performance Metrics:")
print(f"Mean Squared Error (MSE): {area_mse}")
print(f"Root Mean Squared Error (RMSE): {area_rmse}")
print(f"Mean Absolute Error (MAE): {area_mae}")
print(f"R^2 Score: {area_r2}")

In [None]:
# Predict using the district model
# The inputs are the held-out (year == 2020) district features.
district_predictions = district_final_lr_model.predict(lr_district_feature_testing_data)

In [None]:
# Score the district predictions against the held-out district targets.
district_mse = mean_squared_error(y_true=district_target_testing_data, y_pred=district_predictions)
district_mae = mean_absolute_error(y_true=district_target_testing_data, y_pred=district_predictions)
district_r2 = r2_score(y_true=district_target_testing_data, y_pred=district_predictions)
# RMSE is the square root of the MSE computed above.
district_rmse = np.sqrt(district_mse)

# Report the district metrics.
print("District Model Performance Metrics:")
print(f"Mean Squared Error (MSE): {district_mse}")
print(f"Root Mean Squared Error (RMSE): {district_rmse}")
print(f"Mean Absolute Error (MAE): {district_mae}")
print(f"R^2 Score: {district_r2}")