In [1]:
import pandas as pd
import glob
import os
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import root_mean_squared_error
le = preprocessing.LabelEncoder()
from tqdm.notebook import tqdm
import json

In [2]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort so the
# folds are always processed in the same sequence.
folds = sorted(glob.glob("../results_homicides/folds/*"))

In [3]:
def calculate_sae(y_pred,  y_true):
    """Return the sum of absolute errors after rescaling the predictions.

    The predictions are min-max rescaled onto the [min, max] range of
    ``y_true`` and the absolute differences to ``y_true`` are summed.

    Parameters
    ----------
    y_pred : array-like
        Predicted values; must support ``.min()``, ``.max()`` and
        element-wise arithmetic (e.g. a NumPy array or pandas Series).
    y_true : array-like
        Reference values with the same requirements and length.

    Returns
    -------
    scalar
        Sum of ``|rescaled y_pred - y_true|``.
    """
    pred_range = y_pred.max() - y_pred.min()
    true_range = y_true.max() - y_true.min()

    # Constant predictions have a zero range: rescaling would compute 0/0 = NaN,
    # and a pandas sum silently skips NaN, reporting a perfect error of 0.
    # In that case the predictions are compared without rescaling.
    if pred_range != 0:
        y_pred = (((y_pred - y_pred.min()) * true_range) / pred_range) + y_true.min()

    return abs(y_pred - y_true).sum()

def calculate_rmse(y_pred,  y_true):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Thin wrapper around ``sklearn.metrics.root_mean_squared_error``; note that
    this function takes the predictions first, while the sklearn call takes
    the reference values first.
    """
    return root_mean_squared_error(y_true, y_pred)

In [4]:
def convert_txt_to_json(input_path, output_path):
    """Convert a one-feature-per-line text file into a JSON document.

    The first column of ``input_path`` (read as header-less CSV) is written to
    ``output_path`` as ``{"selected_features": [...]}``.

    Parameters
    ----------
    input_path : str or path-like
        Header-less CSV/text file whose first column holds feature names.
    output_path : str or path-like
        Destination JSON file (overwritten if it exists).
    """
    try:
        # dtype=str + keep_default_na=False keep every name verbatim: without
        # them pandas would turn "001" into 1 and names such as "NA"/"null"
        # into NaN (serialised as "nan").
        df = pd.read_csv(input_path, header=None, dtype=str, keep_default_na=False)
        features_list = df.iloc[:, 0].astype(str).tolist()
    except pd.errors.EmptyDataError:
        # An empty file means no feature was selected.
        features_list = []

    data_json = {'selected_features': features_list}

    with open(output_path, 'w') as f:
        json.dump(data_json, f, indent=4)

In [5]:
# Deterministic feature selection
#
# Selects `n_features` features on the full dataset in two ways (RFE driven by a
# decision tree, and RandomForest importance ranking), using fixed seeds, and
# writes the selected column names to text files.
# Uses names from the imports cell: pd, os, le, RFE, DecisionTreeRegressor,
# RandomForestRegressor.

results = {}

n_features = 18  # Adjust to the number of features selected by CFS

df_all = pd.read_csv('../datasets/br_data_census_2010_homicides.csv',low_memory=False,index_col=0)

# Data sanitization - removing columns whose name starts with GEO or GLOBAL
excluded_cols = [col for col in df_all.columns if col.startswith(('GEO', 'GLOBAL'))]
df_all = df_all.drop(columns=excluded_cols)

# Shuffle the rows once (fixed seed for reproducibility) and only then split into
# features / target, so X and y stay aligned by construction instead of relying
# on two separate shuffles producing the same permutation.
df_shuffled = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
X_train = df_shuffled.iloc[:, :-1].copy()
y_train = df_shuffled.iloc[:, -1]  # the target is the last column

# Label-encode categorical (object) columns so the tree models can consume them
for column_name in X_train.columns:
    if X_train[column_name].dtype == object:
        X_train[column_name] = le.fit_transform(X_train[column_name])

# Model for RFE
estimator = DecisionTreeRegressor(max_depth=5, random_state=42)
selector = RFE(estimator, n_features_to_select=n_features, step=100)
selector.fit(X_train, y_train)

# Make sure the output folder exists before writing into it
os.makedirs('features_selected_homicides', exist_ok=True)

# Save RFE-selected features
pd.DataFrame(X_train.columns[selector.support_]).to_csv(
    'features_selected_homicides/features_hom_rfe.txt', index=False, header=False
)

# Model for feature importance
rf = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

# Positions of the n_features most important columns, most important first
rf_top_idx = rf.feature_importances_.argsort()[::-1][0:n_features]

# Save RF-selected features
pd.DataFrame(X_train.columns[rf_top_idx]).to_csv(
    'features_selected_homicides/features_hom_rf.txt', index=False, header=False
)

In [6]:
# Mirror each selected-feature text file as a JSON document.
convert_txt_to_json(
    input_path='features_selected_homicides/features_hom_rfe.txt',
    output_path='features_selected_homicides/rfe.json',
)
convert_txt_to_json(
    input_path='features_selected_homicides/features_hom_rf.txt',
    output_path='features_selected_homicides/rf.json',
)

In [None]:
# Step: Evaluate the RFE and RandomForest feature selections on every fold
# (run only if needed). Requires `results` and `n_features` from the
# feature-selection cell.
# For each fold:
# - Load train/test datasets and split into features (X) and target (y, last column).
# - Label-encode categorical columns, fitting the encoding on the training split only.
# - Perform feature selection using RFE and train RandomForest for feature importance.
# - Evaluate KNN models for k = 10, 20, ..., 60 using each selected feature subset.
# - Store evaluation results (SAE and RMSE) in the `results` dictionary as
#   results[fold][k][method] = {'sae': ..., 'rmse': ...}.

for fold in tqdm(folds):
    print(fold)
    results[fold] = {}

    df_train = pd.read_csv(os.path.join(fold,'train.csv'),low_memory=False)
    df_test = pd.read_csv(os.path.join(fold,'test.csv'),low_memory=False)

    # Copy the slices so the encoded columns are written to independent frames
    # (avoids chained-assignment warnings on views of df_train / df_test).
    X_train = df_train.iloc[:,:-1].copy()
    y_train = df_train.iloc[:,-1]

    X_test = df_test.iloc[:,:-1].copy()
    y_test = df_test.iloc[:,-1]

    for column_name in X_train.columns:
        if X_train[column_name].dtype == object:
            X_train[column_name] = le.fit_transform(X_train[column_name])
            # Reuse the training encoding for the test split so a category gets
            # the same integer code in both; labels unseen in training map to -1.
            code_of = {label: code for code, label in enumerate(le.classes_)}
            X_test[column_name] = X_test[column_name].map(code_of).fillna(-1).astype(int)

    # Seeded like the full-dataset selection so repeated runs give the same results.
    estimator = DecisionTreeRegressor(max_depth=5, random_state=42)
    selector = RFE(estimator, n_features_to_select=n_features, step=100)
    selector.fit(X_train, y_train)

    rf = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)
    rf.fit(X_train,y_train)

    # Column selectors for each method, computed once per fold:
    # a boolean mask for RFE, the top-n_features positions for RandomForest.
    feature_sets = {
        'RFE': selector.support_,
        'RF': rf.feature_importances_.argsort()[::-1][0:n_features],
    }

    for k in range(10,61,10):
        results[fold][k] = {}

        model = KNeighborsRegressor(n_neighbors=k)

        for method, columns in feature_sets.items():
            model.fit(X_train.iloc[:, columns], y_train)
            y_pred = model.predict(X_test.iloc[:, columns])
            # Scalars only: a trailing comma here would store a 1-tuple.
            results[fold][k][method] = {
                'sae': calculate_sae(y_pred, y_test),
                'rmse': calculate_rmse(y_pred, y_test),
            }