In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.io import arff
import os
import scipy.stats as stats
from scipy.stats import lognorm
from scipy.stats import norm
from joblib import Parallel, delayed
from tqdm import tqdm

# Seed passed as random_state to both the train/validation split and the random forest in process_dataset.
SEED = 14208

In [10]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

In [11]:
def process_dataset(dataset_filepath: str, pred_col_name: str):
    """Load a dataset, fit a random forest regressor and collect per-tree predictions.

    The file is read as ARFF or CSV depending on its extension, split 80/20
    into train and validation parts (seeded with ``SEED``), one-hot encoded
    with ``pd.get_dummies`` and used to fit a ``RandomForestRegressor``.

    Args:
        dataset_filepath: Path to a ``.csv`` or ``.arff`` file.
        pred_col_name: Name of the column to predict.

    Returns:
        A tuple ``(df, mse, predictions, tree_predictions)`` where ``df`` is
        the full loaded frame, ``mse`` is the validation mean squared error of
        the forest, ``predictions`` are the forest predictions on the
        validation part, and ``tree_predictions`` is an array with one row per
        validation sample and one column per tree of the forest.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.arff``.
    """
    # Choose the loader from the file extension.
    extension = os.path.splitext(dataset_filepath)[1]
    if extension == '.csv':
        df = pd.read_csv(dataset_filepath)
    elif extension == '.arff':
        # loadarff returns (records, metadata); only the records are needed.
        records, _meta = arff.loadarff(dataset_filepath)
        df = pd.DataFrame(records)
    else:
        raise ValueError("Formato no soportado, `dataset_filepath` debe tener una de las siguientes extensiones: .csv, .arff")

    # Split into train and validation parts, then separate features from target.
    train_part, valid_part = train_test_split(df, test_size=0.2, random_state=SEED)
    X_train = train_part.drop(pred_col_name, axis=1)
    X_valid = valid_part.drop(pred_col_name, axis=1)
    y_train = train_part[pred_col_name]
    y_valid = valid_part[pred_col_name]

    # One-hot encode categorical variables independently, then force the
    # validation columns to match the training columns (missing ones become 0,
    # extra ones are dropped by the left join).
    X_train, X_valid = pd.get_dummies(X_train).align(
        pd.get_dummies(X_valid), join='left', axis=1, fill_value=0
    )

    # Fit the random forest.
    forest = RandomForestRegressor(random_state=SEED)
    forest.fit(X_train, y_train)

    # Stack each tree's validation predictions, then transpose so that rows
    # are validation samples and columns are trees.
    tree_predictions = np.array(
        [estimator.predict(X_valid) for estimator in forest.estimators_]
    ).T

    # Evaluate the whole forest on the validation part.
    predictions = forest.predict(X_valid)
    mse = mean_squared_error(y_valid, predictions)

    return df, mse, predictions, tree_predictions

In [12]:
def lognormal_KS_fit_check(tree_predictions, shift=1e-6, alpha=0.05):
    """Proportion of rows whose lognormal-fit KS statistic lies in a fixed window.

    For every row of ``tree_predictions`` a lognormal distribution is fitted
    (location fixed at 0) and a Kolmogorov-Smirnov test is run against the
    fitted distribution. A row counts as fitting when its KS statistic lies in
    ``0.07554 +/- 0.05 * 0.9833``.

    The input is never modified: the shift is applied to a float copy, so the
    same array can safely be passed to other checks afterwards.

    Args:
        tree_predictions: 2-D array-like; the function iterates over its
            first axis and treats each row as one sample of values.
        shift: Constant added to every value before fitting.
        alpha: Accepted for interface compatibility; currently unused (the
            KS p-value is not consulted).

    Returns:
        The fraction of rows inside the acceptance window, or NaN when
        ``tree_predictions`` has no rows.
    """
    # Build a shifted float copy instead of using `+=`, which would mutate the
    # caller's array and fail outright on integer arrays or plain lists.
    shifted = np.asarray(tree_predictions, dtype=float) + shift

    ks_stats = []
    for row in shifted:
        # Estimate the lognormal parameters with the location pinned at 0.
        shape, loc, scale = stats.lognorm.fit(row, floc=0)

        # Kolmogorov-Smirnov test against the fitted distribution; only the
        # statistic is used below.
        ks_stat, _p_value = stats.kstest(row, 'lognorm', args=(shape, loc, scale))
        ks_stats.append(ks_stat)

    if not ks_stats:
        # No rows to evaluate: the proportion is undefined.
        return float("nan")

    # NOTE(review): the window centre and half-width are hard-coded constants
    # whose origin is not shown here; confirm this is the intended tolerance.
    eps = 0.05 * 0.9833
    lower, upper = 0.07554 - eps, 0.07554 + eps
    n_inside = sum(1 for stat in ks_stats if lower <= stat <= upper)

    return n_inside / len(ks_stats)

In [13]:
def transforme_normal_SW_fit_check(tree_predictions, shift=1e-6, alpha=0.05):
    """Proportion of rows whose Shapiro-Wilk W, after a log1p transform, lies in a fixed window.

    Every value is transformed with ``np.log1p`` and a Shapiro-Wilk test is
    run on each row of the transformed array. A row counts as fitting when its
    W statistic lies in ``0.9833 +/- 0.05 * 0.9833``.

    Args:
        tree_predictions: 2-D array-like; the function iterates over its
            first axis and treats each row as one sample of values.
        shift: Accepted but not used by this function.
        alpha: Accepted but not used by this function (the Shapiro-Wilk
            p-value is not consulted).

    Returns:
        The fraction of rows whose W statistic is inside the window.
    """
    log_rows = np.log1p(tree_predictions)

    # Keep only the W statistic of each Shapiro-Wilk test.
    w_values = [stats.shapiro(row)[0] for row in log_rows]

    # Acceptance window around the reference W value.
    half_width = 0.05 * 0.9833
    lower, upper = 0.9833 - half_width, 0.9833 + half_width
    n_inside = sum(1 for w in w_values if lower <= w <= upper)

    return n_inside / len(w_values)

In [14]:
# Run the full pipeline on the Titanic fare dataset and report both fit proportions.
dataset_filepath = 'datasets/titanic_fare_test.arff'  # The paths to the datasets would go here
pred_col_name = 'Fare'  # Column to predict
dataset_titanic, mse_titanic, predictions_titanic, tree_predictions_titanic = process_dataset(dataset_filepath, pred_col_name)

# tree_predictions_titanic has one row per validation sample and one column per tree,
# and both checks iterate over rows, so each proportion below is taken over validation samples.
ks_fit_proportion = lognormal_KS_fit_check(tree_predictions_titanic)
print(f"Proportion of trees that fit the lognormal distribution: {ks_fit_proportion:.5f}")

sw_fit_proportion = transforme_normal_SW_fit_check(tree_predictions_titanic)
print(f"Proportion of trees that fit the normal distribution: {sw_fit_proportion:.5f}")

Proportion of trees that fit the lognormal distribution: 0.20992
Proportion of trees that fit the normal distribution: 0.20611
