In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.io import arff
import os
import scipy.stats as stats
from scipy.stats import lognorm
from scipy.stats import norm

# Random seed shared by `train_test_split` and `RandomForestRegressor` in
# `process_dataset`, so the split and the fitted forest are reproducible.
SEED = 21415

In [2]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings raised by the libraries used below; consider narrowing it to the
# specific categories/modules that are known to be noisy.
warnings.filterwarnings("ignore")

In [3]:
def process_dataset(dataset_filepath: str, pred_col_name: str):
    """Load a dataset, fit a random forest regressor and collect per-tree predictions.

    The file is read as ARFF or CSV depending on its extension, split 80/20 into
    train and validation sets, one-hot encoded, and used to fit a
    ``RandomForestRegressor``. Both the split and the forest are seeded with the
    module-level ``SEED``.

    Args:
        dataset_filepath: Path to a ``.arff`` or ``.csv`` file. The extension is
            matched case-insensitively.
        pred_col_name: Name of the column to predict.

    Returns:
        A tuple ``(df, mse, predictions, tree_predictions)``:
        ``df`` is the full loaded DataFrame, ``mse`` is the forest's mean squared
        error on the validation set, ``predictions`` are the forest's validation
        predictions, and ``tree_predictions`` is an array with one row per
        validation instance and one column per tree of the forest.

    Raises:
        ValueError: If the file extension is neither ``.arff`` nor ``.csv``.
        KeyError: If ``pred_col_name`` is not a column of the loaded dataset.
    """
    # Choose the loader from the file extension; lower-casing it means that
    # names such as "DATA.CSV" or "data.ARFF" are accepted as well.
    _, file_extension = os.path.splitext(dataset_filepath)
    file_extension = file_extension.lower()

    if file_extension == '.arff':
        # loadarff returns (records, metadata); only the records are needed.
        records, _ = arff.loadarff(dataset_filepath)
        df = pd.DataFrame(records)
    elif file_extension == '.csv':
        df = pd.read_csv(dataset_filepath)
    else:
        raise ValueError("Formato no soportado, `dataset_filepath` debe tener una de las siguientes extensiones: .csv, .arff")

    # Fail early, before splitting, when the target column is missing.
    if pred_col_name not in df.columns:
        raise KeyError(f"La columna `{pred_col_name}` no existe en el dataset")

    # Split the dataset into train and validation sets
    train_df, validation_df = train_test_split(df, test_size=0.2, random_state=SEED)
    y_train, y_valid = train_df[pred_col_name], validation_df[pred_col_name]
    X_train = train_df.drop(columns=pred_col_name)
    X_valid = validation_df.drop(columns=pred_col_name)

    # One-hot encode categorical variables. The encodings are computed
    # separately, so the validation frame is aligned to the training columns:
    # categories missing in validation become all-zero columns and categories
    # seen only in validation are dropped.
    X_train = pd.get_dummies(X_train)
    X_valid = pd.get_dummies(X_valid)
    X_train, X_valid = X_train.align(X_valid, join='left', axis=1, fill_value=0)

    # Create and fit the random forest model
    rf_model = RandomForestRegressor(random_state=SEED)
    rf_model.fit(X_train, y_train)

    # Collect the prediction of every individual tree, then transpose so that
    # each row holds all tree predictions for one validation instance.
    tree_predictions = np.array(
        [tree.predict(X_valid) for tree in rf_model.estimators_]
    ).T

    # Evaluate the whole forest on the validation set
    predictions = rf_model.predict(X_valid)
    mse = mean_squared_error(y_valid, predictions)

    return df, mse, predictions, tree_predictions

In [22]:
def lognormal_fit_check(tree_predictions, shift=1e-6):
    """Fit a lognormal distribution to each row and test the fit with Kolmogorov-Smirnov.

    Args:
        tree_predictions: 2-D array-like; each row is treated as one sample of
            values to fit (in this notebook, the per-tree predictions for one
            validation instance). The input is never modified.
        shift: Constant added to every value before fitting, so that the data
            passed to the lognormal fit is strictly positive when predictions
            are zero.

    Returns:
        A list with one tuple ``(ks_stat, p_value, shape, loc, scale)`` per row,
        where ``shape``, ``loc`` and ``scale`` are the fitted lognormal
        parameters (``loc`` is fixed at 0) and ``ks_stat``/``p_value`` come from
        the KS test of the row against that fitted distribution.
    """
    # Work on a shifted float copy. The original `tree_predictions += shift`
    # modified the caller's array in place, so every call shifted the shared
    # data a little further; it also failed outright on integer arrays and on
    # plain lists.
    shifted = np.asarray(tree_predictions, dtype=float) + shift

    ks_results = []
    for predictions in shifted:
        # Estimate the lognormal parameters with the location pinned to 0
        shape, loc, scale = stats.lognorm.fit(predictions, floc=0)

        # Kolmogorov-Smirnov test of the row against the fitted lognormal
        ks_stat, p_value = stats.kstest(predictions, 'lognorm', args=(shape, loc, scale))
        ks_results.append((ks_stat, p_value, shape, loc, scale))

    return ks_results

def print_ks_res(tree_predictions):
    """Print the mean KS statistic and mean p-value of the per-row lognormal fits.

    Runs ``lognormal_fit_check`` on ``tree_predictions`` and averages the first
    two entries (KS statistic and p-value) of every result tuple.
    """
    fits = lognormal_fit_check(tree_predictions)
    n_fits = len(fits)

    # Accumulate sequentially so the totals match a plain running sum.
    ks_total = 0
    p_total = 0
    for fit in fits:
        ks_total = ks_total + fit[0]
        p_total = p_total + fit[1]

    mean_ks = ks_total / n_fits
    mean_p = p_total / n_fits
    print(f"AVG KS: {mean_ks} | AVG p-value: {mean_p}")

def plot_log_normal_fit(predictions, dataset_name, save=False, shift=1e-6, given_index=0):
    """Plot the KDE of one row of predictions together with its fitted lognormal PDF.

    Args:
        predictions: 2-D array-like; ``predictions[given_index]`` is the sample
            that gets plotted. The input is never modified.
        dataset_name: Used to build the output file name when ``save`` is true.
        save: When true, write the figure to
            ``graficos/{dataset_name}_distribution_log.png``. The file name does
            not include ``given_index``, so repeated calls for the same dataset
            overwrite the same file.
        shift: Constant added to the selected row before fitting, so the data
            passed to the lognormal fit is strictly positive when predictions
            are zero.
        given_index: Row of ``predictions`` to plot; also selects the KDE colour.

    The figure is left open as the current Matplotlib figure so the caller can
    show, save or close it.
    """
    # Shift only a private copy of the selected row. The original
    # `predictions += shift` modified the caller's whole array in place, so
    # calling this function in a loop kept shifting the shared data.
    prediction = np.asarray(predictions[given_index], dtype=float) + shift

    plt.figure(figsize=(10, 6))

    # `plt.get_cmap` replaces `matplotlib.cm.get_cmap`, which recent Matplotlib
    # releases deprecated and then removed.
    cmap = plt.get_cmap('tab10')

    sns.kdeplot(prediction, color=cmap(given_index % cmap.N))

    # Fit a lognormal with the location pinned to 0 and evaluate its PDF over
    # the observed range of the sample.
    shape, loc, scale = lognorm.fit(prediction, floc=0)
    x = np.linspace(prediction.min(), prediction.max(), 1000)
    pdf_fitted = lognorm.pdf(x, shape, loc, scale)

    plt.plot(x, pdf_fitted, 'b-', lw=2, label='Fitted Log-Normal')
    # The curve above carries a label, so draw the legend that displays it.
    plt.legend()

    plt.title(f"Distribucion de las predicciones | Instancia {given_index} [Validation Set]")

    if save:
        # Create the target directory first so savefig does not fail on a fresh checkout.
        os.makedirs('graficos', exist_ok=True)
        plt.savefig(f'graficos/{dataset_name}_distribution_log.png', format='png', dpi=300, bbox_inches='tight')

In [19]:
# Path to the dataset file (paths to other datasets would be plugged in here)
dataset_filepath = 'titanic_fare_test.arff'
# Column to predict
pred_col_name = 'Fare'

# Fit the forest and keep the loaded data, the validation MSE, the forest
# predictions and the per-tree predictions.
(
    dataset_titanic,
    mse_titanic,
    predictions_titanic,
    tree_predictions_titanic,
) = process_dataset(dataset_filepath, pred_col_name)

In [20]:
# Print the average KS statistic and average p-value of the lognormal fits
# computed over the rows of the per-tree prediction matrix.
print_ks_res(tree_predictions_titanic)

AVG KS: 0.2752746977608068 | AVG p-value: 0.15117657085914157


In [None]:
# Plot the lognormal fit for every row of the per-tree prediction matrix.
# The row count is taken from the array instead of a hard-coded 262, so the
# cell keeps working if the dataset or the split size changes.
for i in range(len(tree_predictions_titanic)):
    plot_log_normal_fit(tree_predictions_titanic, 'Titanic', save=False, shift=1e-6, given_index=i)

In [24]:
import imageio

# Ensure the output directory exists
output_dir = 'gif_frames'
os.makedirs(output_dir, exist_ok=True)

# One frame per row of the per-tree prediction matrix. The count is taken from
# the array instead of a hard-coded 262, and the frame paths are built once so
# the save loop and the GIF loop cannot drift apart.
n_frames = len(tree_predictions_titanic)
frame_paths = [os.path.join(output_dir, f'frame_{i}.png') for i in range(n_frames)]

# Save each plot as an image
for i, filename in enumerate(frame_paths):
    # save=False: the frame is written explicitly just below. Passing save=True
    # made the plotting function additionally rewrite the same
    # graficos/Titanic_distribution_log.png file on every iteration.
    plot_log_normal_fit(tree_predictions_titanic, 'Titanic', save=False, shift=1e-6, given_index=i)
    plt.savefig(filename)
    plt.close()  # Close the plot to free up memory

# Create a GIF from the saved images
with imageio.get_writer('lognormal_fits.gif', mode='I', duration=2) as writer:
    for filename in frame_paths:
        image = imageio.imread(filename)
        writer.append_data(image)

# Clean up the images (remove files)
# import shutil
# shutil.rmtree(output_dir)