In [1]:
import os
import sys

import joblib
import pandas as pd
from sklearn.metrics import f1_score

# Make the project's src/ packages importable before the local imports below.
sys.path.append('../src')

from data.make_dataset import load_data, preprocess
from models.predict_model import load_model, predict
from models.train_model import train_model
from tests.statistical_test import perform_stat_tests
from visualization.visualize import plot_monthly_performance, plot_ks_results



In [2]:
# Name of the target column handed to preprocessing.
target_col = "high_tip"

# Load and preprocess the January 2020 trip data (used for training).
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-01.parquet'
train_data, features = preprocess(load_data(train_url), target_col=target_col)


In [3]:
# Train the model.
# The recorded run of this cell failed with
#   FileNotFoundError: ... 'models/random_forest.joblib'
# which is a path relative to the notebook's working directory. Presumably
# train_model writes the fitted model there (TODO confirm in
# models/train_model.py), so make sure that directory exists first.
os.makedirs('models', exist_ok=True)
model = train_model(train_data, features, target_col)

FileNotFoundError: [Errno 2] No such file or directory: 'models/random_forest.joblib'

In [None]:
# Result containers, filled in per month by the evaluation loop:
# F1 scores and statistical-test outputs.
monthly_performance, ks_results = {}, {}

In [None]:
# Months of 2020 to analyze, as two-digit strings used in the data file names.
months = [
    '02',
    '03',
    '04',
]

In [None]:
# For each month: load the test data, predict, score, and run the statistical tests.
for month in months:
    month_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-{month}.parquet'
    test_data, _ = preprocess(load_data(month_url), target_col=target_col)

    # F1 of the model's predictions against this month's target column.
    predictions = predict(test_data, model, features)
    f1 = f1_score(test_data[target_col], predictions)
    monthly_performance[month] = f1

    # Statistical tests between the training data and this month's data over `features`.
    ks_stats = perform_stat_tests(train_data, test_data, features)
    ks_results[month] = ks_stats

    print(f'Month: 2020-{month}, F1 Score: {f1}')
    for feature, ks_pair in ks_stats.items():
        ks_stat, ks_pvalue = ks_pair
        print(f'Feature: {feature}, KS Statistic: {ks_stat}, P-value: {ks_pvalue}')


In [None]:
# Visualize the monthly performance (the per-month F1 scores collected in monthly_performance).
plot_monthly_performance(monthly_performance)

In [None]:
# Save the trained model.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing. joblib is imported in the top import cell.
model_path = '../models/random_forest.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

In [None]:
# Visualize the results of the statistical tests collected in ks_results.
plot_ks_results(ks_results, months, features)

In [None]:
# NOTE(review): same call, with identical arguments, as the preceding cell;
# likely a leftover duplicate that can be removed.
plot_ks_results(ks_results, months, features)

In [None]:
# Results recap: for every month, print its F1 score followed by the
# KS statistic and p-value stored for each feature.
for month in monthly_performance:
    f1 = monthly_performance[month]
    print(f'Month: 2020-{month}, F1 Score: {f1}')
    print("KS Test Results:")
    for feature, ks_pair in ks_results[month].items():
        ks_stat, ks_pvalue = ks_pair
        print(f'Feature: {feature}, KS Statistic: {ks_stat}, P-value: {ks_pvalue}')

In [None]:
# Comments on the results
# Here we analyze the possible reasons behind the variation in the F1 Score across months.
# The statistical tests (KS test) help us determine whether the distribution of the features has changed
# significantly between months, which could explain the degradation in the model's performance.
