In [None]:
import sys
from pathlib import Path

# Add ../src to the import path ahead of the project-local imports below.
sys.path.append('../src')

import joblib
import pandas as pd
from sklearn.metrics import f1_score

from data.make_dataset import load_data, preprocess
from models.train_model import train_model
from models.predict_model import load_model, predict
from visualization.visualize import plot_monthly_performance


In [None]:
# Target column name, shared by preprocessing, training and scoring.
target_col = "high_tip"

# Load the January 2020 file and preprocess it into the training set.
train_data = preprocess(
    load_data('../data/raw/yellow_tripdata_2020-01.parquet'),
    target_col=target_col,
)


In [None]:
# Input columns handed to train_model and predict; order is preserved.
features = [
    "pickup_weekday",
    "pickup_hour",
    "work_hours",
    "pickup_minute",
    "passenger_count",
    "trip_distance",
    "trip_time",
    "trip_speed",
    "PULocationID",
    "DOLocationID",
    "RatecodeID",
]


In [None]:
# Fit the model on the preprocessed training data using the selected
# feature columns and the target column.
model = train_model(
    train_data,
    features,
    target_col,
)


In [None]:
# Score the trained model on each later month: load and preprocess that
# month's file, predict, and record the F1 score keyed by month number.
monthly_performance = {}
for month in ('02', '03', '04'):
    month_path = f'../data/raw/yellow_tripdata_2020-{month}.parquet'
    test_data = preprocess(load_data(month_path), target_col=target_col)
    predictions = predict(test_data, model, features)
    monthly_performance[month] = f1 = f1_score(test_data[target_col], predictions)
    print(f'Month: 2020-{month}, F1 Score: {f1}')


In [None]:
# Plot the per-month F1 scores collected in monthly_performance.
plot_monthly_performance(monthly_performance)

In [None]:
# Persist the trained model to ../models/random_forest.joblib.
# The parent directory is created first so that a checkout without a
# models/ folder does not make joblib.dump fail with FileNotFoundError.
# joblib and Path come from the notebook's imports cell.
model_path = '../models/random_forest.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)