<a href="https://colab.research.google.com/github/degartHub/nocountry-h12-25-equipo27-datascience/blob/main/main_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Cargando paquetes

In [1]:
!pip install pandas scikit-learn joblib



#Inicio del programa

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Crear datos sintéticos para 100 vuelos
np.random.seed(42)  # Para reproducibilidad

aerolineas = ['Latam', 'Avianca', 'Copa', 'American', 'Delta']
origenes = ['Santiago', 'Bogotá', 'Panamá', 'Miami', 'Nueva York']
destinos = ['Bogotá', 'Panamá', 'Miami', 'Nueva York', 'Santiago']

data = {
    'aerolinea': np.random.choice(aerolineas, 100),
    'origen': np.random.choice(origenes, 100),
    'destino': np.random.choice(destinos, 100),
    'hora_programada': [datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 365), hours=np.random.randint(0, 24)) for _ in range(100)],
    'distancia_km': np.random.randint(500, 5000, 100),
    'retrasado': np.random.choice([0, 1], 100, p=[0.7, 0.3])  # 30% de vuelos retrasados
}

df = pd.DataFrame(data)
print(df.head())  # Muestra las primeras filas para verificar

  aerolinea    origen     destino     hora_programada  distancia_km  retrasado
0  American     Miami      Panamá 2023-02-06 23:00:00          2016          0
1     Delta  Santiago       Miami 2023-12-15 16:00:00          3646          0
2      Copa     Miami      Bogotá 2023-10-29 20:00:00           912          1
3     Delta    Bogotá      Bogotá 2023-04-05 02:00:00          1228          1
4     Delta  Santiago  Nueva York 2023-07-07 00:00:00          3848          0


#Cargar funcionalidades

In [4]:
# Extraer features
df['hora'] = df['hora_programada'].dt.hour
df['dia_semana'] = df['hora_programada'].dt.weekday
df['tipo_aerolinea'] = df['aerolinea'].apply(lambda x: 1 if x in ['American', 'Delta'] else 0)

# Seleccionar features y target
features = ['hora', 'dia_semana', 'tipo_aerolinea', 'distancia_km']
X = df[features]
y = df['retrasado']

print(X.head())  # Ver features

   hora  dia_semana  tipo_aerolinea  distancia_km
0    23           0               1          2016
1    16           4               1          3646
2    20           6               0           912
3     2           2               1          1228
4     0           4               1          3848


#Validar el modelo con train/split

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # O from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Dividir datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar modelo
model = RandomForestClassifier(n_estimators=100, random_state=42)  # O LogisticRegression()
model.fit(X_train, y_train)

# Predecir y validar
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.88      0.94      0.91        16
           1       0.67      0.50      0.57         4

    accuracy                           0.85        20
   macro avg       0.77      0.72      0.74        20
weighted avg       0.84      0.85      0.84        20



#Guardar el modelo con joblib.dump y cargarlo

In [7]:
import joblib

# Guardar el modelo
joblib.dump(model, 'modelo_vuelos.joblib')
print("Modelo guardado como 'modelo_vuelos.joblib'")

Modelo guardado como 'modelo_vuelos.joblib'


#Simulación de carga de modelo

In [8]:
# Cargar el modelo
modelo_cargado = joblib.load('modelo_vuelos.joblib')

# Probar con datos de prueba
y_pred_cargado = modelo_cargado.predict(X_test)
print("Accuracy con modelo cargado:", accuracy_score(y_test, y_pred_cargado))  # Debe ser igual

Accuracy con modelo cargado: 0.85


#Persistirlo en google

In [9]:
from google.colab import drive
drive.mount('/content/drive')
joblib.dump(model, '/content/drive/MyDrive/modelo_vuelos.joblib')

Mounted at /content/drive


['/content/drive/MyDrive/modelo_vuelos.joblib']