In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


In [2]:
from xgboost import XGBClassifier
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
#from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score

In [3]:
data = pd.read_csv('../../raw_data/latlong.csv')

In [4]:
# NO USAR PARA MANTENER EL DF COMPLETO
data = data.sample(50000)

In [5]:
data = data.dropna()

In [6]:
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,CANCELLED,ORIGIN_LONGITUDE,ORIGIN_LATITUDE,DESTINATION_LONGITUDE,DESTINATION_LATITUDE
15188,3,13,5,AS,GEG,SEA,65.0,224,705,0,-117.53384,47.61986,-122.30931,47.44898
356951,1,29,4,DL,ATL,MSP,160.0,907,2320,0,-84.42694,33.64044,-93.21692,44.88055
681677,9,17,4,EV,LGA,RIC,104.0,292,1225,0,-73.87261,40.77724,-77.31967,37.50517
551163,7,23,4,AA,SAN,JFK,332.0,2446,1617,0,-117.18966,32.73356,-73.77893,40.63975
551315,6,28,7,B6,SLC,JFK,262.0,1990,500,0,-111.97777,40.78839,-73.77893,40.63975


In [7]:
X = data.drop(columns=['CANCELLED', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'], axis=1)
y = data['CANCELLED']

variables_numericas = X.select_dtypes(include=['number']).columns
variables_numericas

Index(['MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_TIME', 'DISTANCE',
       'SCHEDULED_ARRIVAL', 'ORIGIN_LONGITUDE', 'ORIGIN_LATITUDE',
       'DESTINATION_LONGITUDE', 'DESTINATION_LATITUDE'],
      dtype='object')

In [8]:
X.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ORIGIN_LONGITUDE,ORIGIN_LATITUDE,DESTINATION_LONGITUDE,DESTINATION_LATITUDE
15188,3,13,5,AS,65.0,224,705,-117.53384,47.61986,-122.30931,47.44898
356951,1,29,4,DL,160.0,907,2320,-84.42694,33.64044,-93.21692,44.88055
681677,9,17,4,EV,104.0,292,1225,-73.87261,40.77724,-77.31967,37.50517
551163,7,23,4,AA,332.0,2446,1617,-117.18966,32.73356,-73.77893,40.63975
551315,6,28,7,B6,262.0,1990,500,-111.97777,40.78839,-73.77893,40.63975


In [9]:
# Split data into train, test and validation sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [10]:
variables_cat = X.select_dtypes(include=['object']).columns
variables_cat

Index(['AIRLINE'], dtype='object')

In [11]:
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('standard_scaler', StandardScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value='NA')),
    ('OneHotEncoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))])
    


In [12]:
# Impute then scale numerical values: 
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, variables_numericas),
    ('cat_transformer', cat_transformer, variables_cat)],
    remainder='passthrough')
preprocessor

preprocessor

In [13]:
# Aplicar el pipeline de data
X_preprocesado = preprocessor.fit_transform(X)


In [14]:
X_preprocesado

array([[-0.89905323, -0.29817837,  0.55968604, ...,  0.        ,
         0.        ,  0.        ],
       [-1.48356528,  1.51142969,  0.06275283, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.8544829 ,  0.15422364,  0.06275283, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.48356528, -0.07197737,  0.06275283, ...,  0.        ,
         0.        ,  0.        ],
       [-1.19130926,  1.28522868,  0.55968604, ...,  0.        ,
         0.        ,  0.        ],
       [-0.89905323,  0.38042465,  0.06275283, ...,  0.        ,
         0.        ,  0.        ]])

In [15]:
model = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=100)

pipeline_XGBoost = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model),  
    ]
)

In [16]:
cv_results = cross_val_score(pipeline_XGBoost, X_train, y_train, cv=5, scoring='precision')

In [17]:
# Calcula la precisión promedio en el cross validarte
precision_mean = cv_results.mean()

precision_mean

0.909712918660287

In [18]:
pipeline_XGBoost.fit(X_train, y_train)


In [19]:
y_pred = pipeline_XGBoost.predict(X_test)


In [20]:
# Calcula la precisión en el conjunto de prueba
precision = precision_score(y_test, y_pred)
precision


0.8604651162790697

In [22]:
import pickle

# Export Pipeline as pickle file

with open("modelos/pipeline_XGBoost_1.pkl", "wb") as file:
    pickle.dump(pipeline_XGBoost, file)

#with open("modelos/pipeline_preproc.pkl", "wb") as file:
#    pickle.dump(preprocessor, file)

