In [177]:
########################################
## HACKATHON 1
########################################

import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
)
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

In [78]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train.info()
#test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419487 entries, 0 to 419486
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID                     419487 non-null  int64  
 1   DATE_DEPARTURE_UTC     419487 non-null  object 
 2   DATE_ARRIVAL_UTC       419487 non-null  object 
 3   DATE_DEPARTURE_LCL     419487 non-null  object 
 4   DATE_ARRIVAL_LCL       419487 non-null  object 
 5   ORIGIN                 419487 non-null  object 
 6   ORIGIN_AIRPORT_ID      419487 non-null  int64  
 7   DEST                   419487 non-null  object 
 8   DEST_AIRPORT_ID        419487 non-null  int64  
 9   TAIL_NUM               419487 non-null  object 
 10  OP_CARRIER             419487 non-null  object 
 11  OP_CARRIER_AIRLINE_ID  419487 non-null  int64  
 12  OP_CARRIER_FL_NUM      419487 non-null  int64  
 13  DISTANCE               401869 non-null  float64
 14  CANCELLED              419487 non-nu

In [22]:
#analizing the variables
train.describe()

#quantity of missing per feature - DISTANCE, DEP_DEL15
print(train.isnull().sum())

#categories per feature
train.nunique()

ID                            0
DATE_DEPARTURE_UTC            0
DATE_ARRIVAL_UTC              0
DATE_DEPARTURE_LCL            0
DATE_ARRIVAL_LCL              0
ORIGIN                        0
ORIGIN_AIRPORT_ID             0
DEST                          0
DEST_AIRPORT_ID               0
TAIL_NUM                      0
OP_CARRIER                    0
OP_CARRIER_AIRLINE_ID         0
OP_CARRIER_FL_NUM             0
DISTANCE                  17618
CANCELLED                     0
DEP_DEL15                146820
ARR_DEL15                     0
dtype: int64


ID                       419487
DATE_DEPARTURE_UTC        29661
DATE_ARRIVAL_UTC          30137
DATE_DEPARTURE_LCL        26687
DATE_ARRIVAL_LCL          28477
ORIGIN                      351
ORIGIN_AIRPORT_ID           351
DEST                        350
DEST_AIRPORT_ID             350
TAIL_NUM                   5376
OP_CARRIER                   17
OP_CARRIER_AIRLINE_ID        17
OP_CARRIER_FL_NUM          6715
DISTANCE                   1468
CANCELLED                     1
DEP_DEL15                     2
ARR_DEL15                     2
dtype: int64

In [199]:
#FEATURE ENGINEERING - Part 1

    #convert timestamp variables
    #create new features:
        #flag of holiday
        #flag day befor a holiday
        #day of the week
        #period of the day - morning/afternoon/night
        #flight duration = DATE_ARRIVAL_UTC - DATE_DEPARTURE_UTC
        #stardarized flight duration
    #transform OP_CARRIER in a categorical variable
    #drop of the analysis: CANCELLED, DATE_DEPARTURE_LCL, DATE_ARRIVAL_LCL, TAIL_NUM
    #deal with missing values: categorize DEP_DEL15 using missing as a category

def feature_engineering(_df):
    mod_df = _df.copy()
    
    mod_df.drop(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'CANCELLED','TAIL_NUM'], inplace = True, axis = 1)
    
    #convert timestamp variables
    mod_df['DATE_ARRIVAL_UTC_ts'] = pd.to_datetime(mod_df['DATE_ARRIVAL_UTC'])
    mod_df['DATE_DEPARTURE_UTC_ts'] = pd.to_datetime(mod_df['DATE_DEPARTURE_UTC'])
    mod_df['DATE_ARRIVAL_LCL_ts'] = pd.to_datetime(mod_df['DATE_ARRIVAL_LCL'])
    mod_df['DATE_DEPARTURE_LCL_ts'] = pd.to_datetime(mod_df['DATE_DEPARTURE_LCL'])
    
    #weekday
    mod_df['WeekDay_DEPARTURE_UTC'] = pd.to_datetime(mod_df['DATE_DEPARTURE_UTC_ts']).dt.weekday
    mod_df['WeekDay_ARRIVAL_UTC'] = pd.to_datetime(mod_df['DATE_ARRIVAL_UTC_ts']).dt.weekday
    
    #transform OP_CARRIER in a categorical variable
    mod_df['OP_CARRIER'] = mod_df['OP_CARRIER'].astype('category')
    
    #deal with missing values
    mod_df['DEP_DEL15_cat'] = mod_df['DEP_DEL15'].fillna(2).astype('category').map({1: 'delayed', 0: 'not delayed', 2: 'unknown'})
    mod_df = mod_df.join(pd.get_dummies(mod_df['DEP_DEL15_cat']))
    
    #duration of the flight
    mod_df['Duration_UTC'] = (mod_df['DATE_ARRIVAL_UTC_ts'] - mod_df['DATE_DEPARTURE_UTC_ts']).dt.seconds/60
    
    scaler = StandardScaler()
    mod_df['Duration_UTC_Standard'] = scaler.fit_transform(pd.DataFrame(mod_df['Duration_UTC']))

    #Federal holidays
    mod_df['NewYear_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('01-01').map({True: 1, False: 0})
    mod_df['MartinLutherKingDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('01-20').map({True: 1, False: 0})
    
    return mod_df
    

In [200]:
#FEATURE ENGINEERING - Part 2
    #deal with missing values: use median to DISTANCE
class ValueImputer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        self.fills = X.median(axis=0).squeeze()
        return self
        
    def transform(self, X):
        return pd.DataFrame(X).fillna(self.fills)

#FEATURE ENGINEERING - Part 3
    #stardarized DISTANCE
def Standard_Distance(_df, feature):
    mod_df = _df.copy()
    
    scaler = StandardScaler()
    mod_df['Distance_Standard'] = scaler.fit_transform(pd.DataFrame(mod_df[feature]))
    
    return mod_df


In [201]:
## APPLAYING THE PREVIOUS STEPS

train_target = train['ARR_DEL15']
train_adj = feature_engineering(train)

imputer = ValueImputer()
train_adj['DISTANCE_noNaN'] = imputer.fit_transform(train_adj['DISTANCE'])
train_adj = Standard_Distance(train_adj, 'DISTANCE_noNaN')

#Sorting to do split into train and test set
train_adj = train_adj.sort_values(by="DATE_DEPARTURE_UTC_ts")

## SPLIT TRAIN SET
X_train, X_test, y_train, y_test = train_test_split(train_adj, train_target, test_size=0.3, shuffle=False)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(293640, 29)
(293640,)
(125847, 29)
(125847,)


In [148]:
class SelectColumns(BaseEstimator,TransformerMixin):
    
    def __init__(self, cols=[]):
        self.cols = cols
        
    def fit(self, X=None, y=None, **fit_params):
        return self
    
    def transform(self, data):
        X = data.copy()
        X = X[self.cols]   
        return X
    
#select_columns(X_train, features_baseline)


In [171]:
##PIPELINE
    #select features to modeling
    #select model

features_baseline = ['NewYear_Flag', 'MartinLutherKingDay_Flag','not delayed', 'delayed', 'unknown', 
                     'Duration_UTC_Standard', 'Distance_Standard']

pipeline1 = Pipeline([("Select_Columns", SelectColumns(cols = features_baseline)),
                      ("Model", LogisticRegression())])

pipeline1.fit(X_train, y_train)
print('Train: ', pipeline1.score(X_train, y_train)) 
print('Test : ', pipeline1.score(X_test, y_test))

Train:  0.8519343413703855
Test :  0.8512717824024411


In [174]:
pipeline2 = Pipeline([("Select_Columns", SelectColumns(cols = features_baseline)),
                      ("Model", DecisionTreeClassifier())])

pipeline2.fit(X_train, y_train)
print('Train: ', pipeline2.score(X_train, y_train)) 
print('Test : ', pipeline2.score(X_test, y_test))

Train:  0.897309630840485
Test :  0.8066461655820163


In [175]:
pipeline3 = Pipeline([("Select_Columns", SelectColumns(cols = features_baseline)),
                      ("Model", RandomForestClassifier())])

pipeline3.fit(X_train, y_train)
print('Train: ', pipeline3.score(X_train, y_train)) 
print('Test : ', pipeline3.score(X_test, y_test))

Train:  0.89727898106525
Test :  0.8137579759549294


In [176]:
pipeline4 = Pipeline([("Select_Columns", SelectColumns(cols = features_baseline)),
                      ("Model", GradientBoostingClassifier())])

pipeline4.fit(X_train, y_train)
print('Train: ', pipeline4.score(X_train, y_train)) 
print('Test : ', pipeline4.score(X_test, y_test))

Train:  0.8519581800844571
Test :  0.8512717824024411


In [181]:
pipeline5 = Pipeline([("Select_Columns", SelectColumns(cols = features_baseline)),
                      ("Model", KNeighborsClassifier(n_neighbors=10))])

pipeline5.fit(X_train, y_train)
print('Train: ', pipeline5.score(X_train, y_train)) 
print('Test : ', pipeline5.score(X_test, y_test))

Train:  0.8522714888979703
Test :  0.8502705666404443


In [192]:
X_train.columns

Index(['ID', 'DATE_DEPARTURE_UTC', 'DATE_ARRIVAL_UTC', 'DATE_DEPARTURE_LCL',
       'DATE_ARRIVAL_LCL', 'ORIGIN', 'DEST', 'OP_CARRIER',
       'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'DISTANCE', 'DEP_DEL15',
       'ARR_DEL15', 'DATE_ARRIVAL_UTC_ts', 'DATE_DEPARTURE_UTC_ts',
       'DATE_ARRIVAL_LCL_ts', 'DATE_DEPARTURE_LCL_ts', 'WeekDay_DEPARTURE_UTC',
       'WeekDay_ARRIVAL_UTC', 'DEP_DEL15_cat', 'not delayed', 'delayed',
       'unknown', 'Duration_UTC', 'Duration_UTC_Standard', 'NewYear_Flag',
       'MartinLutherKingDay_Flag', 'DISTANCE_noNaN', 'Distance_Standard'],
      dtype='object')

In [211]:
from category_encoders import OneHotEncoder, TargetEncoder
ohe = OneHotEncoder(use_cat_names=True, handle_missing = 'indicator', handle_unknown='indicator')

ohe.fit(train["DEP_DEL15"])

x = ohe.transform(train["DEP_DEL15"])


In [232]:
o = OneHotEncoder(use_cat_names=True, handle_unknown='indicator')
X = train[['DEP_DEL15']].squeeze()
o.fit(X)
X_ohe = o.transform(X) #Universe_-1 colummn is create for unseen categories
X_ohe

Unnamed: 0,DEP_DEL15
0,0.0
1,
2,
3,0.0
4,
...,...
419482,0.0
419483,0.0
419484,0.0
419485,


In [236]:
ohe2 = OneHotEncoder(handle_unknown='indicator',handle_missing = 'indicator')
ohe2.fit_transform(X)

Unnamed: 0,DEP_DEL15
0,0.0
1,
2,
3,0.0
4,
...,...
419482,0.0
419483,0.0
419484,0.0
419485,


In [237]:
OneHotEncoder?