In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import pickle

## Load Data

In [2]:
filename_data = "data/aggregated.csv"
sample_fraction = 0.01

In [3]:
data = pd.read_csv(filename_data)
data = data.sample(frac=sample_fraction)
# print(data.isnull().sum())
print data.shape

(51294, 14)


In [4]:
filename_pickle = 'model/final_model.pkl'

## Preprocess Data

In [5]:
data = data.replace(np.NaN, 0)
# print(data.isnull().sum())

In [6]:
data['DepHr'] = data['CRS_DEP_TIME']//100

In [7]:
X_cols = ['MONTH','DAY_OF_WEEK','DepHr','UNIQUE_CARRIER','FL_NUM','ORIGIN','DEST','DISTANCE']
y_col = 'ARR_DEL15'
categorical_cols = X_cols[0:-1]
continous_cols = list(set(X_cols) - set(categorical_cols))

In [8]:
pca = PCA()
kbest = SelectKBest()
combined_features = FeatureUnion([("pca", pca),("select", kbest)])

In [9]:
data[X_cols].head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DepHr,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,DISTANCE
3225313,7.0,2.0,9.0,EV,4233.0,BTV,EWR,266.0
4677267,10.0,7.0,8.0,UA,1207.0,SNA,EWR,2434.0
494628,11.0,5.0,21.0,UA,1869.0,SFO,EWR,2565.0
3852135,9.0,1.0,17.0,B6,87.0,ALB,FLL,1204.0
5013152,1.0,4.0,16.0,AA,1979.0,BNA,CLT,328.0


## Pipeline Modeling with GridSearch Cross-validation

In [10]:
cv = 3
scoring = 'accuracy'

In [11]:
mapper = DataFrameMapper(
    [(categorical_col, LabelBinarizer()) for categorical_col in categorical_cols] +
    [(continous_col, StandardScaler()) for continous_col in continous_cols]
)

In [12]:
def pipelining ():
    clf_pipeline = Pipeline([
        ('mapper', mapper),
        ('features', combined_features),
        ('classifier', clf)
    ])
    clf_cv = GridSearchCV(clf_pipeline, cv=cv, param_grid=clf_parameters_grid, 
                                                  scoring=scoring
                                                 )
    clf_cv.fit(data[X_cols], data[y_col])
    return clf_cv

### 1. Logistic Regression (Ridge, Lasso)

In [13]:
clf = LogisticRegression()
clf_parameters_grid = {
    'features__pca__n_components':[2,3], 
    'features__select__k':[1,2], 
    'classifier__penalty':['l2']
}

In [14]:
clf_cv = pipelining()



In [15]:
print (clf_cv.best_params_)
print (clf_cv.best_score_)

{'features__pca__n_components': 2, 'features__select__k': 1, 'classifier__penalty': 'l2'}
0.82138261785


### 1.1 Save as final model

In [16]:
final_model = clf_cv.best_estimator_
with open(filename_pickle, 'wb') as handle:
    pickle.dump(final_model, handle)

### 2. Elastic Net

In [17]:
clf = SGDClassifier(loss='log',penalty='elasticnet')
clf_parameters_grid = {
#     'features__pca__n_components':[2], 
#     'features__select__k':[1], 

}

In [18]:
clf_cv = pipelining()



KeyboardInterrupt: 

In [19]:
print (clf_cv.best_params_)
print (clf_cv.best_score_)

{'features__pca__n_components': 2, 'features__select__k': 1, 'classifier__penalty': 'l2'}
0.82138261785


### 3. Random Forest Classifier

In [20]:
clf = RandomForestClassifier()
clf_parameters_grid = {
    'features__pca__n_components':[2], 
    'features__select__k':[1], 
    'classifier__max_depth':[20,50]
}

In [21]:
clf_cv = pipelining()



In [22]:
print (clf_cv.best_params_)
print (clf_cv.best_score_)

{'features__pca__n_components': 2, 'features__select__k': 1, 'classifier__max_depth': 20}
0.815787421531


### 4. AdaBoost Classifier

In [23]:
clf = AdaBoostClassifier()
clf_parameters_grid = {
    'features__pca__n_components':[2], 
    'features__select__k':[1], 
    
}

In [24]:
clf_cv = pipelining()



In [25]:
print (clf_cv.best_params_)
print (clf_cv.best_score_)

{'features__pca__n_components': 2, 'features__select__k': 1}
0.821207158732


### 5. Support Vector Machine Classifier

In [26]:
clf = SVC()
clf_parameters_grid = {
    'features__pca__n_components':[2], 
    'features__select__k':[1], 
    
}

In [27]:
clf_cv = pipelining()



KeyboardInterrupt: 

In [28]:
print (clf_cv.best_params_)
print (clf_cv.best_score_)

{'features__pca__n_components': 2, 'features__select__k': 1}
0.821207158732
