In [19]:
# Import required packages 
from __future__ import division, print_function # Imports from __future__ since we're running Python 2
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
random_state=0  # Shared seed: passed as random_state= to the train/validation split, the seeded classifiers and the CV folds below
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
rng = np.random.RandomState(seed=random_state)  # NumPy generator built from the same seed; not referenced in the cells visible here
%matplotlib inline

In [20]:
# Read the training split from a CSV located in the current working directory.
path_data = os.path.join(os.getcwd(), 'train_data_ongoing.csv')
flights_train = pd.read_csv(path_data, sep=',')

In [21]:
# Read the test split from a CSV located in the current working directory
# (path_data is rebound from the training path to the test path here).
path_data = os.path.join(os.getcwd(), 'test_data_ongoing.csv')
flights_test = pd.read_csv(path_data, sep=',')

In [22]:
# Collapse the target in both splits: every ARR_DELAY_GROUP value above 0 becomes 1,
# all other values are left untouched. Re-running the cell gives the same result.
for frame in (flights_train, flights_test):
    is_positive_group = frame['ARR_DELAY_GROUP'] > 0
    frame.loc[is_positive_group, 'ARR_DELAY_GROUP'] = 1

In [23]:
# Split each frame into a float feature matrix (every column except the target)
# and a label vector (the target column).
# np.float64 replaces the np.float alias, which NumPy deprecated in 1.20 and removed
# in 1.24; both denote the same 64-bit float dtype, so the arrays are unchanged.
X_train_full = flights_train.drop('ARR_DELAY_GROUP', axis=1).values.astype(np.float64) # Training features
y_train_full = flights_train['ARR_DELAY_GROUP'].values # Training labels
X_test = flights_test.drop('ARR_DELAY_GROUP', axis=1).values.astype(np.float64) # Test features
y_test = flights_test['ARR_DELAY_GROUP'].values # Test labels

In [24]:
# Hold out 20% of the training rows as a validation set, seeded with random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=random_state
)

In [25]:
# Fit the scaler on the training split only, then apply that one fitted transform
# to the training, validation and test features.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc, X_val_sc, X_test_sc = map(sc.transform, (X_train, X_val, X_test))

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy.stats import mode # Computes the mode of a signal
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score, make_scorer
# Scorer wrapper around Cohen's kappa; defined here but not used in this cell.
kappa_scorer = make_scorer(cohen_kappa_score)

# Candidate models to compare; names[i] is the display label for classifiers[i].
names = ["Logistic Regression", "Nearest Neighbors",
         "Decision Tree", "Random Forest",
         "Naive Bayes", "LDA", "QDA", "Neural Net (Multi-layer perceptron)"]
classifiers = [
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    KNeighborsClassifier(n_neighbors=10),
    DecisionTreeClassifier(max_depth=10),
    RandomForestClassifier(max_depth=10, n_estimators=50, random_state=random_state),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    MLPClassifier(random_state=random_state)]

# Validation metrics, keyed by model name.
ca_score = {} # Classification accuracy
ce_score = {} # Cross-entropy
kc_score = {} # Kappa score
print('Classification performance on validation set:')
for name, clf in zip(names, classifiers):
    clf.fit(X_train_sc, y_train)
    # Predict once per model and reuse the result. Previously the hard predictions
    # were computed twice (inside clf.score and again for kappa), which is
    # wasteful for slow predictors such as nearest neighbours.
    y_val_pred = clf.predict(X_val_sc)
    y_val_proba = clf.predict_proba(X_val_sc)
    # accuracy_score on the predictions is the same quantity clf.score() reports.
    ca_score[name] = accuracy_score(y_val, y_val_pred)
    ce_score[name] = log_loss(y_val, y_val_proba)
    kc_score[name] = cohen_kappa_score(y_val, y_val_pred)
    print ("{}, accuracy: {:.3f}, log-loss: {:.3f}, kappa score: {:.3f}".format(name, ca_score[name], ce_score[name], kc_score[name]))



Classification performance on validation set:
Logistic Regression, accuracy: 0.828, log-loss: 0.381, kappa score: 0.625
Nearest Neighbors, accuracy: 0.730, log-loss: 0.570, kappa score: 0.403
Decision Tree, accuracy: 0.823, log-loss: 0.642, kappa score: 0.601
Random Forest, accuracy: 0.817, log-loss: 0.433, kappa score: 0.582
Naive Bayes, accuracy: 0.634, log-loss: 12.455, kappa score: 0.349




LDA, accuracy: 0.827, log-loss: 0.392, kappa score: 0.621




QDA, accuracy: 0.659, log-loss: 11.561, kappa score: 0.261
Neural Net (Multi-layer perceptron), accuracy: 0.821, log-loss: 0.716, kappa score: 0.599


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# Grid search over the regularisation parameter C, scored by accuracy
# on 3 shuffled folds of the scaled training data.
# NOTE: despite the `svc` / `svc_clf` names, the estimator tuned here is a
# LogisticRegression; the names are kept as they are in case later cells use them.
cv = KFold(n_splits=3, shuffle=True, random_state=random_state)
svc = LogisticRegression(
    penalty='l2',
    fit_intercept=True,
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=4000
)
parameters = {'C': np.logspace(-3, 3, num=7)}  # seven log-spaced values from 1e-3 to 1e3
svc_clf = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy', cv=cv)
svc_clf.fit(X_train_sc, y_train)

# Report the winning C, its cross-validated score, and the score on the held-out validation set.
print("Best setting of C parameter for Logistic Regression: {}".format(svc_clf.best_params_["C"]))
print("Best cross-validated score: {:.3f}".format(svc_clf.best_score_))
print("Classification accuracy on validation set: {:.3f}".format(svc_clf.score(X_val_sc, y_val)))

Best setting of C parameter for Logistic Regression: 0.1
Best cross-validated score: 0.831
Classification accuracy on validation set: 0.828


In [28]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
def _classification_metrics(clf, X, y):
    """Score a fitted classifier on one data split, predicting only once.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix of the split.
    y : true labels of the split.

    Returns
    -------
    tuple
        ``(accuracy, log_loss, kappa, precision, recall, f1)``. Precision and
        recall are macro-averaged and f1 is weighted-averaged, matching the
        averaging this cell has always used.
    """
    y_pred = clf.predict(X)
    y_proba = clf.predict_proba(X)
    return (
        metrics.accuracy_score(y, y_pred),  # same quantity clf.score(X, y) reports
        log_loss(y, y_proba),
        cohen_kappa_score(y, y_pred),
        metrics.precision_score(y, y_pred, average='macro'),
        metrics.recall_score(y, y_pred, average='macro'),
        metrics.f1_score(y, y_pred, average='weighted'),
    )


# Scorer wrapper around Cohen's kappa; defined here but not used in this cell.
kappa_scorer = make_scorer(cohen_kappa_score)
names = ["Logistic Regression"]
# C=0.1 is the value the grid search above printed as the best setting.
classifiers = [LogisticRegression(penalty='l2'
        ,C=0.1
        ,fit_intercept=True
        ,solver='newton-cg'
        ,multi_class='multinomial'
        ,max_iter=4000)]
# Validation metrics, keyed by model name.
ca_score = {} # Classification accuracy
ce_score = {} # Cross-entropy
kc_score = {} # Kappa score
# The header mentions only the validation set, but test-set metrics are printed too.
print('Classification performance on validation set:')
for name, clf in zip(names, classifiers):
    clf.fit(X_train_sc, y_train)

    # Validation split: one prediction pass feeds all six metrics.
    (ca_score[name], ce_score[name], kc_score[name],
     precision, recall, f1) = _classification_metrics(clf, X_val_sc, y_val)
    print ("Validation {}, accuracy: {:.3f}, log-loss: {:.3f}, kappa score: {:.3f}".format(name, ca_score[name], ce_score[name], kc_score[name]))
    print ("Validation {},precision: {:.3f}, recall: {:.3f}, f1: {:.3f}".format(name, precision,recall,f1))

    # Test split: test1..test3 and precision/recall/f1 keep their original names
    # (and end up holding the test values, as before) in case later cells read them.
    test1, test2, test3, precision, recall, f1 = _classification_metrics(clf, X_test_sc, y_test)
    print ("Test {}, accuracy: {:.3f}, log-loss: {:.3f}, kappa score: {:.3f}".format(name, test1, test2, test3))
    print ("Test {},precision: {:.3f}, recall: {:.3f}, f1: {:.3f}".format(name, precision,recall,f1))
    


Classification performance on validation set:
Validation Logistic Regression, accuracy: 0.828, log-loss: 0.385, kappa score: 0.626
Validation Logistic Regression,precision: 0.805, recall: 0.825, f1: 0.831
Test Logistic Regression, accuracy: 0.832, log-loss: 0.378, kappa score: 0.633
Test Logistic Regression,precision: 0.809, recall: 0.828, f1: 0.835
