# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# The "seaborn" style sheet was renamed to "seaborn-v0_8" in matplotlib 3.6 and
# the old name was removed in 3.8, so use whichever name this installation offers.
seaborn_style = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
plt.style.use(seaborn_style)
plt.rcParams["figure.figsize"] = (10, 6)  # default figure size (width, height)

In [2]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, brier_score_loss, precision_score, recall_score, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [3]:
from time import time

In [4]:
import xgboost as xgb

# Data

In [5]:
# Read the training features, the training labels and the test features,
# in that order (paths are relative to the working directory).
X, y, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.data.csv",
        "data/train.labels.csv",
        "data/test.data.csv",
    )
)

In [6]:
# Convert the loaded frames to plain NumPy arrays.
# np.asarray (unlike `.values`) also accepts an ndarray, so re-running this cell
# after the names were already rebound to arrays is a harmless no-op instead of
# an AttributeError.
X = np.asarray(X)
y = np.asarray(y).ravel()  # flatten the labels to a 1-D vector
test_data = np.asarray(test_data)

In [7]:
# Standardize every feature column to zero mean and unit variance
# (the keyword arguments spell out scikit-learn's defaults).
X_scaled = preprocessing.scale(X, axis=0, with_mean=True, with_std=True)

# Models

## LDA & QDA

In [113]:
# 5-fold stratified cross-validation of LDA on the unscaled features;
# prints the held-out log-loss of each fold.
skf = StratifiedKFold(n_splits=5)
for train_idx, test_idx in skf.split(X, y):
    lda = LinearDiscriminantAnalysis().fit(X[train_idx, :], y[train_idx])
    y_pred = lda.predict_proba(X[test_idx, :])
    fold_loss = log_loss(y[test_idx], y_pred)
    print(fold_loss)

0.17240538893180649
0.17850714284219815
0.16548403451670984
0.17610640403466382
0.16243686460127657


In [114]:
# 5-fold stratified cross-validation of QDA (reg_param=0.01) on the unscaled
# features; prints the held-out log-loss of each fold.
skf = StratifiedKFold(n_splits=5)
for train_idx, test_idx in skf.split(X, y):
    qda = QuadraticDiscriminantAnalysis(reg_param=0.01).fit(X[train_idx, :], y[train_idx])
    y_pred = qda.predict_proba(X[test_idx, :])
    fold_loss = log_loss(y[test_idx], y_pred)
    print(fold_loss)

0.16961958036289376
0.15306646473014543
0.15122028501593165
0.1561026532556051
0.1351773363234211


## XGBoost

In [9]:
# Hold out 20% of the data for evaluation. The fixed random_state makes the
# split, and therefore the printed log-loss, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=1000,
                       silent=False, objective='binary:logistic', n_jobs=4)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
print(log_loss(y_test, y_pred))

0.10298208615287861


In [11]:
# Hold out 20% of the data for the final score. The fixed random_state keeps the
# split identical between runs so grid-search results can be compared.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# The model itself is single-threaded; parallelism comes from the 4 grid-search workers.
clf = xgb.XGBClassifier(n_jobs=1)
# 3 * 2 * 2 * 1 * 3 = 36 candidates, each evaluated with 3-fold CV (108 fits).
param_grid = {"max_depth": [3,5,6],
             "learning_rate": [0.1, 0.01],
             "n_estimators": [1000, 3000],
             "objective": ["binary:logistic"],
             "reg_lambda": [0, 0.001, 0.01]}
gs = GridSearchCV(clf, param_grid, scoring="neg_log_loss", cv=3, verbose=2, n_jobs=4)
gs.fit(X_train, y_train)
# Score of the refitted best candidate on the hold-out set, using the
# "neg_log_loss" scorer (so higher is better).
gs.score(X_test, y_test)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0 
[CV] learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0 
[CV] learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0 
[CV] learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0.001 
[CV]  learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0.001, total= 1.2min
[CV] learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0.001 
[CV]  learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0, total= 1.2min
[CV]  learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0, total= 1.2min
[CV] learning_rate=0.1, max_depth=3, n_estimators=1000, objective=binary:logistic, reg_lambda=0.001 
[CV]

KeyboardInterrupt: 

In [None]:
# Three stratified random 90/10 splits. The fixed random_state makes the splits,
# and therefore the printed log-losses, reproducible.
skf = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=0)
for train, test in skf.split(X, y):
    clf = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=1000,
                            silent=False, objective='binary:logistic', n_jobs=4)
    clf.fit(X[train,:], y[train])
    y_pred = clf.predict_proba(X[test,:])
    print(log_loss(y[test], y_pred))

# Calibration

In [180]:
def plot_calibration_curve(est, name, X_train, X_test, y_train, y_test, plot=True):
    """Compare an estimator with and without probability calibration.

    Four classifiers are fitted on ``(X_train, y_train)`` and evaluated on
    ``(X_test, y_test)``: a logistic-regression baseline, ``est`` itself, and
    ``est`` wrapped in isotonic and in sigmoid ``CalibratedClassifierCV``
    (both with ``cv=2``).  For each one the precision, recall, log-loss and
    the fit + predict wall time are printed.  With ``plot=True`` a reliability
    curve (labelled with the Brier score) and a histogram of the predicted
    positive-class scores are drawn on matplotlib figure 1.

    Parameters
    ----------
    est : estimator
        Classifier providing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``.  It is fitted in place.
    name : str
        Label for ``est`` in the printed report and the plot legends.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Evaluation features and labels.  The larger label value in ``y_test``
        is treated as the positive class for the Brier score
        (assumes a binary problem -- TODO confirm for other uses).
    plot : bool, default True
        Whether to draw the calibration plots.

    Returns
    -------
    tuple
        ``(best_name, best_logloss, best_time)``: the label, test log-loss and
        fit + predict time in seconds of the classifier with the lowest
        log-loss.  ``best_name`` and ``best_time`` are ``None`` if no
        classifier produced a finite log-loss.
    """
    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

    # Logistic regression with no calibration as baseline
    lr = LogisticRegression(C=1., solver='lbfgs')

    if plot:
        plt.figure(1, figsize=(10, 10))
        ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
        ax2 = plt.subplot2grid((3, 1), (2, 0))
        ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    # Initialise all three "best" trackers so the return statement is always
    # well-defined, whatever log-loss values are observed.
    best_clf = None
    best_logloss = float("inf")
    best_time = None

    # Positive label for the Brier score comes from the evaluation labels that
    # were passed in, not from a notebook-level global.
    pos_label = np.max(y_test)

    candidates = [(lr, 'Logistic'),
                  (est, name),
                  (isotonic, name + ' + Isotonic'),
                  (sigmoid, name + ' + Sigmoid')]

    # A distinct loop variable keeps the `name` argument from being clobbered.
    for clf, clf_name in candidates:
        t0 = time()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        elapsed = time() - t0  # fit + predict wall time

        if hasattr(clf, "predict_proba"):
            prob_pos = clf.predict_proba(X_test)[:, 1]
        else:  # use decision function, min-max scaled into [0, 1]
            prob_pos = clf.decision_function(X_test)
            prob_pos = \
                (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

        clf_score = brier_score_loss(y_test, prob_pos, pos_label=pos_label)
        print("%s:" % clf_name)
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        logloss = log_loss(y_test, prob_pos)
        print("\tLog_loss: %1.3f" % logloss)
        print("\tTime: %1.2f\n" % elapsed)

        if logloss < best_logloss:
            best_logloss = logloss
            best_clf = clf_name
            best_time = elapsed

        fraction_of_positives, mean_predicted_value = \
            calibration_curve(y_test, prob_pos, n_bins=10)

        if plot:
            ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                     label="%s (%1.3f)" % (clf_name, clf_score))

            ax2.hist(prob_pos, range=(0, 1), bins=10, label=clf_name,
                     histtype="step", lw=2)

    if plot:
        ax1.set_ylabel("Fraction of positives")
        ax1.set_ylim([-0.05, 1.05])
        ax1.legend(loc="lower right")
        ax1.set_title('Calibration plots  (reliability curve)')

        ax2.set_xlabel("Mean predicted value")
        ax2.set_ylabel("Count")
        ax2.legend(loc="upper center", ncol=2)

        plt.tight_layout()

    # Report the timing of the best classifier, not of whichever ran last.
    return best_clf, best_logloss, best_time

In [197]:
# Hold out 10% of the data for evaluation. The fixed random_state makes the
# split, and therefore the printed log-loss, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
clf = xgb.XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=1000,
                       silent=False, objective='binary:logistic', n_jobs=4)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
print(log_loss(y_test, y_pred))

0.10843816575314781


In [201]:
# Wrap `clf` in 3-fold sigmoid calibration, fit it on the train split currently
# in scope and print the log-loss on the matching test split.
cal = CalibratedClassifierCV(clf, cv=3, method="sigmoid").fit(X_train, y_train)
y_pred = cal.predict_proba(X_test)
calibrated_loss = log_loss(y_test, y_pred)
print(calibrated_loss)

0.11870577046594617


In [203]:
# k-nearest-neighbours classifier (10 neighbours, p=2, all cores) and the same
# estimator wrapped in 2-fold isotonic calibration.
best_model_raw = KNeighborsClassifier(n_jobs=-1, n_neighbors=10, p=2)
best_model = CalibratedClassifierCV(best_model_raw, method='isotonic', cv=2)

In [207]:
# Score the uncalibrated k-NN model on three stratified random 90/10 splits.
# The fixed random_state makes the splits reproducible, so any other cell that
# builds its splitter with the same arguments is scored on identical folds.
skf = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=0)
for train, test in skf.split(X, y):
    X_train = X[train,:]
    y_train = y[train]
    X_test = X[test,:]
    y_test = y[test]
    best_model_raw.fit(X_train, y_train)
    y_pred = best_model_raw.predict_proba(X_test)
    print(log_loss(y_test, y_pred))

0.09674013165354484
0.19575744578122165
0.15338676237061843


In [205]:
skf = StratifiedShuffleSplit(n_splits=3, test_size=0.1)
for train, test in skf.split(X, y):
    X_train = X[train,:]
    y_train = y[train]
    X_test = X[test,:]
    y_test = y[test]
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict_proba(X_test)
    print(log_loss(y_test, y_pred))

0.09508256646248019
0.07537304021232892
0.08749876300402494


# Submission

In [149]:
# Fit the XGBoost model on the complete training set; the fitted estimator is
# left as the cell's last expression so its repr is displayed.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=1000,
    silent=False,
    objective='binary:logistic',
    n_jobs=4,
)
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=4, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

In [153]:
# Per-class probability predictions of the fitted `clf` for the test features.
test_data_pred = clf.predict_proba(test_data)

In [161]:
# Display the second probability column (index 1) of the test predictions.
test_data_pred[:,1]

array([4.8711992e-04, 9.2626095e-01, 3.6046476e-04, ..., 9.9979252e-01,
       9.9992740e-01, 9.3594182e-01], dtype=float32)

In [177]:
# Second predict_proba column is written out as "ProbFemale"
# (presumably the positive class is "female" -- confirm against the label encoding).
prob_female = test_data_pred[:, 1]
# Ids are 1-based and sized from the predictions themselves, so the frame is
# built correctly for any number of test rows rather than a hard-coded 15000.
submission = pd.DataFrame({'Id': range(1, len(prob_female) + 1), 'ProbFemale': prob_female})
submission = submission[['Id','ProbFemale']]  # enforce the column order
submission.to_csv("submission.csv",index=False)