# Using scikit-learn calibration

In [21]:
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

try:
    # scikit-learn >= 0.18 exposes the splitter here
    from sklearn.model_selection import train_test_split
except ImportError:
    # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split

# Import Data (Kaggle OTTO challenge)

In [22]:
# Load the training CSV and discard its 'id' column in one step.
X = pd.read_csv('../train.csv').drop('id', axis=1)

# Pull the target column out and encode its labels as integers
# so that the classifiers below can work with it.
y = LabelEncoder().fit_transform(X['target'].values)

# Keep only the feature columns: leaving the target in would make
# the prediction task trivial.
X = X.drop('target', axis=1)

# Preview the first five rows of the feature table.
X.head(5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,6,1,5,0,0,1,...,22,0,1,2,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


# Split Train / Test

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical from one run to the next.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.20,
    random_state=36
)

# Train and apply a Random Forest (without calibration)

In [24]:
# Baseline: a random forest whose predicted probabilities are used as-is.
# random_state pins the forest so the reported score is reproducible
# (same seed value as the train/test split).
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=36)
# We use a BaggingClassifier to make 5 predictions and average them,
# because that is what CalibratedClassifierCV does behind the scenes
# and we want to compare things fairly.
clfbag = BaggingClassifier(clf, n_estimators=5, random_state=36)
clfbag.fit(Xtrain, ytrain)
ypreds = clfbag.predict_proba(Xtest)
# The `eps` argument of log_loss was deprecated and later removed from
# scikit-learn; 1e-15 was its historical default, so it is simply omitted.
# print(...) with a single argument behaves the same on Python 2 and 3.
print("%.2f" % log_loss(ytest, ypreds, normalize=True))

0.60


# Train and apply a Random Forest (with calibration)

In [25]:
# Same forest configuration as the uncalibrated baseline; random_state
# pins the forest so the reported score is reproducible.
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=36)
# In our case, 'isotonic' works better than the default 'sigmoid'.
# cv=5 fits the forest and its calibrator on 5 cross-validation splits
# of the training data.
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
calibrated_clf.fit(Xtrain, ytrain)
ypreds = calibrated_clf.predict_proba(Xtest)
# The `eps` argument of log_loss was deprecated and later removed from
# scikit-learn; 1e-15 was its historical default, so it is simply omitted.
# print(...) with a single argument behaves the same on Python 2 and 3.
print("%.2f" % log_loss(ytest, ypreds, normalize=True))

0.49


# Calibration substantially improved the log loss (0.60 → 0.49)!