In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import matplotlib
from multiprocessing import cpu_count
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
import numpy as np
import pickle

In [3]:
mem = Memory("./mycache")
use_cpu = int(cpu_count() *3.0  / 4)
print "use cpu:", use_cpu
@mem.cache
def get_data(file_name):
    data = load_svmlight_file(file_name)
    return data[0], data[1]

print "loading train data"
train_X, train_y = get_data("../data/a4_smvl_trn")
print "loading validation data"
validation_X, validation_y = get_data("../data/a4_smvl_val")
print "loading test data"
test_X, test_y = get_data("../data/a4_smvl_tst")
print "loading data finished"

use cpu: 27
loading train data
loading validation data
loading test data
loading data finished


In [4]:
def train_clf_by_batches(clf, train_X, train_y, n_batches=100):
    """Train ``clf`` on ``(train_X, train_y)``, incrementally when possible.

    Calling ``fit`` once per batch does not accumulate anything: every
    ``fit`` call starts over, so the model would end up trained on the last
    batch only. This helper therefore:

    * feeds consecutive row slices to ``clf.partial_fit`` when the estimator
      provides that method, passing the full label set on every call;
    * otherwise makes a single ``clf.fit`` call on the whole training set.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and optionally
        ``partial_fit(X, y, classes=...)``.
    train_X : array-like or sparse matrix
        Training samples; must expose ``shape[0]`` and support row slicing.
    train_y : array-like
        Training labels, sliceable in step with ``train_X``.
    n_batches : int, optional
        Approximate number of batches used on the incremental path
        (default 100, the value that used to be hard-coded).

    Returns
    -------
    None
        ``clf`` is trained in place.
    """
    n_samples = train_X.shape[0]

    if not hasattr(clf, "partial_fit"):
        # No incremental API: batching would silently discard all but the
        # final slice, so train on everything in one go.
        clf.fit(train_X, train_y)
        return

    # Clamp to >= 1 so fewer rows than batches cannot produce a zero step
    # (which previously meant a loop that never advanced).
    batch_size = max(1, n_samples // max(1, n_batches))
    # A single slice may miss some labels, so hand over the complete set.
    classes = np.unique(train_y)

    for seg_idx, start in enumerate(range(0, n_samples, batch_size)):
        if seg_idx % 10 == 0:
            # Single-argument form prints the same text on Python 2 and 3.
            print("%s %s" % ("doing seg: ", seg_idx))
        stop = start + batch_size
        clf.partial_fit(train_X[start:stop], train_y[start:stop], classes=classes)

In [None]:
print "train Logistic regression on train data"
LR_clf = LogisticRegression(n_jobs = use_cpu)
train_clf_by_batches(LR_clf, train_X, train_y)
with open("LR_param.dat", "w") as f:
    pickle.dump((LR_clf.coef_, LR_clf.intercept_), f)

In [None]:
result = LR_clf.predict(test_X)
acc = np.sum([1 if y_truth == y_predict else 0 for y_truth, y_predict in zip(test_y, result)]).astype("float32")/len(test_y)
print "Accuracy: ", acc
probas = LR_clf.predict_proba(test_X)
lost = log_loss(test_y, probas)
print 'Lost', lost

In [None]:
print "testing...."
test_labels = LR_clf.predict(test_X)
test_probs = LR_clf.predict_proba(test_X)
print "plotting ROC"
print test_labels.shape, test_probs[:,1].shape
tpr, fpr, th = roc_curve(test_labels, test_probs[:,0])
print tpr.shape
print fpr.shape
#print zip(tpr[:], fpr[:])
plt.plot( fpr[:], tpr[:], color = 'r',linewidth=5.0)
print "testing finished"

In [None]:
max_acc = 0
best_clf = None
max_C = 0
c_pool = [0.01, 0.1, 0.5, 1.0, 10.0]
for c in c_pool:
    print "training under c=", c
    tmp_LR_clf = LogisticRegression(C=c, n_jobs = use_cpu)
    train_clf_by_batches(tmp_LR_clf, train_X, train_y)
    tmp_test_labels = tmp_LR_clf.predict(validation_X)
    tmp_acc = np.sum([1 if y_truth == y_predict else 0 for y_truth, y_predict in zip(validation_y, tmp_test_labels)])*1.0/len(validation_y)
    
    if tmp_acc > max_acc:
        best_clf = tmp_LR_clf
        max_acc = tmp_acc
        max_C = c
print "validation finished, with the highest acc: ", max_acc, " with C= ", max_C
with open("best_clf_param.dat", "w") as f:
    pickle.dump((best_clf.coef_, best_clf.intercept_), f)

In [None]:
clf_cv = CalibratedClassifierCV(LR_clf, cv=5, method='isotonic')
train_clf_by_batches(clf_cv, train_X, train_y)
result_cv = clf_cv.predict(test_X)
acc = np.sum([1 if y_truth == y_predict else 0 for y_truth, y_predict in zip(test_y, result_cv)]).astype("float32")/len(test_y)
print "Accuracy: ", acc
probas_cv = clf_cv.predict_proba(test_X)
cv_score = log_loss(test_y, probas_cv)
print 'calibrated score (5-fold:)', cv_score
with open("cv_clf_param.dat", "w") as f:
    pickle.dump((cv_clf.coef_, cv_clf.intercept_), f)