In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
%matplotlib inline



In [2]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the unnormalized Gini score of `pred` as a ranking of `actual`.

    The rows are ordered by descending `pred` (ties keep their original
    input order), the `actual` values are accumulated in that order, and the
    summed cumulative share is offset by (n + 1) / 2 and divided by n.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values; must have the same length as `pred`.
    pred : sequence of numbers
        Scores used only to order the rows.
    cmpcol, sortcol : int
        Unused; kept so existing callers passing them keep working.

    Returns
    -------
    float
        The Gini score. The result is NaN if `actual` sums to zero, because
        the cumulative sum is divided by that total.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    n = len(actual)
    assert n == len(pred), "actual and pred must have the same length"
    # Builtin `float` replaces the `np.float` alias, which was removed from
    # NumPy in 1.24; the resulting dtype (float64) is the same.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort uses the LAST key as primary: sort by -pred, then by the
    # original row index so that tied predictions keep input order.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]
    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Return gini(a, p) scaled by the Gini obtained when `a` ranks itself.

    `a` holds the actual values and `p` the predictions; the denominator is
    gini(a, a), i.e. the score when the actual values are used as predictions.
    """
    score = gini(a, p)
    reference = gini(a, a)
    return score / reference
 
def test_gini():
    """Check gini and gini_normalized against a table of known values.

    Each case is compared with an absolute tolerance of 1e-6; the first
    mismatch raises AssertionError.
    """
    tolerance = 1e-6
    # Each row: (actual, pred, expected gini, expected normalized gini)
    cases = [
        ([1, 2, 3], [10, 20, 30], 0.111111, 1),
        ([1, 2, 3], [30, 20, 10], -0.111111, -1),
        ([1, 2, 3], [0, 0, 0], -0.111111, -1),
        ([3, 2, 1], [0, 0, 0], 0.111111, 1),
        ([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8),
        ([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1),
        ([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0),
        ([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428, 0.6),
        ([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1),
        ([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666, -0.333333),
    ]
    for observed, scores, want_gini, want_normalized in cases:
        assert abs(gini(observed, scores) - want_gini) < tolerance
        assert abs(gini_normalized(observed, scores) - want_normalized) < tolerance

Read data

In [3]:
# Load the training matrix and labels. Context managers close each file
# handle as soon as the array has been read instead of leaving it open.
with open('../wp004/train_matrix.bin', 'rb') as train_matrix_file:
    X_train = np.load(train_matrix_file)
with open('../wp004/train_labels.bin', 'rb') as train_labels_file:
    y_train = np.load(train_labels_file)

In [4]:
# Load the test matrix and labels. Context managers close each file handle
# as soon as the array has been read instead of leaving it open.
with open('../wp004/test_matrix.bin', 'rb') as test_matrix_file:
    X_test = np.load(test_matrix_file)
with open('../wp004/test_labels.bin', 'rb') as test_labels_file:
    y_test = np.load(test_labels_file)

Run vanilla random forest

In [5]:
import sklearn.ensemble
from datetime import datetime,timedelta

In [6]:
# Fit the random forest and report the wall-clock training time.
tc = datetime.now()
rf = sklearn.ensemble.RandomForestClassifier(random_state=1,
                                            max_features='sqrt',
                                            class_weight='balanced_subsample',
                                            n_estimators=500,
                                            min_samples_leaf=100
                                            )
rf.fit(X_train, y_train)
elapsed_seconds = (datetime.now() - tc).total_seconds()
# Single-argument print(...) is valid on both Python 2 and Python 3 and
# emits the same text as the original comma-separated print statement.
print('Training took:  %s  seconds' % elapsed_seconds)

Training took:  658.557316  seconds


In [7]:
# Predicted probabilities for the test set from the fitted forest; later
# cells score column 1 (prob[:,1]) against y_test.
prob= rf.predict_proba(X_test)

In [8]:
import sklearn.metrics
# ROC AUC of the column-1 probabilities against the test labels.
# Single-argument print(...) works on both Python 2 and Python 3 and keeps
# the original output text ("AUC=", two spaces, then the value).
print('AUC=  %s' % sklearn.metrics.roc_auc_score(y_test, prob[:, 1]))

AUC=  0.637220238233


In [9]:
# 2*AUC - 1 on the same labels and scores as the AUC cell; shown so it can
# be compared with the gini_normalized value computed in the next cell.
2.*sklearn.metrics.roc_auc_score(y_test,prob[:,1])-1.

0.2744404764669417

In [10]:
# Normalized Gini of the column-1 probabilities against the test labels,
# using gini_normalized defined earlier in this notebook.
gini_normalized(y_test,prob[:,1])

0.27444047869336669

In [11]:
import pickle as pkl
# Persist the fitted forest. The context manager flushes and closes the
# file deterministically rather than relying on garbage collection.
with open('rf.pkl', 'wb') as model_file:
    pkl.dump(rf, model_file)