In [1]:
import pandas as pd
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
import lightgbm as lgb



In [2]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
     return gini(a, p) / gini(a, a)
    
def gini_score(estimator, train, target):
    prediction = estimator.predict_proba(train)
    return gini(target, prediction[:, 1]) / gini(target, target)

def test_gini():
    def fequ(a,b):
        return abs( a -b) < 1e-6
    def T(a, p, g, n):
        assert( fequ(gini(a,p), g) )
        assert( fequ(gini_normalized(a,p), n) )
        T([1, 2, 3], [10, 20, 30], 0.111111, 1)
        T([1, 2, 3], [30, 20, 10], -0.111111, -1)
        T([1, 2, 3], [0, 0, 0], -0.111111, -1)
        T([3, 2, 1], [0, 0, 0], 0.111111, 1)
        T([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8)
        T([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1)
        T([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0)
        T([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428,
           0.6)
        T([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1)
        T([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666,
           -0.333333)

In [3]:
def loadData(path):
    df = pd.read_csv(path)
    return df

In [4]:
def extractTarget(dataFrame):
    arr = np.array(dataFrame, dtype=np.float)
    Ytarget = dataFrame["target"].values
    del dataFrame["target"]
    return Ytarget

In [5]:
def extractTrain(dataFrame):
    try:
        del dataFrame["id"]
    except:
        pass
    data = dataFrame.values
    return data

In [6]:
def logisticRegression(train, target, folds):
    model = LogisticRegression()
    predictions = cross_val_predict(model, train, target, cv = folds, method = 'predict_proba')
    return predictions

In [7]:
def logisticPredict(train, target, test):
    model = LogisticRegression()
    model.fit(train, target)
    predictions = model.predict_proba(test)
    return predictions

In [8]:
def lgboostPredict(train, target, test):
    model = lgb.LGBMClassifier(n_estimators=100, max_depth=5, num_leaves=32)
    model.fit(train, target)
    predictions = model.predict_proba(test)
    return predictions

In [9]:
def lgbboostGridSearch(train, target):
    model = lgb.LGBMClassifier()
    parameters = {
        #'nthread':[4], #when use hyperthread, xgboost may become slower
        'objective':['binary'],
        'learning_rate': [0.05], #so called `eta` value
        'max_depth': [10],
        #'min_child_weight': [11],
        #'silent': [1],
        #'subsample': [0.5, 0.8],
        #'colsample_bytree': [0.5, 0.7],
        'n_estimators': [100, 200, 300], #number of trees, change it to 1000 for better results
        #'missing':[-1],
        #'seed': [1337]
    }
    
    clf = GridSearchCV(model, parameters, n_jobs=1, cv=3,
                       scoring=gini_score,
                       verbose=2, refit=True)
    clf.fit(train, target)
    return clf

In [10]:
def xgboostPredict(train, target, test):
    model = XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
      gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3,
      min_child_weight=1, missing=-1, n_estimators=500, nthread=-1,
      objective='binary:logistic', reg_alpha=0, reg_lambda=1,
      scale_pos_weight=1, seed=0, silent=True, subsample=1.0)
    model.fit(train, target)
    predictions = model.predict_proba(test)
    return predictions

In [11]:
def xgboostGridSearch(train, target):
    xgb_model = XGBClassifier()
    parameters = {
        #'nthread':[4], #when use hyperthread, xgboost may become slower
        'objective':['binary:logistic'],
        #'learning_rate': [0.05], #so called `eta` value
        'max_depth': [7],
        #'min_child_weight': [11],
        #'silent': [1],
        #'subsample': [0.5, 0.8],
        #'colsample_bytree': [0.5, 0.7],
        'n_estimators': [500], #number of trees, change it to 1000 for better results
        'missing':[-1],
        #'seed': [1337]
    }
    
    clf = GridSearchCV(xgb_model, parameters, n_jobs=3, cv=3,
                       scoring=gini_score,
                       verbose=2, refit=True)
    clf.fit(train, target)
    return clf

In [12]:
def catBinExtractor(headers):
    catBinHeaders = []
    for header in headers:
        if header.endswith("_cat") or header.endswith("_bin"):
            catBinHeaders.append(header)
    return catBinHeaders

In [13]:
def missingValCounter(df):
    count = (df == -1).astype(int).sum(axis=1)
    return count

In [14]:
def sumNans(row):
    unique, counts = np.unique(row, return_counts=True)
    nums = dict(zip(unique, counts))
    try:
        if nums[-1] > 0:
            row["new_column" + nums[-1]] = 1
    except:
        pass
    return row

In [15]:
def imputer(array):
    imp = Imputer(missing_values=-1, strategy='mean', axis=0)
    newArray = imp.fit_transform(array)
    return newArray

In [16]:
def runPca(array, components):
    pca = PCA(n_components = components, svd_solver='full')
    return pca.fit(array)

In [17]:
def dataNormalizer(array):
    normalizer = Normalizer(norm='max')
    normArray = normalizer.fit_transform(array)
    return normArray

In [18]:
def generate_submission(ids, predictions):
    with open('submission.csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['id', 'target'])
        zipped = zip(ids, predictions[:, 1])
        for id_, prediction in zipped:
            writer.writerow([id_, prediction])

In [52]:
dataFrame = loadData("data/train.csv")
testData = loadData("data/test.csv")

In [53]:
#col_to_drop = dataFrame.columns[dataFrame.columns.str.startswith('ps_calc_')]
missVals = missingValCounter(dataFrame)
#low_variance_col = ['ps_car_11_cat', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin',
#                   'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin',
#                   'ps_car_09_cat']
#dataFrame = dataFrame.drop(col_to_drop, axis=1)
#testData = testData.drop(col_to_drop, axis=1)
#dataFrame = dataFrame.drop(low_variance_col, axis=1)
#testData = testData.drop(low_variance_col, axis=1)
Ytarget = extractTarget(dataFrame)
headers = dataFrame.columns
testHeaders = dataFrame.columns
catBinHeaders = catBinExtractor(headers)
catTestHeaders = catBinExtractor(testHeaders)

In [54]:
missVals = missVals.to_frame("missingVals")

In [55]:
dataFrame["missing_values"] = missVals["missingVals"].values

In [56]:
dfCatBin = dataFrame[catBinHeaders]
dfCatBin = dfCatBin + 1
dataFrame = dataFrame.drop(catBinHeaders, axis = 1)

dfTestCat = testData[catTestHeaders]
dfTestCat = dfTestCat + 1
testData = testData.drop(catTestHeaders, axis = 1)
del testData["id"]

encTest = OneHotEncoder()
encTest.fit(dfTestCat)
onehottest = encTest.transform(dfTestCat).toarray()


enc = OneHotEncoder()
enc.fit(dfCatBin)
onehotlabels = enc.transform(dfCatBin).toarray()

In [24]:
Xtest = extractTrain(testData)
Xtrain = extractTrain(dataFrame)
#Xtrain = np.concatenate((Xtrain, onehotlabels), axis = 1)
Xtrain = np.concatenate((Xtrain, missVals), axis = 1)
#Xtest = np.concatenate((Xtest, onehottest), axis = 1)

In [41]:
onehotlabels.shape

(595212, 218)

In [32]:
pca = runPca(Xtrain, 25)
print("Explained variance %.4f" % pca.explained_variance_ratio_.sum())
for i in range(25):
    print(pca.explained_variance_ratio_[i])

Explained variance 0.9970
0.906397574515
0.0253980389875
0.0110240998928
0.0070165354004
0.00627419790354
0.00614842740127
0.00536241854982
0.00452606876111
0.00364198718603
0.00296668223073
0.00238894160942
0.00198909813002
0.00177203038989
0.00166405484407
0.0015086610523
0.00148066687783
0.00129310777584
0.00120341774406
0.00107118338558
0.00103799623357
0.000830835725569
0.000598643204052
0.000566512651047
0.000449625730455
0.000384836999848


In [46]:
pca_cat = runPca(onehotlabels, 150)
print("Explained variance %.4f" % pca_cat.explained_variance_ratio_.sum())
for i in range(150):
    print(pca_cat.explained_variance_ratio_[i])

Explained variance 0.9960
0.0813072596784
0.0586568170944
0.0561106867699
0.048880190689
0.0446771621885
0.0427780961853
0.0422468686523
0.0410852778876
0.0374792905394
0.0370161926462
0.0326607385159
0.0298292542959
0.0279053124558
0.0267468934192
0.0234864878181
0.0231607934926
0.0224932459304
0.0217751041845
0.0195875344022
0.0194262164663
0.0183668567507
0.0154791246529
0.0118916216606
0.0113977694935
0.0111197439006
0.010102051363
0.00933582747357
0.0083468117595
0.0079322678965
0.00657588441681
0.00609571390326
0.00559150817093
0.00539022499501
0.00462924196351
0.00454320505758
0.00440567646228
0.00382511208596
0.00367339352709
0.00346800207793
0.00343216481119
0.00316239977817
0.00303083010524
0.00298422879415
0.00293669975045
0.00285686520643
0.00280610313349
0.0026632629605
0.00264118233849
0.00254854602287
0.00232409415655
0.00228160961272
0.0021231760015
0.00170643521098
0.00168912865236
0.00166263093
0.00164765817853
0.00163697501975
0.00157295765157
0.0015353680932
0.00148

In [21]:
#pca1 = runPca(Xtrain)
#print(pca1.explained_variance_ratio_)
#print(pca1.singular_values_)
Xtrain = imputer(Xtrain)
Xtest = imputer(Xtest)
#pca2 = runPca(Xtrain)
#print(pca2.explained_variance_ratio_)
#for eigenval in pca1.explained_variance_ratio_:
#    if eigenval < 1e-34:
#        print(eigenval)
Xtrain = dataNormalizer(Xtrain)
Xtest = dataNormalizer(Xtest)

In [39]:
predictionsLR = logisticRegression(Xtrain, Ytarget, 3)

(595212,)
(892816,)


array([[ 0.94883772,  0.05116228],
       [ 0.97089086,  0.02910914],
       [ 0.97509733,  0.02490267],
       ..., 
       [ 0.97788105,  0.02211895],
       [ 0.96640751,  0.03359249],
       [ 0.97680463,  0.02319537]])

In [121]:
predictionsGrid = xgboostGridSearch(Xtrain, Ytarget)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=500, max_depth=7, missing=-1, objective=binary:logistic 
[CV] n_estimators=500, max_depth=7, missing=-1, objective=binary:logistic 
[CV] n_estimators=500, max_depth=7, missing=-1, objective=binary:logistic 


Process ForkPoolWorker-18:
Process ForkPoolWorker-19:
Process ForkPoolWorker-17:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/pool.py", lin

KeyboardInterrupt: 

In [113]:
predictionsGridLgb = lgbboostGridSearch(Xtrain, Ytarget)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] max_depth=10, n_estimators=100, objective=binary, learning_rate=0.05 
[CV]  max_depth=10, n_estimators=100, objective=binary, learning_rate=0.05 -   5.1s
[CV] max_depth=10, n_estimators=100, objective=binary, learning_rate=0.05 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s


[CV]  max_depth=10, n_estimators=100, objective=binary, learning_rate=0.05 -   5.1s
[CV] max_depth=10, n_estimators=100, objective=binary, learning_rate=0.05 
[CV]  max_depth=10, n_estimators=100, objective=binary, learning_rate=0.05 -   4.8s
[CV] max_depth=10, n_estimators=200, objective=binary, learning_rate=0.05 
[CV]  max_depth=10, n_estimators=200, objective=binary, learning_rate=0.05 -   9.1s
[CV] max_depth=10, n_estimators=200, objective=binary, learning_rate=0.05 
[CV]  max_depth=10, n_estimators=200, objective=binary, learning_rate=0.05 -   8.1s
[CV] max_depth=10, n_estimators=200, objective=binary, learning_rate=0.05 
[CV]  max_depth=10, n_estimators=200, objective=binary, learning_rate=0.05 -   8.3s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   40.6s finished


In [114]:
print(predictionsGridLgb.best_score_)
print(predictionsGridLgb.best_estimator_)

0.27885995433202765
LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.05,
        max_bin=255, max_depth=10, min_child_samples=10,
        min_child_weight=5, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=0,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=50000, subsample_freq=1)


In [128]:
predictionsXG = xgboostPredict(Xtrain, Ytarget, Xtest)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243']
expected f244, f245 in input data

In [49]:
predictionsXG.shape

(892816, 2)

In [50]:
gini_normalized(Ytarget, predictionsXG[:, 1])

AssertionError: 

In [81]:
#pca1 = runPca(Xtrain)

In [44]:
#gini_normalized(Ytarget, predictionsLR[:, 1])
predictionsGrid = xgboostGridSearch(shortTrain, shortTest)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] max_depth=1 .....................................................
[CV] max_depth=1 .....................................................
[CV] max_depth=1 .....................................................
[CV] max_depth=2 .....................................................
[CV] max_depth=2 .....................................................
[CV] ............................................ max_depth=1 -  27.0s
[CV] max_depth=2 .....................................................
[CV] ............................................ max_depth=1 -  29.5s
[CV] ............................................ max_depth=1 -  30.0s
[CV] ............................................ max_depth=2 -  35.2s
[CV] ............................................ max_depth=2 -  41.6s
[CV] ............................................ max_depth=2 -  15.5s


[Parallel(n_jobs=5)]: Done   6 out of   6 | elapsed:   42.8s finished


In [38]:
#gini_normalized(Ytarget, predictionsXG[:, 1])
print(Ytarget.shape)
print(predictionsXG.shape)

(595212,)
(892816, 2)


In [None]:
dataFrame

In [51]:
ids = loadData("data/test.csv")
ids = ids["id"]
generate_submission(ids, predictionsXG)