In [24]:
import pandas as pd
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

In [25]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini coefficient of `pred` against `actual`.

    Rows are ordered by prediction, highest first; ties keep their original
    order. The result is the mean cumulative share of `actual` captured by
    that ordering, minus the (n + 1) / 2 baseline term, divided by n.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 labels or losses).
    pred : sequence of numbers
        Predicted scores; must have the same length as `actual`.
    cmpcol, sortcol : int
        Not used by the computation; kept only so existing callers that
        pass them keep working.

    Returns
    -------
    float
        The Gini coefficient.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    n = len(actual)
    assert n == len(pred)
    # Columns: actual, prediction, original position. The builtin `float` is
    # used as dtype because the `np.float` alias no longer exists in NumPy.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort treats the LAST key as primary: prediction descending, then
    # original position ascending to break ties deterministically.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]
    total = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total

    gini_sum -= (n + 1) / 2.
    return gini_sum / n

def gini_normalized(a, p):
    """Return gini(a, p) divided by gini(a, a).

    `a` holds the observed values and `p` the predicted scores; the
    denominator is the Gini obtained when the observed values themselves are
    used as the ranking.
    """
    model_gini = gini(a, p)
    reference_gini = gini(a, a)
    return model_gini / reference_gini
    
def gini_score(estimator, train, target):
    """Scorer with the (estimator, X, y) signature: normalized Gini.

    Calls `estimator.predict_proba(train)`, takes column 1 of the result as
    the score, and returns its Gini against `target` divided by the Gini of
    `target` against itself.
    """
    second_column = estimator.predict_proba(train)[:, 1]
    model_gini = gini(target, second_column)
    reference_gini = gini(target, target)
    return model_gini / reference_gini

def test_gini():
    """Check gini() and gini_normalized() against known reference values.

    Each case is (actual, pred, expected gini, expected normalized gini),
    compared with an absolute tolerance of 1e-6. Raises AssertionError on the
    first mismatch.
    """
    def fequ(a,b):
        return abs( a -b) < 1e-6
    def T(a, p, g, n):
        assert( fequ(gini(a,p), g) )
        assert( fequ(gini_normalized(a,p), n) )
    # The cases must sit at this level: nested inside T they would never run
    # (and would recurse without end if T were ever called).
    T([1, 2, 3], [10, 20, 30], 0.111111, 1)
    T([1, 2, 3], [30, 20, 10], -0.111111, -1)
    T([1, 2, 3], [0, 0, 0], -0.111111, -1)
    T([3, 2, 1], [0, 0, 0], 0.111111, 1)
    T([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8)
    T([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1)
    T([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0)
    T([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428,
       0.6)
    T([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1)
    T([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666,
       -0.333333)

In [26]:
def loadData(path):
    """Read the CSV at `path` with pandas' default settings and return the DataFrame."""
    return pd.read_csv(path)

In [27]:
def extractTarget(dataFrame):
    """Return the "target" column of `dataFrame` as a NumPy array.

    Raises KeyError if the frame has no "target" column.
    """
    # The whole-frame float conversion that used to precede this line was
    # never used; it only cost memory and failed on non-numeric columns and
    # on NumPy versions without the `np.float` alias.
    return dataFrame["target"].values

In [28]:
def extractTrain(dataFrame):
    """Return the contents of `dataFrame` as a NumPy array (its `.values`)."""
    return dataFrame.values

In [29]:
def logisticRegression(train, target, folds):
    """Cross-validated class probabilities from a default LogisticRegression.

    `folds` is passed straight through as the `cv` argument of
    `cross_val_predict`; the return value is whatever that call produces with
    method='predict_proba'.
    """
    return cross_val_predict(
        LogisticRegression(),
        train,
        target,
        cv=folds,
        method='predict_proba',
    )

In [30]:
def logisticPredict(train, target, test):
    """Fit a default LogisticRegression on (train, target) and return
    its `predict_proba` output for `test`."""
    classifier = LogisticRegression()
    classifier.fit(train, target)
    return classifier.predict_proba(test)

In [31]:
def xgboostPredict(train, target, test):
    """Fit an XGBClassifier (max_depth=8, other settings default) on
    (train, target) and return its `predict_proba` output for `test`."""
    booster = XGBClassifier(max_depth=8)
    booster.fit(train, target)
    return booster.predict_proba(test)

In [32]:
def xgboostGridSearch(train, target):
    """Grid-search XGBClassifier over `max_depth` and return the fitted search.

    Uses 3-fold cross-validation, `gini_score` as the scoring callable,
    5 parallel jobs, and refits on the full data with the best setting.
    The returned object is the fitted GridSearchCV, not predictions.
    """
    # Only tree depth is searched at the moment. Candidate settings that
    # were tried and are currently disabled:
    #   'nthread': [4]              (with hyperthreading xgboost may get slower)
    #   'objective': ['binary:logistic']
    #   'learning_rate': [0.05]     (the so-called `eta` value)
    #   'min_child_weight': [11]
    #   'silent': [1]
    #   'subsample': [0.5, 0.8]
    #   'colsample_bytree': [0.5, 0.7]
    #   'n_estimators': [5]         (number of trees; 1000 for better results)
    #   'missing': [-999]
    #   'seed': [1337]
    param_grid = {'max_depth': [1, 2]}

    search = GridSearchCV(
        XGBClassifier(),
        param_grid,
        scoring=gini_score,
        cv=3,
        n_jobs=5,
        refit=True,
        verbose=2,
    )
    search.fit(train, target)
    return search

In [33]:
def catBinExtractor(headers):
    """Return, in order, the names in `headers` that end in "_cat" or "_bin"."""
    return [name for name in headers if name.endswith(("_cat", "_bin"))]

In [34]:
def missingValCounter(df):
    """Count, per column, how many entries of `df` equal -1."""
    is_minus_one = df == -1
    return is_minus_one.astype(int).sum(axis=0)

In [46]:
def sumNans(row):
    """Flag a row with how many of its entries equal -1.

    If `row` contains k > 0 entries equal to -1, the entry
    ``row["new_column<k>"]`` is set to 1 (e.g. "new_column3" for three such
    entries). Rows without any -1 are left untouched.

    Parameters
    ----------
    row : label-indexed row, e.g. a pandas Series
        Must support assignment by string label; it is modified in place.

    Returns
    -------
    The same `row` object.
    """
    minus_one_count = int(np.count_nonzero(np.asarray(row) == -1))
    if minus_one_count > 0:
        # The count must be converted to text before building the label;
        # adding a str to an integer raises TypeError, which previously was
        # silently swallowed so no flag was ever written.
        row["new_column" + str(minus_one_count)] = 1
    return row

In [36]:
def imputer(array):
    """Replace every -1 in `array` using an Imputer configured with
    strategy='mean' and axis=0, fitted on `array` itself, and return
    the transformed array."""
    mean_filler = Imputer(missing_values=-1, strategy='mean', axis=0)
    return mean_filler.fit_transform(array)

In [37]:
def runPca(array):
    """Fit a PCA that keeps as many components as `array` has columns and
    return the result of `fit` (used for inspecting the fitted attributes)."""
    n_columns = array.shape[1]
    decomposition = PCA(n_components=n_columns)
    return decomposition.fit(array)

In [38]:
def dataNormalizer(array):
    """Return `array` passed through `Normalizer(norm='max').fit_transform`."""
    return Normalizer(norm='max').fit_transform(array)

In [39]:
def generate_submission(ids, predictions, path='submission.csv'):
    """Write an "id,target" CSV pairing each id with column 1 of `predictions`.

    Parameters
    ----------
    ids : iterable
        One identifier per row, in the same order as `predictions`.
    predictions : 2-D array
        Only column 1 (``predictions[:, 1]``) is written.
    path : str
        Output file; defaults to 'submission.csv' in the working directory.
        An existing file is overwritten.
    """
    # newline='' hands line-ending control to the csv writer, as the csv
    # module requires; without it Windows output gets a blank line per row.
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['id', 'target'])
        writer.writerows(zip(ids, predictions[:, 1]))

In [40]:
# Load the raw train/test tables and pull the label vector out of train.
dataFrame = loadData("data/train.csv")
testData = loadData("data/test.csv")
Ytarget = extractTarget(dataFrame)

# Work out the "_cat"/"_bin" columns of each table from that table's own
# header (the test header was previously copied from the train frame).
headers = dataFrame.columns
testHeaders = testData.columns
catBinHeaders = catBinExtractor(headers)
catTestHeaders = catBinExtractor(testHeaders)

In [17]:
# One encoder is shared by train and test below, so both tables must expose
# the same categorical/binary columns in the same order.
assert list(catTestHeaders) == list(catBinHeaders), \
    "train and test disagree on their _cat/_bin columns"

# Split off the categorical/binary columns. The +1 shift moves the -1 code
# to 0 -- presumably so the encoder only ever sees non-negative integers.
dfCatBin = dataFrame[catBinHeaders]
dfCatBin = dfCatBin + 1
dataFrame = dataFrame.drop(catBinHeaders, axis = 1)

dfTestCat = testData[catTestHeaders]
dfTestCat = dfTestCat + 1
testData = testData.drop(catTestHeaders, axis = 1)
del testData["id"]

# What remains after removing id/target is used as-is.
Xtest = extractTrain(testData)

del dataFrame["target"]
del dataFrame["id"]
Xtrain = extractTrain(dataFrame)

# Fit ONE encoder on the training categories and apply it to both tables so
# the one-hot columns of train and test line up position by position. Two
# independently fitted encoders can emit a different number or order of
# columns whenever a category occurs in only one of the tables.
# handle_unknown='ignore' encodes a category unseen in train as all zeros
# instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(dfCatBin)
onehotlabels = enc.transform(dfCatBin).toarray()
onehottest = enc.transform(dfTestCat).toarray()
encTest = enc  # same fitted encoder; name kept for any code that still uses it

In [18]:
# Append the one-hot columns to the right of the remaining feature columns.
Xtrain = np.hstack((Xtrain, onehotlabels))
Xtest = np.hstack((Xtest, onehottest))

In [19]:
# Impute, then normalize. Train and test are processed independently of each
# other, each through its own imputer()/dataNormalizer() call.
#
# Optional PCA diagnostics (disabled):
#   before imputation:
#     pca1 = runPca(Xtrain)
#     print(pca1.explained_variance_ratio_)
#     print(pca1.singular_values_)
#     for eigenval in pca1.explained_variance_ratio_:
#         if eigenval < 1e-34:
#             print(eigenval)
#   after imputation:
#     pca2 = runPca(Xtrain)
#     print(pca2.explained_variance_ratio_)
Xtrain = dataNormalizer(imputer(Xtrain))
Xtest = dataNormalizer(imputer(Xtest))

In [77]:
# Cross-validated (3-fold) logistic-regression probabilities on the training set.
predictionsLR = logisticRegression(train=Xtrain, target=Ytarget, folds=3)

In [78]:
# Train XGBoost on the full training set and score the test set.
predictionsXG = xgboostPredict(train=Xtrain, target=Ytarget, test=Xtest)

In [44]:
#gini_normalized(Ytarget, predictionsLR[:, 1])
# NOTE(review): this call used to receive `shortTrain` / `shortTest`, which no
# cell in this notebook defines, so it failed with NameError on a fresh
# kernel. The full training matrix and labels are used instead; if a
# subsample was intended, build it explicitly in this cell before the call.
# The result is the fitted GridSearchCV object, not a prediction array.
predictionsGrid = xgboostGridSearch(Xtrain, Ytarget)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] max_depth=1 .....................................................
[CV] max_depth=1 .....................................................
[CV] max_depth=1 .....................................................
[CV] max_depth=2 .....................................................
[CV] max_depth=2 .....................................................
[CV] ............................................ max_depth=1 -  27.0s
[CV] max_depth=2 .....................................................
[CV] ............................................ max_depth=1 -  29.5s
[CV] ............................................ max_depth=1 -  30.0s
[CV] ............................................ max_depth=2 -  35.2s
[CV] ............................................ max_depth=2 -  41.6s
[CV] ............................................ max_depth=2 -  15.5s


[Parallel(n_jobs=5)]: Done   6 out of   6 | elapsed:   42.8s finished


In [None]:
#gini_normalized(Ytarget, predictionsXG[:, 1])

In [None]:
# Preview only the first rows instead of rendering the whole frame.
dataFrame.head()

In [None]:
# Re-read the test file to get its "id" column, then write the XGBoost
# predictions next to those ids.
ids = loadData("data/test.csv")["id"]
generate_submission(ids, predictionsXG)