In [1]:
import numpy as np
import pandas as pd

from sklearn import cross_validation as CV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import accuracy_score as accuracy

from time import ctime

#

train_file = '~/Documents/repos/chimera/notebooks/00000003/data/train.csv'
test_file = '~/Documents/repos/chimera/notebooks/00000003/data//test.csv'
output_file = '~/Documents/repos/chimera/notebooks/00000003/data/train_sorted.csv'

print "loading..."

train = pd.read_csv( train_file )
test = pd.read_csv( test_file )

test.drop( 't_id', axis = 1, inplace = True )
test['target'] = 0 # dummy for preserving column order when concatenating

train['is_test'] = 0
test['is_test'] = 1

orig_train = train.copy()
assert( np.all( orig_train.columns == test.columns ))

train = pd.concat(( orig_train, test ))
train.reset_index( inplace = True, drop = True )

x = train.drop( [ 'is_test', 'target' ], axis = 1 )
y = train.is_test

#

print "cross-validating..."

n_estimators = 100
clf = RF( n_estimators = n_estimators, n_jobs = -1 )

predictions = np.zeros( y.shape )

cv = CV.StratifiedKFold( y, n_folds = 5, shuffle = True, random_state = 5678 )

for f, ( train_i, test_i ) in enumerate( cv ):

    print "# fold {}, {}".format( f + 1, ctime())

    x_train = x.iloc[train_i]
    x_test = x.iloc[test_i]
    y_train = y.iloc[train_i]
    y_test = y.iloc[test_i]

    clf.fit( x_train, y_train )	

    p = clf.predict_proba( x_test )[:,1]

    auc = AUC( y_test, p )
    print "# AUC: {:.2%}\n".format( auc )

    predictions[ test_i ] = p

# fold 1
# AUC: 87.00%

# fold 2
# AUC: 86.87%

# fold 3
# AUC: 87.43%

# fold 4
# AUC: 86.83%

# fold 5
# AUC: 87.71%

train['p'] = predictions

i = predictions.argsort()
train_sorted = train.iloc[i]

# """
# print "predictions distribution for test"
# train_sorted.loc[ train_sorted.is_test == 1, 'p' ].hist()
# p_test_mean = train_sorted.loc[ train_sorted.is_test == 1, 'p' ].mean()
# p_test_std = train_sorted.loc[ train_sorted.is_test == 1, 'p' ].std()
# print "# mean: {}, std: {}".format( p_test_mean, p_test_std )
# # mean: 0.404749669062, std: 0.109116404564
# """

train_sorted = train_sorted.loc[ train_sorted.is_test == 0 ]
assert( train_sorted.target.sum() == orig_train.target.sum())

# """
# print "predictions distribution for train"
# p_train_mean = train_sorted.p.mean()
# p_train_std = train_sorted.p.std()
# print "# mean: {}, std: {}".format( p_train_mean, p_train_std )
# # mean: 0.293768613822, std: 0.113601453932
# """

train_sorted.drop( 'is_test', axis = 1, inplace = True )
train_sorted.to_csv( output_file, index = False )



loading...
cross-validating...
# fold 1, Sun Oct 23 20:32:02 2016
# AUC: 84.94%

# fold 2, Sun Oct 23 20:32:49 2016
# AUC: 84.70%

# fold 3, Sun Oct 23 20:33:34 2016
# AUC: 84.35%

# fold 4, Sun Oct 23 20:34:22 2016
# AUC: 84.29%

# fold 5, Sun Oct 23 20:35:07 2016
# AUC: 84.63%



In [2]:
# "Load sorted training set and validate on examples looking the most like test"

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression as LR

from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import log_loss

#

input_file = '~/Documents/repos/chimera/notebooks/00000003/data/train_sorted.csv'
val_size = 5000

#

def train_and_evaluate( y_train, x_train, y_val, x_val ):

    lr = LR()
    lr.fit( x_train, y_train )

    p = lr.predict_proba( x_val )
    p_bin = lr.predict( x_val )

    acc = accuracy( y_val, p_bin )
    auc = AUC( y_val, p[:,1] )
    ll = log_loss( y_val, p[:,1] )

    return ( auc, acc, ll )

def transform_train_and_evaluate( transformer ):

    global x_train, x_val, y_train

    x_train_new = transformer.fit_transform( x_train )
    x_val_new = transformer.transform( x_val )

    return train_and_evaluate( y_train, x_train_new, y_val, x_val_new )

#

print "loading..."

data = pd.read_csv( input_file )

train = data.iloc[:-val_size]
val = data.iloc[-val_size:]

# print len( train ), len( val )

# 

y_train = train.target.values
y_val = val.target.values

x_train = train.drop( 'target', axis = 1 )
x_val = val.drop( 'target', axis = 1 )

# train, predict, evaluate

auc, acc, ll = train_and_evaluate( y_train, x_train, y_val, x_val )

print "No transformation"
print "AUC: {:.2%}, accuracy: {:.2%}, log loss: {:.2%} \n".format( auc, acc, ll )

# try different transformations for X

transformers = [ MaxAbsScaler(), MinMaxScaler(), RobustScaler(), StandardScaler(),  
    Normalizer( norm = 'l1' ), Normalizer( norm = 'l2' ), Normalizer( norm = 'max' ) ]

poly_scaled = Pipeline([ ( 'poly', PolynomialFeatures()), ( 'scaler', MinMaxScaler()) ])

transformers += [ poly_scaled ]

for transformer in transformers:

    print transformer
    auc, acc, ll = transform_train_and_evaluate( transformer )
    print "AUC: {:.2%}, accuracy: {:.2%}, log loss: {:.2%} \n".format( auc, acc, ll )

# """
# No transformation
# AUC: 52.54%, accuracy: 51.96%, log loss: 69.22%
# MaxAbsScaler(copy=True)
# AUC: 52.54%, accuracy: 51.98%, log loss: 69.22%
# MinMaxScaler(copy=True, feature_range=(0, 1))
# AUC: 52.54%, accuracy: 51.98%, log loss: 69.22%
# RobustScaler(copy=True, with_centering=True, with_scaling=True)
# AUC: 52.54%, accuracy: 52.04%, log loss: 69.22%
# StandardScaler(copy=True, with_mean=True, with_std=True)
# AUC: 52.53%, accuracy: 52.04%, log loss: 69.22%
# Normalizer(copy=True, norm='l1')
# AUC: 52.30%, accuracy: 52.46%, log loss: 69.23%
# Normalizer(copy=True, norm='l2')
# AUC: 52.35%, accuracy: 51.08%, log loss: 69.24%
# Normalizer(copy=True, norm='max')
# AUC: 52.37%, accuracy: 52.20%, log loss: 69.24%
# Pipeline(steps=[('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=
# False)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1)))])
# AUC: 52.57%, accuracy: 51.76%, log loss: 69.58%
# """

loading...
No transformation
AUC: 52.33%, accuracy: 51.44%, log loss: 69.26% 

MaxAbsScaler(copy=True)
AUC: 52.33%, accuracy: 51.46%, log loss: 69.26% 

MinMaxScaler(copy=True, feature_range=(0, 1))
AUC: 52.33%, accuracy: 51.48%, log loss: 69.26% 

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)
AUC: 52.33%, accuracy: 51.44%, log loss: 69.26% 

StandardScaler(copy=True, with_mean=True, with_std=True)
AUC: 52.33%, accuracy: 51.40%, log loss: 69.26% 

Normalizer(copy=True, norm='l1')
AUC: 51.84%, accuracy: 51.00%, log loss: 69.32% 

Normalizer(copy=True, norm='l2')
AUC: 52.12%, accuracy: 51.18%, log loss: 69.32% 

Normalizer(copy=True, norm='max')
AUC: 52.35%, accuracy: 51.38%, log loss: 69.27% 

Pipeline(steps=[('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1)))])
AUC: 51.75%, accuracy: 50.78%, log loss: 69.52% 



In [3]:
# "Load data, train, output predictions"

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression as LR

train_file = '~/Documents/repos/chimera/notebooks/00000003/data/train.csv'
test_file = '~/Documents/repos/chimera/notebooks/00000003/data/test.csv'
lr_output_file = '~/Documents/repos/chimera/notebooks/00000003/data/predictions_lr.csv'
poly_output_file = '~/Documents/repos/chimera/notebooks/00000003/data/predictions_poly.csv'

#

print "loading..."

train = pd.read_csv( train_file )
test = pd.read_csv( test_file )

x_train = train.drop( 'target', axis = 1 )
y_train = train.target.values

x_test = test.drop( 't_id', axis = 1 )

print "training..."

lr = LR()
print lr
lr.fit( x_train, y_train )

poly = make_pipeline( PolynomialFeatures(), LR()) 
print poly
poly.fit( x_train, y_train )

print "predicting..."

p_lr = lr.predict_proba( x_test )
test['p_lr'] = p_lr[:,1]

p_poly = poly.predict_proba( x_test )
test['p_poly'] = p_poly[:,1]

print "saving..."

test.to_csv( lr_output_file, columns = ( 't_id', 'p_lr' ), header = ( 't_id', 'probability' ), index = None )
test.to_csv( poly_output_file, columns = ( 't_id', 'p_poly' ), header = ( 't_id', 'probability' ), index = None )

# LR:	0.69101
# Poly:	0.69229

loading...
training...
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
predicting...
saving...
