In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

import matplotlib.pylab as plt
%matplotlib inline


### Get train data

In [2]:
# for original data:
# load original features
trainDataFrame = pd.read_csv('./data/train.csv')

# remove constant columns (a standard deviation of 0 means a single repeated value)
colsToRemove1 = []
for col in trainDataFrame.columns:
    if trainDataFrame[col].std() == 0:
        colsToRemove1.append(col)

trainDataFrame.drop(colsToRemove1, axis=1, inplace=True)

# remove duplicate columns: the first column of every group of identical columns
# is kept, the later copies are collected in colsToRemove2 (each exactly once)
colsToRemove2 = []
alreadyMarked = set()
columns = trainDataFrame.columns
for i in range(len(columns)-1):
    if columns[i] in alreadyMarked:
        # columns[i] is a copy of an earlier column, so every column equal to it
        # was already marked when that earlier column was processed
        continue
    v = trainDataFrame[columns[i]].values
    for j in range(i+1,len(columns)):
        if columns[j] in alreadyMarked:
            # already scheduled for removal; no need to compare or append again
            continue
        if np.array_equal(v,trainDataFrame[columns[j]].values):
            colsToRemove2.append(columns[j])
            alreadyMarked.add(columns[j])

trainDataFrame.drop(colsToRemove2, axis=1, inplace=True)
#trainDataFrame.drop(['ID'], axis=1, inplace=True)

#trainLabels = trainDataFrame['TARGET']
#trainFeatures = trainDataFrame.drop(['TARGET'], axis=1)

In [3]:
print trainDataFrame.shape # (rows, columns); 308 columns = 306 features + 1 label (TARGET) + 1 ID

(76020, 308)


In [4]:
# column holding the label and column holding the row identifier
target = 'TARGET'
IDcol = 'ID'
# every remaining column is used as a model input
predictors = [column for column in trainDataFrame.columns
              if column != target and column != IDcol]


In [11]:
# work on the first 10,000 rows only
trn_small = trainDataFrame[:10000]

## Try RBR (Random Bit Regression)

In [10]:
from RandomBitRegression import RBR

In [None]:
def importOrReload(module_name, *names):
    """Import ``module_name`` (or reload it if already imported) and bind ``names``.

    Meant as a re-runnable replacement for ``from module_name import a, b`` in a
    notebook cell: running it again picks up edits made to the module's source.

    Parameters
    ----------
    module_name : str
        Name of the module, as it would appear in an ``import`` statement.
    *names : str
        Attributes of the module to bind in the global namespace of the module
        that defines this function (``globals()``).

    Raises
    ------
    ImportError
        If the module cannot be imported.
    AttributeError
        If the module has no attribute with one of the requested names.
    """
    import sys
    import importlib

    # `reload` is a builtin on Python 2 only; Python 3 provides importlib.reload.
    try:
        reload_module = reload
    except NameError:
        reload_module = importlib.reload

    if module_name in sys.modules:
        reload_module(sys.modules[module_name])
    else:
        __import__(module_name, fromlist=names)

    module = sys.modules[module_name]
    for name in names:
        globals()[name] = getattr(module, name)

# use instead of: from RandomBitRegression import RBR
importOrReload("RandomBitRegression", "RBR")

In [15]:
# build the RBR object on the predictor columns of the 10,000-row subset
# (random_state presumably seeds the feature generation -- confirm in RandomBitRegression)
rbr = RBR(trn_small[predictors], random_state=1)

### Generate features with `generate_features()` (or load features and schema from joblib, see below)

In [16]:
# ask RBR for 1000 generated features
rbr.generate_features(1000)

Generated 1000 features in 3.6 secs


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
5,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
6,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
7,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
8,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1


In [17]:
# sanity check: the generated features must have the same row index as the labels
np.array_equal(rbr.features.index, trn_small[target].index)

True

In [18]:
# report the feature-matrix shape and the schema size, then check that the feature
# columns are exactly the sorted schema keys
print rbr.features.shape
print len(rbr.feature_schema)
print "features and schema are consistent: %s" % np.array_equal(rbr.features.columns, 
                                                                sorted(rbr.feature_schema.keys()))

(10000, 1000)
1000
features and schema are consistent: True


In [None]:
# persist the generated features (read from the private `_features` attribute)
joblib.dump(rbr._features, './data/rbr/features_10000.joblib')

In [None]:
# persist the feature schema (read from the private `_feature_schema` attribute)
joblib.dump(rbr._feature_schema, './data/rbr/feature_schema_10000.joblib')

### or load features and schema from joblib

In [None]:
# NOTE(review): unlike the construction on `trn_small`, this one uses the full frame and
# passes the labels as a second positional argument -- confirm this matches RBR's signature
rbr = RBR(trainDataFrame[predictors], trainDataFrame[target], random_state=1)

In [None]:
# the saved features are loaded into the standalone variable `_features`; they are not
# attached to `rbr`, and the schema load below is disabled
_features = joblib.load('./data/rbr/features_10000.joblib')
#rbr._feature_schema = joblib.load('./data/rbr/feature_schema_10000.joblib')

## Try with LogisticRegression

In [21]:
# 3-fold cross-validated logistic regression (SAG solver, scored by ROC AUC)
# fitted on the RBR-generated features
lr_cv = LogisticRegressionCV(solver='sag', cv=3, scoring="roc_auc", n_jobs=1, verbose=0)
lr_cv.fit(rbr.features, trn_small[target])

LogisticRegressionCV(Cs=10, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring='roc_auc', solver='sag', tol=0.0001,
           verbose=0)

In [22]:
# ROC AUC keyed by class label: one row per CV fold, one column per candidate C
lr_cv.scores_

{1: array([[ 0.69504826,  0.70574642,  0.72457865,  0.72929871,  0.71527538,
          0.70201311,  0.69466172,  0.69071713,  0.6866429 ,  0.68367545],
        [ 0.72585987,  0.73124075,  0.74507036,  0.75645874,  0.75196339,
          0.74472196,  0.73825959,  0.73287388,  0.730302  ,  0.72829869],
        [ 0.68971925,  0.71292667,  0.72970802,  0.73541552,  0.73177907,
          0.71864385,  0.70644011,  0.70201492,  0.69716149,  0.69405975]])}

In [23]:
# baseline: the same model configuration fitted on the original predictor columns
lr_cv_orig = LogisticRegressionCV(solver='sag', cv=3, scoring="roc_auc", n_jobs=1, verbose=0)
lr_cv_orig.fit(trn_small[predictors], trn_small[target])

LogisticRegressionCV(Cs=10, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring='roc_auc', solver='sag', tol=0.0001,
           verbose=0)

In [24]:
# baseline ROC AUC, same layout: one row per CV fold, one column per candidate C
lr_cv_orig.scores_

{1: array([[ 0.59299193,  0.59396187,  0.59425118,  0.59425118,  0.59456809,
          0.59456569,  0.59426318,  0.59426198,  0.59426078,  0.59426078],
        [ 0.60491367,  0.6054895 ,  0.60537821,  0.60448785,  0.60511449,
          0.60510965,  0.60511086,  0.60510844,  0.60510723,  0.60510602],
        [ 0.59722464,  0.59796379,  0.59844526,  0.59844889,  0.5985747 ,
          0.5985747 ,  0.59859526,  0.5990852 ,  0.59908762,  0.59908641]])}

## Try with XGBoost

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
# add the label as a column of the loaded features (pandas aligns the assigned Series
# on the row index); this mutates `_features` in place
_features['TARGET'] = trainDataFrame[target]

In [None]:
# model inputs are now the generated feature columns (label and ID excluded)
predictors = [column for column in _features.columns
              if column != target and column != IDcol]

In [None]:
# the full data set runs out of memory; try with a subset (first 15,000 rows):
train = _features[:15000]

In [None]:
# helper func to cross validate XGB models:
def modelfit(alg, dtrain, predictors, silent=True, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and print/plot a training-set report.

    The label column is read from the module-level name ``target``.

    Parameters
    ----------
    alg : estimator
        Object providing ``set_params``, ``get_params``, ``get_xgb_params``,
        ``fit``, ``predict``, ``predict_proba`` and ``booster``.
    dtrain : DataFrame
        Training data containing the ``predictors`` columns and the label column.
    predictors : list
        Names of the columns used as model inputs.
    silent : bool
        Passed to ``alg.set_params``; its negation is passed to ``xgb.cv`` as
        ``show_progress``.
    useTrainCV : bool
        When true, run ``xgb.cv`` first and set ``n_estimators`` to the number
        of rows of its result.
    cv_folds : int
        Number of folds for ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience for ``xgb.cv``.

    Returns
    -------
    The fitted ``alg``.
    """
    alg.set_params(silent=silent)

    if useTrainCV:
        booster_params = alg.get_xgb_params()
        train_matrix = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cv_history = xgb.cv(
            booster_params,
            train_matrix,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics=['auc'],
            early_stopping_rounds=early_stopping_rounds,
            show_progress=not silent,
        )
        # use the number of rows of the CV result as the number of trees
        alg.set_params(n_estimators=cv_history.shape[0])

    feature_frame = dtrain[predictors]
    label_column = dtrain[target]

    # Fit the algorithm on the data
    alg.fit(feature_frame, label_column, eval_metric='auc')

    # Predict on the training set itself
    predicted_labels = alg.predict(feature_frame)
    predicted_proba = alg.predict_proba(feature_frame)[:,1]

    # Print model report (training-set figures only)
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(label_column.values, predicted_labels))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(label_column, predicted_proba))

    # Bar chart of the 50 largest f-scores
    importance = pd.Series(alg.booster().get_fscore())
    top_importance = importance.sort_values(ascending=False)[0:50]
    top_importance.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

    return alg

In [None]:
# settings of the first XGBoost model
xgb1_params = dict(
    learning_rate=0.1,
    n_estimators=1000,  # modelfit() replaces this with the CV result when useTrainCV is on
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_params)

modelfit(xgb1, train.to_dense(), predictors)

## Independent test from `/libs/Online-Random-Bit-Regression-FTRL/rbr_experiments/rbro.py`

In [None]:
# Coding up the algorithm from http://arxiv.org/abs/1501.02990 
# "Random Bits Regression: a Strong General Predictor for Big Data"

import random
from datetime import datetime

def create_var_subset(x,size=3):
	"""Return the positions of a random subset of the variables of sample ``x``.

	At most ``size`` distinct indices into ``x`` are drawn; when ``x`` has fewer
	than ``size`` variables, all of its indices are returned (in random order).
	"""
	# (1) Randomly select a small subset of variables, e.g. x1, x3, x6.
	n_vars = len(x)
	how_many = min(size, n_vars)
	return random.sample(range(n_vars), how_many)
	
def assign_weights(var_subset):
	"""Pair every selected variable index with a random weight.

	Returns a list of ``(weight, index)`` tuples, one per entry of
	``var_subset`` and in the same order.
	"""
	# (2) Randomly assign weights to each selected variables. The weights
	# are sampled from standard normal distribution, for example,
	# w1, w3, w6~N(0,1)
	# random.gauss(0, 1) draws from N(0, 1); random.random() would only give
	# uniform values in [0, 1), i.e. never a negative weight.
	return [(random.gauss(0.0, 1.0), i) for i in var_subset]
	
def obtain_weighted_sum(x, weighted_var_subset):
	"""Return the weighted sum of the selected variables of sample ``x``.

	``weighted_var_subset`` is an iterable of ``(weight, index)`` pairs; the
	result is the sum of ``weight * x[index]`` over all pairs (0 for no pairs).
	"""
	# (3) Obtain the weighted sum for each sample, for example
	# (w1*x1) + (w3*x3) + (w6*x6) = zi for the ith sample.
	total = 0
	for weight, index in weighted_var_subset:
		total = total + weight * x[index]
	return total
	
def pick_random_threshold(weighted_sums):
	# (4) Randomly pick one zi from the n generated as the threshold T.
	# `weighted_sums` must be a non-empty sequence: random.choice raises
	# IndexError on an empty one.
	return random.choice(weighted_sums)
	
def assign_bit(weighted_sum, threshold):
	"""Return 1 when ``weighted_sum`` reaches ``threshold``, otherwise 0."""
	# (5) Assign bits values to fk according to the threshold T
	# If zi >= T then 1 else 0
	return 1 if weighted_sum >= threshold else 0

def process(data, K=100, size=3):
	"""Generate ``K`` random binary intermediate features for every sample.

	Parameters
	----------
	data : sequence of samples
		Each sample is an indexable sequence of numeric variables; the number
		of variables is taken from the first sample.
	K : int
		Number of random bits to generate per sample.
	size : int
		Number of variables combined into each random bit.

	Returns
	-------
	list of lists
		One list per sample: a leading 1 (the intercept) followed by ``K``
		bits. An empty ``data`` yields an empty list.
	"""
	start = datetime.now()
	# The first feature is fixed to 1 to act as the intercept. It is set up
	# front so that it is present even when K == 0.
	data_bits = [[1] for _ in data]
	if len(data_bits) == 0:
		# nothing to transform (and no first sample to read the variable count from)
		return data_bits

	# The process is repeated K times.
	for k in range(K):
		var_subset = create_var_subset(data[0],size=size) # 1
		weighted_var_subset = assign_weights(var_subset) # 2
		weighted_sums = [obtain_weighted_sum(x, weighted_var_subset) for x in data] # 3

		random_threshold = pick_random_threshold(weighted_sums) # 4 (Try picking multiple thresholds or entropy)

		# reuse the sums computed in step 3 instead of recomputing them per sample
		for data_bit, weighted_sum in zip(data_bits, weighted_sums):
			data_bit.append(assign_bit(weighted_sum, random_threshold)) # 5

		if k % 1000 == 0:
			# progress: round number and elapsed time
			print("%s %s" % (k, datetime.now() - start))
	return data_bits

# fixed seed so the generated random bits are repeatable
random.seed(100)

from sklearn import datasets
digits = datasets.load_digits()
data = digits.data
y = digits.target
data_list = [list(row) for row in data]

# We generate ~10^4-10^6 random binary intermediate features for each sample.
data_bits = process(data_list, 10000, 3)
	
from sklearn import linear_model, ensemble, svm, neighbors, cross_validation
import numpy as np

# Select predictive intermediate features by regularized linear/logistic regression.

# Every classifier below is cross-validated (20 folds) twice: first without the
# intermediate features (`data`), then with them (`data_bits`).
benchmark_classifiers = [
	# KNN Classifier
	neighbors.KNeighborsClassifier(),
	# SGD Classifier
	linear_model.SGDClassifier(loss="log", penalty="l2", n_iter=20, random_state=1, n_jobs=-1),
	# Logistic Regression
	linear_model.LogisticRegression(),
	# Extra Trees
	ensemble.ExtraTreesClassifier(n_estimators=500,random_state=1,n_jobs=-1),
	# Standard RF
	ensemble.RandomForestClassifier(n_estimators=500,n_jobs=-1,random_state=1),
	# Linear SVM
	svm.SVC(kernel="linear"),
]

for clf in benchmark_classifiers:
	for features in (data, data_bits):
		start = datetime.now()
		scores = cross_validation.cross_val_score(clf, features, y, cv=20)
		# same report as before: classifier and input shape, per-fold scores,
		# mean score, elapsed time, blank separator line
		print("%s %s" % (clf, np.array(features).shape))
		print(scores)
		print(scores.mean())
		print(datetime.now() - start)
		print("")

## with Evio.RBR

In [None]:
from sklearn import datasets
# load the digits data set and wrap its feature matrix in a DataFrame
# (`data` is rebound from an array to a DataFrame on the second line)
data, y = datasets.load_digits().data, datasets.load_digits().target
data = pd.DataFrame(data=data)

# OR: use the training data from `./data/train.csv` instead


In [2]:
# alternative input: the training CSV (this rebinds `data`)
data = pd.read_csv('./data/train.csv')

In [3]:
import time
import sys
# NOTE(review): machine-specific absolute path put first on the import path;
# presumably this is where the Evio package imported below lives -- confirm and
# make it configurable
sys.path.insert(0, '/Users/gokhan/libs')

from Evio import RBR
import pandas as pd

# time the construction of the RBR object plus the feature generation
n_features = 100
start = time.time()
rbr = RBR(data)
rbr.generate_features(n_features)
elapsed = time.time() - start
print("generated %d features in %.1f secs" % (n_features, elapsed))

Generated 100 features in 1.1 secs
generated 100 features in 1.1 secs


In [4]:
# keep a reference to the current features for the comparison after the schema round-trip
_features = rbr._features

In [5]:
# write the schema to ./temp/rbr_dump
rbr.dump_schema('./temp/rbr_dump')

In [6]:
# read the schema back from the same location
rbr.load_schema('./temp/rbr_dump')

In [7]:
# NOTE(review): `_features` was taken from `rbr._features` before the dump/load, so this
# is trivially True unless load_schema() replaces that attribute -- confirm what
# load_schema() actually touches
np.array_equal(_features, rbr._features)

True

In [8]:
# list the schema keys (private attribute)
rbr._feature_schema.keys()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]