In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys

import joblib

from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD
from keras.regularizers import l2

sys.path.insert(0, '/Users/gokhan/libs')
from Evio.FeatureGenerator import FeatureGenerator

Using Theano backend.


In [None]:
# Load the raw training table.
trainDataFrame = pd.read_csv('./data/train.csv')

# Constant columns: a standard deviation of exactly zero.
colsToRemove1 = [name for name in trainDataFrame.columns
                 if trainDataFrame[name].std() == 0]
trainDataFrame.drop(colsToRemove1, axis=1, inplace=True)

# Duplicate columns: any column whose values exactly equal those of a
# column appearing earlier in the frame.
colsToRemove2 = []
columns = trainDataFrame.columns
for i, left in enumerate(columns[:-1]):
    v = trainDataFrame[left].values
    for right in columns[i + 1:]:
        if np.array_equal(v, trainDataFrame[right].values):
            colsToRemove2.append(right)
trainDataFrame.drop(colsToRemove2, axis=1, inplace=True)

# Separate the label column from the feature columns (ID is not a feature).
trainLabels = trainDataFrame['TARGET']
trainFeatures = trainDataFrame.drop(['ID', 'TARGET'], axis=1)

In [None]:
# Build a less skewed data set: every TARGET == 1 row plus the first
# 3500 TARGET == 0 rows (in file order).
trainNonSkewed1 = trainDataFrame[trainDataFrame['TARGET'] == 1]
trainNonSkewed0 = trainDataFrame[trainDataFrame['TARGET'] == 0].head(3500)
trainNonSkewed = pd.concat([trainNonSkewed1, trainNonSkewed0])

# Feature matrix (without ID/TARGET) and label vector of the subset.
trainNonSkewedFeatures = trainNonSkewed.drop(['ID', 'TARGET'], axis=1)
trainNonSkewedLabels = trainNonSkewed['TARGET']

In [None]:
# First 3500 rows of the negative class, kept under a separate name.
trainNonSkewed0_1 = trainDataFrame[trainDataFrame['TARGET'] == 0].head(3500)

In [None]:
# Show the (rows, columns) shape of the full frame and of both class subsets.
for frame in (trainDataFrame, trainNonSkewed0_1, trainNonSkewed1):
    print(frame.shape)

In [None]:
# Hold out 30% of the balanced subset for validation; the seed is fixed so
# the split is repeatable.
split = cross_validation.train_test_split(
    trainNonSkewedFeatures,
    trainNonSkewedLabels,
    test_size=0.3,
    random_state=1,
)
X_train, X_valid, y_train, y_valid = split

In [None]:
# Standardise the features.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only, then applied to both
# splits, so no validation statistics leak into training.
scaler = StandardScaler().fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train))
X_valid_scaled = pd.DataFrame(scaler.transform(X_valid))

In [None]:
import network2

In [None]:
def vectorized_result(j, n_classes=2):
    """Return a one-hot column vector for class index ``j``.

    Parameters
    ----------
    j : int
        Class index; must satisfy ``0 <= j < n_classes`` (NumPy raises
        ``IndexError`` otherwise). Integer-valued NumPy scalars are accepted.
    n_classes : int, optional
        Length of the vector. Defaults to 2, the binary TARGET used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_classes, 1)`` holding 1.0 at row ``j`` and 0.0
        elsewhere.
    """
    e = np.zeros((n_classes, 1))
    # int() lets labels that arrive as NumPy scalars or floats index cleanly.
    e[int(j)] = 1.0
    return e

In [None]:
# Convert the scaled frames into lists of (x, y) pairs for network2.
# Each x becomes a column vector; reshape(-1, 1) takes the feature count
# from the data instead of hard-coding it.
X_train_np = X_train_scaled.values
y_train_np = y_train.values
# Training targets are one-hot encoded.
data_train = [(np.reshape(x, (-1, 1)), vectorized_result(y))
              for x, y in zip(X_train_np, y_train_np)]

X_valid_np = X_valid_scaled.values
y_valid_np = y_valid.values
# Validation targets stay as raw class labels.
data_valid = [(np.reshape(x, (-1, 1)), y)
              for x, y in zip(X_valid_np, y_valid_np)]

In [None]:
# Number of training and validation examples.
for dataset in (data_train, data_valid):
    print(len(dataset))

In [None]:
# Two hidden layers of 100 units and a 2-unit output. The input layer size
# is read from the first training example so it always matches the data.
net = network2.Network([len(data_train[0][0]), 100, 100, 2])

In [None]:
# Train the network and collect the four monitoring series it returns.
monitor_flags = dict(
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
)
e_cost, e_acc, tr_cost, tr_acc = net.SGD(
    data_train,
    epochs=250,
    mini_batch_size=10,
    eta=.01,
    lmbda=8.0,
    evaluation_data=data_valid,
    **monitor_flags
)

In [None]:
# Display the "val_acc" entry stored on the trained network
# (presumably the validation accuracy history — confirm in network2).
net.scores["val_acc"]

In [None]:
# Validation vs. training cost as returned by net.SGD.
plt.plot(e_cost, label='Validation Cost')
plt.plot(tr_cost, label='Training Cost')
plt.xlabel('Epoch')  # assumes one entry per epoch — TODO confirm in network2.SGD
plt.ylabel('Cost')
# Draw the legend so the two labelled curves can be told apart.
plt.legend(loc="best")
plt.show()

In [None]:
# Normalise the accuracy series by the actual set sizes instead of
# hard-coded counts (assumes e_acc / tr_acc hold numbers of correctly
# classified examples — TODO confirm in network2).
# float() keeps the division fractional under Python 2.
n_valid = float(len(data_valid))
n_train = float(len(data_train))
plt.plot(np.array(e_acc) / n_valid, label='Validation Accuracy')
plt.plot(np.array(tr_acc) / n_train, label='Training Accuracy')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Summary statistics of the balanced subset's labels; the mean is the
# fraction of TARGET == 1 rows.
trainNonSkewedLabels.describe()

## TRY WITH 10,000 RANDOM FEATURES FROM RCE3

In [None]:
import sys
sys.path.insert(0, "/Users/gokhan/libs")
from Evio.evolver import RandomConceptEvolver3

# Value handed to n_attrs_per_feature (semantics defined by Evio;
# presumably (number of attributes, probability) pairs — confirm).
attrs_per_feature = [(1, 0.05), (2, 0.25), (3, 0.25), (4, 0.25), (5, 0.2)]

rce3 = RandomConceptEvolver3(
    n_concepts=10000,
    n_gens=1,
    random_state=1,
    mate_selection="roulette",
    eval_features_every_n_gens=0,
    eliminate_perc_features=0.0,
    eval_metric="auc",
    n_attrs_per_feature=attrs_per_feature,
)
rce3.n_samples = trainNonSkewedFeatures.shape[0]
# Only the feature-generation step is run; a full fit is not performed:
#rce3.fit(trainNonSkewedFeatures, trainNonSkewedLabels,2,1)
rce3._generate_features(trainNonSkewedFeatures)

In [None]:
# Split the generated features with the same proportions and seed as the
# split of the original features.
X_train_rce3, X_valid_rce3, y_train, y_valid = cross_validation.train_test_split(
    rce3._features, trainNonSkewedLabels, test_size=0.3, random_state=1)

# Take the label arrays from THIS split, so the cell does not depend on
# arrays left behind by an earlier cell.
y_train_np = y_train.values
y_valid_np = y_valid.values

# Lists of (x, y) pairs for network2; reshape(-1, 1) takes the feature
# count from the data. Training targets are one-hot encoded.
X_train_rce3_np = X_train_rce3
data_train_rce3 = [(np.reshape(x, (-1, 1)), vectorized_result(y))
                   for x, y in zip(X_train_rce3_np, y_train_np)]

# Validation targets stay as raw class labels.
X_valid_rce3_np = X_valid_rce3
data_valid_rce3 = [(np.reshape(x, (-1, 1)), y)
                   for x, y in zip(X_valid_rce3_np, y_valid_np)]

In [None]:
# Number of training and validation examples built from the RCE3 features.
for dataset in (data_train_rce3, data_valid_rce3):
    print(len(dataset))

In [None]:
# One hidden layer of 100 units and a 2-unit output. The input layer size
# is read from the first training example so it always matches the data.
net = network2.Network([len(data_train_rce3[0][0]), 100, 2])

In [None]:
# Train on the RCE3 features and collect the four monitoring series.
monitor_flags = dict(
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
)
e_cost, e_acc, tr_cost, tr_acc = net.SGD(
    data_train_rce3,
    epochs=250,
    mini_batch_size=10,
    eta=.0005,
    lmbda=5.0,
    evaluation_data=data_valid_rce3,
    **monitor_flags
)

## TRY WITH BEST ~3900 BINARY FEATURES 

In [None]:
# Load the pre-computed transformed feature table (it contains a TARGET
# column, which later cells filter on).
trainTransformedDF = pd.read_csv('./data/trainTransformed.csv')

In [None]:
# (rows, columns) of the transformed table, TARGET column included.
trainTransformedDF.shape

In [None]:
# Partition the transformed table by class label.
dataTarget0 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 0]
dataTarget1 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 1]

def getBalancedTrainAndValidationSets(scale=False, data0=None, data1=None,
                                      n_train_per_class=1500, random_state=None):
    """Draw a shuffled, class-balanced training set and a validation set.

    Both class frames are shuffled. The first ``n_train_per_class`` rows of
    each go to training; validation receives the next ``n_train_per_class``
    rows of class 0 and all remaining rows of class 1.

    Parameters
    ----------
    scale : bool, optional
        If True, standardise the features with a ``StandardScaler`` fitted
        on the training features only. Scaled outputs are new DataFrames
        with default integer index and column labels.
    data0, data1 : pandas.DataFrame, optional
        Rows of class 0 / class 1, each with a ``TARGET`` column. Default to
        the notebook-level ``dataTarget0`` / ``dataTarget1``.
    n_train_per_class : int, optional
        Training rows taken from each class (default 1500).
    random_state : int, optional
        Seed for a private ``RandomState``. ``None`` uses NumPy's global
        random state, so ``np.random.seed`` still controls the shuffle.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (without ``TARGET``) and the matching label Series.
    """
    if data0 is None:
        data0 = dataTarget0
    if data1 is None:
        data1 = dataTarget1

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # DataFrame.reindex returns a new object, so a bare reindex call has no
    # effect. Index by a permutation and keep the result so the rows are
    # really shuffled, without touching the input frames.
    shuffled0 = data0.iloc[rng.permutation(len(data0))]
    shuffled1 = data1.iloc[rng.permutation(len(data1))]

    n = n_train_per_class
    trn = pd.concat([shuffled0.iloc[:n], shuffled1.iloc[:n]])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    val = pd.concat([shuffled0.iloc[n:2 * n], shuffled1.iloc[n:]])
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    if scale:
        # Fit on the training features only, then apply to both sets.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = pd.DataFrame(scaler.transform(X_train))
        X_val = pd.DataFrame(scaler.transform(X_val))

    return X_train, y_train, X_val, y_val

In [None]:
# Draw one balanced train/validation split; note the return order is
# (X_train, y_train, X_val, y_val).
X_train_rce3, y_train, X_valid_rce3, y_valid = getBalancedTrainAndValidationSets()

In [None]:
# Shapes of the features and labels for the training and validation splits.
for part in (X_train_rce3, y_train, X_valid_rce3, y_valid):
    print(part.shape)

In [None]:

# Convert the balanced split into lists of (x, y) pairs for network2.
# Each x becomes a column vector; reshape(-1, 1) takes the feature count
# from the data instead of hard-coding it.
X_train_rce3_np = X_train_rce3.values
y_train_np = y_train.values
# Training targets are one-hot encoded.
data_train_rce3 = [(np.reshape(x, (-1, 1)), vectorized_result(y))
                   for x, y in zip(X_train_rce3_np, y_train_np)]

X_valid_rce3_np = X_valid_rce3.values
y_valid_np = y_valid.values
# Validation targets stay as raw class labels.
data_valid_rce3 = [(np.reshape(x, (-1, 1)), y)
                   for x, y in zip(X_valid_rce3_np, y_valid_np)]

In [None]:
# Number of training and validation examples in the balanced split.
for dataset in (data_train_rce3, data_valid_rce3):
    print(len(dataset))

In [None]:
# One hidden layer of 100 units and a 2-unit output. The input layer size
# is read from the first training example so it always matches the data.
net = network2.Network([len(data_train_rce3[0][0]), 100, 2])
# Train without L2 regularisation (lmbda=0.0) and collect the four
# monitoring series returned by SGD.
e_cost, e_acc, tr_cost, tr_acc = net.SGD(data_train_rce3, epochs=50, mini_batch_size=10, eta=.01, lmbda=0.0,
                                            evaluation_data=data_valid_rce3, monitor_evaluation_cost=True,
                                            monitor_evaluation_accuracy=True, monitor_training_cost=True,
                                            monitor_training_accuracy=True)

## KERAS cross-validated with best 3920 features or original scaled features
For optimizers: http://keras.io/optimizers/#usage-of-optimizers
For regularizers: http://keras.io/regularizers/#usage-of-regularizers


In [10]:
def vectorized_result(j, n_classes=2):
    """Return a one-hot column vector for class index ``j``.

    Parameters
    ----------
    j : int
        Class index; must satisfy ``0 <= j < n_classes`` (NumPy raises
        ``IndexError`` otherwise). Integer-valued NumPy scalars are accepted.
    n_classes : int, optional
        Length of the vector. Defaults to 2, the binary TARGET used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_classes, 1)`` holding 1.0 at row ``j`` and 0.0
        elsewhere.
    """
    e = np.zeros((n_classes, 1))
    # int() lets labels that arrive as NumPy scalars or floats index cleanly.
    e[int(j)] = 1.0
    return e

### BEST 3920 features

In [None]:
# Option 1: use the pre-computed transformed feature table (with its TARGET column).
trainTransformedDF = pd.read_csv('./data/trainTransformed.csv')

In [None]:
# (rows, columns) of the transformed table, TARGET column included.
trainTransformedDF.shape

### ORIGINAL SCALED FEATURES

In [None]:
# Reload the raw training table.
trainDataFrame = pd.read_csv('./data/train.csv')

# Constant columns: a standard deviation of exactly zero.
colsToRemove1 = [name for name in trainDataFrame.columns
                 if trainDataFrame[name].std() == 0]
trainDataFrame.drop(colsToRemove1, axis=1, inplace=True)

# Duplicate columns: for each column, collect every later column holding
# exactly the same values.
colsToRemove2 = []
columns = trainDataFrame.columns
for i in range(len(columns) - 1):
    v = trainDataFrame[columns[i]].values
    colsToRemove2.extend(
        columns[j] for j in range(i + 1, len(columns))
        if np.array_equal(v, trainDataFrame[columns[j]].values))
trainDataFrame.drop(colsToRemove2, axis=1, inplace=True)

trainLabels = trainDataFrame['TARGET']
# In this variant the cleaned original columns (TARGET kept, ID removed)
# become the frame used by the balanced-split helper.
#trainFeatures = trainDataFrame.drop(['ID','TARGET'], axis=1)
trainTransformedDF = trainDataFrame.drop(['ID'], axis=1)

### 10,000 RANDOM FEATURES (not checked for uniqueness)

In [2]:
# Reload the raw training table.
trainDataFrame = pd.read_csv('./data/train.csv')

# Constant columns: a standard deviation of exactly zero.
colsToRemove1 = [name for name in trainDataFrame.columns
                 if trainDataFrame[name].std() == 0]
trainDataFrame.drop(colsToRemove1, axis=1, inplace=True)

# Duplicate columns: compare every column with each column after it and
# record the later one whenever the value arrays are identical.
colsToRemove2 = []
columns = trainDataFrame.columns
for i, reference in enumerate(columns[:-1]):
    v = trainDataFrame[reference].values
    for candidate in columns[i + 1:]:
        if np.array_equal(v, trainDataFrame[candidate].values):
            colsToRemove2.append(candidate)
trainDataFrame.drop(colsToRemove2, axis=1, inplace=True)

# In this variant the cleaned original features are kept separately so
# random features can be generated from them.
trainLabels = trainDataFrame['TARGET']
trainFeatures = trainDataFrame.drop(['ID', 'TARGET'], axis=1)
#trainTransformedDF = trainDataFrame.drop(['ID'], axis=1)

In [3]:
# Report the shapes of the cleaned frame, its labels and its features.
print("trainDataFrame.shape: %s" % str(trainDataFrame.shape))
print("trainLabels.shape: %s" % str(trainLabels.shape))
print("trainFeatures.shape: %s" % str(trainFeatures.shape))

trainDataFrame.shape: (76020, 308)
trainLabels.shape: (76020,)
trainFeatures.shape: (76020, 306)


In [4]:

# Generate the random features in batches and merge the results.
n_features = 1000   # features generated per batch
n_batches = 10      # 10 batches x 1000 features = 10,000 features
schema_10K = {}
# Collect the batches in a list and concatenate once after the loop;
# concatenating inside the loop re-copies all earlier columns each time.
# The leading empty frame fixes the row index, as the incremental version did.
feature_blocks = [pd.DataFrame(index=range(trainFeatures.shape[0]))]
for i in range(n_batches):
    start = time.time()
    # NOTE(review): every batch uses random_state=1; whether the batches
    # differ depends on how FeatureGenerator uses last_feat_ix — confirm
    # that the batches are not identical.
    fg = FeatureGenerator(n_attrs_per_feature=[(1, 0.1), (2, 0.25), (3, 0.25), (4, 0.25), (5, 0.15)], random_state=1)
    # Offset the generator's feature counter by the features already made.
    fg.last_feat_ix = i*n_features
    schema, features = fg._generate_random_features(n_features, eval_score=False, X=trainFeatures)
    print("took %.1f mins to generate %d features" % ((time.time()-start)/60., n_features))
    schema_10K.update(schema)
    feature_blocks.append(features)
    print("took total %.1f mins to complete step %d" % ((time.time()-start)/60., i))

features_10K = pd.concat(feature_blocks, axis=1)
    

took 1.2 mins to generate 1000 features
took total 1.3 mins to complete step 0
took 2.6 mins to generate 1000 features
took total 2.6 mins to complete step 1
took 3.9 mins to generate 1000 features
took total 4.0 mins to complete step 2
took 5.3 mins to generate 1000 features
took total 5.4 mins to complete step 3
took 6.7 mins to generate 1000 features
took total 6.9 mins to complete step 4
took 8.2 mins to generate 1000 features
took total 8.6 mins to complete step 5
took 9.8 mins to generate 1000 features
took total 10.3 mins to complete step 6
took 11.5 mins to generate 1000 features
took total 12.1 mins to complete step 7
took 13.3 mins to generate 1000 features
took total 14.0 mins to complete step 8
took 15.3 mins to generate 1000 features
took total 16.0 mins to complete step 9


In [5]:
# Check the size of the generated feature table and of its schema.
print("features_10K.shape: %s" % str(features_10K.shape))
print("len(schema_10K): %d" % len(schema_10K))

features_10K.shape: (76020, 10000)
len(schema_10K): 10000


In [6]:
# Append the TARGET labels as an extra column of the generated features
# (column-wise concat, aligned on the row index).
trainTransformedDF = pd.concat([features_10K, trainLabels], axis=1)

In [7]:
# Shape of the features-plus-TARGET frame.
print("trainTransformedDF.shape: %s" % str(trainTransformedDF.shape))

trainTransformedDF.shape: (76020, 10001)


### register cross validation function (with ability to scale data):

In [8]:
# Partition the active feature frame by class label.
dataTarget0 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 0]
dataTarget1 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 1]

def getBalancedTrainAndValidationSets(scale=False, data0=None, data1=None,
                                      n_train_per_class=1500, random_state=None):
    """Draw a shuffled, class-balanced training set and a validation set.

    Both class frames are shuffled. The first ``n_train_per_class`` rows of
    each go to training; validation receives the next ``n_train_per_class``
    rows of class 0 and all remaining rows of class 1.

    Parameters
    ----------
    scale : bool, optional
        If True, standardise the features with a ``StandardScaler`` fitted
        on the training features only. Scaled outputs are new DataFrames
        with default integer index and column labels.
    data0, data1 : pandas.DataFrame, optional
        Rows of class 0 / class 1, each with a ``TARGET`` column. Default to
        the notebook-level ``dataTarget0`` / ``dataTarget1``.
    n_train_per_class : int, optional
        Training rows taken from each class (default 1500).
    random_state : int, optional
        Seed for a private ``RandomState``. ``None`` uses NumPy's global
        random state, so ``np.random.seed`` still controls the shuffle.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (without ``TARGET``) and the matching label Series.
    """
    if data0 is None:
        data0 = dataTarget0
    if data1 is None:
        data1 = dataTarget1

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # DataFrame.reindex returns a new object, so a bare reindex call has no
    # effect. Index by a permutation and keep the result so the rows are
    # really shuffled, without touching the input frames.
    shuffled0 = data0.iloc[rng.permutation(len(data0))]
    shuffled1 = data1.iloc[rng.permutation(len(data1))]

    n = n_train_per_class
    trn = pd.concat([shuffled0.iloc[:n], shuffled1.iloc[:n]])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    val = pd.concat([shuffled0.iloc[n:2 * n], shuffled1.iloc[n:]])
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    if scale:
        # Fit on the training features only, then apply to both sets.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = pd.DataFrame(scaler.transform(X_train))
        X_val = pd.DataFrame(scaler.transform(X_val))

    return X_train, y_train, X_val, y_val

## MANUAL EVALUATION

In [None]:
# Checkpoint callback that writes the model WEIGHTS only. The architecture
# must be rebuilt before calling model.load_weights(weights_path).
# With save_best_only=True the file is rewritten only when the monitored
# validation accuracy improves.
model_id = "./models/keras_best_3920_50_2_sigm_softm_bin_crosent_sgd"
checkpoint_options = dict(monitor='val_acc', verbose=1, save_best_only=True)
checkpointer = ModelCheckpoint(filepath=model_id+".hdf5", **checkpoint_options)



In [None]:
# The input width is taken from the active feature frame (every column
# except TARGET), so the model matches whichever feature set was loaded.
n_inputs = trainTransformedDF.shape[1] - 1

# One hidden sigmoid layer of 50 units followed by a 2-way softmax output.
model = Sequential()
model.add(Dense(output_dim=50, input_dim=n_inputs))
model.add(Activation('sigmoid')) # try also 'relu'
model.add(Dense(output_dim=2))
model.add(Activation("softmax"))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Persist the architecture next to the weight checkpoint.
config = model.get_config()
joblib.dump(config, model_id+'.joblib')
# to load config:
# config = joblib.load(model_id+'.joblib')
# to load (initiate) a saved model:
# model = Sequential.from_config(config)


In [None]:
# Draw one balanced split (unscaled features).
X_train_rce3, y_train, X_valid_rce3, y_valid = getBalancedTrainAndValidationSets()

# One-hot encode the labels and flatten each (2, 1) column vector into a
# row, giving arrays of shape (n_samples, 2) for Keras.
y_train_vect = np.array(
    [vectorized_result(label) for label in y_train]).reshape(len(y_train), 2)
y_valid_vect = np.array(
    [vectorized_result(label) for label in y_valid]).reshape(len(y_valid), 2)


In [None]:
# Train for 5 epochs, validating on the held-out split and checkpointing
# through the callback.
fit_options = dict(
    verbose=1,
    batch_size=32,
    nb_epoch=5,
    validation_data=(X_valid_rce3.as_matrix(), y_valid_vect),
    callbacks=[checkpointer],
)
hist = model.fit(X_train_rce3.as_matrix(), y_train_vect, **fit_options)

In [None]:
# The input width is taken from the active feature frame (every column
# except TARGET), so the model matches whichever feature set was loaded.
n_inputs = trainTransformedDF.shape[1] - 1

# Rebuild a fresh, untrained copy of the network: one hidden sigmoid layer
# of 50 units followed by a 2-way softmax output.
model = Sequential()
model.add(Dense(output_dim=50, input_dim=n_inputs))
model.add(Activation('sigmoid')) # try also 'relu'
model.add(Dense(output_dim=2))
model.add(Activation("softmax"))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
# New checkpoint callback writing to the same weights file.
checkpointer = ModelCheckpoint(filepath=model_id+".hdf5", monitor='val_acc', verbose=1, save_best_only=True)


In [None]:
# Restore a saved model: architecture from the joblib config, weights
# from the HDF5 checkpoint.
config = joblib.load(model_id + '.joblib')
model = Sequential.from_config(config)
model.load_weights(model_id + '.hdf5')

# Compile settings are not stored in the config file, so they are repeated
# here and must stay identical to the ones used for training.
model.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)


In [None]:
# Evaluate on the validation split; the result holds the loss followed by
# the compiled metrics.
validation_inputs = X_valid_rce3.as_matrix()
loss_and_metrics = model.evaluate(validation_inputs, y_valid_vect, verbose=1)
print(loss_and_metrics)

In [None]:
# Second output column for every validation row, i.e. the predicted
# probability of class 1.
model.predict_proba(X_valid_rce3.as_matrix())[:,1]

In [None]:
# Validation ROC AUC, computed from the true labels and the class-1 probabilities.
roc_auc_score(y_valid, model.predict_proba(X_valid_rce3.as_matrix())[:,1])

## CROSS VALIDATION

In [1]:
# Repeated hold-out evaluation: draw n_folds balanced train/validation
# splits, train a fresh network on each and average the ROC AUC scores.
# Requires the imports cell and the helper-definition cells to have been
# run first (time, Keras classes, getBalancedTrainAndValidationSets, ...).
n_folds = 5
scaled = False
trn_scores = []
val_scores = []
# Every fold writes its weights to the same checkpoint file.
model_id = "./models/keras/last_keras_model_checkpoint"
start = time.time()
for i in range(n_folds):
    # save_best_only=False: the weights are written after every epoch, so
    # the file ends up holding the last epoch of the last fold.
    checkpointer = ModelCheckpoint(filepath=model_id+".hdf5", monitor='val_loss', verbose=1, save_best_only=False)
    # To keep only the best epoch (lowest monitored value) instead:
    # checkpointer = ModelCheckpoint(filepath=model_id+".hdf5", verbose=1, save_best_only=True)

    # get fold
    X_train_rce3, y_train, X_valid_rce3, y_valid = getBalancedTrainAndValidationSets(scale=scaled)
    # One-hot labels reshaped to (n_samples, 2) for the softmax output.
    y_train_vect = np.array([vectorized_result(x) for x in y_train])
    y_train_vect = np.reshape(y_train_vect, (y_train_vect.shape[0], 2))
    y_valid_vect = np.array([vectorized_result(x) for x in y_valid])
    y_valid_vect = np.reshape(y_valid_vect, (y_valid_vect.shape[0], 2))

    # Input width comes from the fold itself, so the model matches
    # whichever feature set is currently loaded.
    n_inputs = X_train_rce3.shape[1]

    # create Sequential model
    model = Sequential()
    # hidden layer (further Dense(100) + sigmoid layers can be stacked here)
    model.add(Dense(output_dim=100, input_dim=n_inputs, W_regularizer=l2(0.01)))
    model.add(Activation('sigmoid'))
    # output layer
    model.add(Dense(output_dim=2, W_regularizer=l2(0.01)))
    model.add(Activation("softmax"))
    # alternative optimizer: SGD(lr=0.005, momentum=0.1, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

    model.fit(X_train_rce3.values, y_train_vect, verbose=0, batch_size=10,
              nb_epoch=100, validation_data=(X_valid_rce3.values, y_valid_vect), callbacks=[checkpointer])

    # AUC is computed from the predicted probability of class 1.
    trn_score = roc_auc_score(y_train, model.predict_proba(X_train_rce3.values)[:,1])
    val_score = roc_auc_score(y_valid, model.predict_proba(X_valid_rce3.values)[:,1])
    print("train auc: %.4f" % trn_score)
    print("validation auc: %.4f" % val_score)
    trn_scores.append(trn_score)
    val_scores.append(val_score)

# save model architecture
#config = model.get_config()
#joblib.dump(config, model_id+'.joblib')

print("-----------------------")
print("%d fold cross validation with %s data" % (n_folds, "SCALED" if scaled else "NON-SCALED"))
print("Train Mean: %.4f" % np.mean(trn_scores))
print("Validation Mean: %.4f" % np.mean(val_scores))
print("Total time (mins): %.1f" % ((time.time()-start)/60.))

NameError: name 'time' is not defined

## CREATE SUBMISSION

In [None]:
testDataFrame = pd.read_csv('./data/test.csv')

# Remove the same columns as in training: first the constant columns,
# then the duplicate columns.
testDataFrame.drop(colsToRemove1, axis=1, inplace=True)
testDataFrame.drop(colsToRemove2, axis=1, inplace=True)

# IDs and features must come from the TEST frame. Only non-feature
# columns that are actually present are dropped, so this works whether or
# not the test file carries a TARGET column.
testIDs = testDataFrame['ID']
nonFeatureCols = [c for c in ('ID', 'TARGET') if c in testDataFrame.columns]
testFeatures = testDataFrame.drop(nonFeatureCols, axis=1)