In [1]:
import pandas as pd
import numpy as np

import joblib

import sys
sys.path.insert(0, '/Users/gokhan/libs')
from Evio.FeatureGenerator import FeatureGenerator

In [2]:
# Load the raw training table from the project's data directory.
trainDataFrame = pd.read_csv('./data/train.csv')

# Remove constant columns: a column whose standard deviation is exactly zero
# shows no variation across its values, so it is collected here and dropped.
colsToRemove1 = [
    name for name in trainDataFrame.columns
    if trainDataFrame[name].std() == 0
]

trainDataFrame.drop(colsToRemove1, axis=1, inplace=True)

# remove duplicate columns
# Each column is compared (exact, element-wise) against every later column;
# when two match, the later one is flagged for removal and the first is kept.
# Columns that are already flagged are skipped on both sides of the
# comparison: anything equal to a flagged column is also equal to the earlier
# column it duplicates, so it has been flagged already. This keeps
# colsToRemove2 free of repeated names and avoids redundant full-column
# comparisons when three or more columns are identical.
colsToRemove2 = []
alreadyFlagged = set()
columns = trainDataFrame.columns
for i in range(len(columns) - 1):
    if columns[i] in alreadyFlagged:
        continue
    v = trainDataFrame[columns[i]].values
    for j in range(i + 1, len(columns)):
        if columns[j] in alreadyFlagged:
            continue
        if np.array_equal(v, trainDataFrame[columns[j]].values):
            alreadyFlagged.add(columns[j])
            colsToRemove2.append(columns[j])

trainDataFrame.drop(colsToRemove2, axis=1, inplace=True)

In [3]:
# Create a homogeneous, non-skewed data set: all TARGET == 1 rows combined
# with one fixed-size window of TARGET == 0 rows.
negativeOffset = 10500  # increment by 3500 to select the next window
negativeCount = 3500

trainNonSkewed1 = trainDataFrame[trainDataFrame.TARGET == 1]
allNegatives = trainDataFrame[trainDataFrame.TARGET == 0]
trainNonSkewed0 = allNegatives[negativeOffset:negativeOffset + negativeCount]
trainNonSkewed = pd.concat([trainNonSkewed1, trainNonSkewed0])

# Split into model inputs and labels; ID is removed together with TARGET.
trainNonSkewedFeatures = trainNonSkewed.drop(['ID', 'TARGET'], axis=1)
trainNonSkewedLabels = trainNonSkewed['TARGET']

In [4]:
# Show (rows, columns) for the cleaned frame and for each class slice.
# A single parenthesised argument prints identically under Python 2's print
# statement and Python 3's print function.
for frame in (trainDataFrame, trainNonSkewed0, trainNonSkewed1):
    print(frame.shape)

(76020, 308)
(3500, 308)
(3008, 308)


In [5]:
# Show the dimensions of the feature matrix and of the label vector.
for part in (trainNonSkewedFeatures, trainNonSkewedLabels):
    print(part.shape)

(6508, 306)
(6508,)


In [None]:
# Instantiate the project-local FeatureGenerator (imported from Evio) with a
# fixed random_state value.
# NOTE(review): n_iters presumably bounds the number of generation rounds —
# confirm against the Evio library.
fg = FeatureGenerator(n_iters=100, random_state=1)

In [None]:
# Fit the generator on the non-skewed features/labels, requesting
# n_features=130. The result is not assigned to a name in this cell.
# NOTE(review): the loop cell further down unpacks fit_batch's result as
# (schema, features) — confirm the return contract in the Evio library.
fg.fit_batch(trainNonSkewedFeatures, trainNonSkewedLabels, n_features=130)

In [None]:
#fg.fit(trainNonSkewedFeatures, trainNonSkewedLabels, warm_start=True)

In [None]:
# Save the fitted _feature_schema (uncomment to run):
# ---------------------
# joblib.dump(fg._feature_schema, './models/features/_feature_schema_non_skewed_3_130_0.8429.joblib')
# To load a previously saved schema:
# _feature_schema = joblib.load('./models/features/_feature_schema_non_skewed_2_130_0.8531.joblib')


## CREATE ALL IN LOOP:

In [16]:
# create all non-skewed features iteratively:
# Each fold pairs every TARGET == 1 row with a different consecutive window of
# TARGET == 0 rows, fits a fresh FeatureGenerator on that subset and saves the
# schema returned by fit_batch to its own joblib file.
nFolds = 20              # NOTE: 20 is also hard-coded in the message and file name below
negativesPerFold = 3650  # size of each window of TARGET == 0 rows

# Neither class filter depends on the fold, so both are computed once here
# instead of re-filtering the whole frame on every iteration.
trainNonSkewed1 = trainDataFrame[trainDataFrame.TARGET == 1]
trainNegatives = trainDataFrame[trainDataFrame.TARGET == 0]

for i in range(nFolds):
    # Single parenthesised argument: prints the same text on Python 2 and 3.
    print("processing fold %d of 20" % (i+1))

    start = i * negativesPerFold
    # if last batch, take all samples till the end (open-ended slice)
    stop = None if i == nFolds - 1 else start + negativesPerFold
    trainNonSkewed0 = trainNegatives[start:stop]
    trainNonSkewed = pd.concat([trainNonSkewed1, trainNonSkewed0])

    trainNonSkewedLabels = trainNonSkewed['TARGET']
    trainNonSkewedFeatures = trainNonSkewed.drop(['ID','TARGET'], axis=1)

    fg = FeatureGenerator(n_iters=200, random_state=1)
    schema, features = fg.fit_batch(trainNonSkewedFeatures, trainNonSkewedLabels, n_features=250)
    # NOTE(review): the file name says "130_features" although n_features=250
    # is requested above. The name is kept unchanged because the merge cell
    # loads these exact paths.
    joblib.dump(schema, './models/features/schema_non_skewed_130_features_%d_of_20.joblib' % (i+1))

processing fold 1 of 20
[0] score: 0.8332, score(after drop): 0.8332, score(after add): 0.8373, n_features: 500,  added: 250, dropped: 250, time: 0.3
[1] score: 0.8373, score(after drop): 0.8373, score(after add): 0.8377, n_features: 500,  added: 250, dropped: 250, time: 0.3
[2] score: 0.8377, score(after drop): 0.8377, score(after add): 0.8394, n_features: 500,  added: 250, dropped: 250, time: 0.3
[3] score: 0.8394, score(after drop): 0.8394, score(after add): 0.8426, n_features: 500,  added: 250, dropped: 250, time: 0.3
[4] score: 0.8426, score(after drop): 0.8426, score(after add): 0.8444, n_features: 500,  added: 250, dropped: 250, time: 0.3
[5] score: 0.8444, score(after drop): 0.8444, score(after add): 0.8447, n_features: 500,  added: 250, dropped: 250, time: 0.3
[6] score: 0.8447, score(after drop): 0.8447, score(after add): 0.8450, n_features: 500,  added: 250, dropped: 250, time: 0.3
[7] score: 0.8450, score(after drop): 0.8450, score(after add): 0.8458, n_features: 500,  adde

## MERGE SCHEMAS

In [2]:
# Combine the 20 per-fold schema dicts into one dict keyed by consecutive
# integers starting at 0.
merged_schema = {}
for fold in range(1, 21):
    _feature_schema = joblib.load('./models/features/schema_non_skewed_130_features_%d_of_20.joblib' % fold)
    # Number this fold's entries from the current size of the merged dict so
    # the keys continue where the previous fold stopped.
    merged_schema.update(enumerate(_feature_schema.values(), len(merged_schema)))

In [3]:
# Number of entries collected across all folds.
len(merged_schema)

5000

In [4]:
# Count the distinct keys of the merged dict.
# NOTE(review): dict keys are unique by definition, so this always equals
# len(merged_schema); it does not reveal whether any schema *values* are
# duplicated across folds.
len(set(merged_schema.keys()))

5000

In [5]:
# Persist the merged schema to disk. The output recorded under this cell lists
# the main .joblib file followed by numbered companion .npy files.
joblib.dump(merged_schema, './models/features/merged_schema_5000_features.joblib')

['./models/features/merged_schema_5000_features.joblib',
 './models/features/merged_schema_5000_features.joblib_01.npy',
 './models/features/merged_schema_5000_features.joblib_02.npy',
 './models/features/merged_schema_5000_features.joblib_03.npy',
 './models/features/merged_schema_5000_features.joblib_04.npy',
 './models/features/merged_schema_5000_features.joblib_05.npy',
 './models/features/merged_schema_5000_features.joblib_06.npy',
 './models/features/merged_schema_5000_features.joblib_07.npy',
 './models/features/merged_schema_5000_features.joblib_08.npy',
 './models/features/merged_schema_5000_features.joblib_09.npy',
 './models/features/merged_schema_5000_features.joblib_10.npy',
 './models/features/merged_schema_5000_features.joblib_11.npy',
 './models/features/merged_schema_5000_features.joblib_12.npy',
 './models/features/merged_schema_5000_features.joblib_13.npy',
 './models/features/merged_schema_5000_features.joblib_14.npy',
 './models/features/merged_schema_5000_features