# Setting up the stacking phase

In this phase we build the stacked dataset. This approach relies only on the DataFrame `apply` method, processing one row at a time, which requires less memory.

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from utils.trainFold import getVector

In [3]:
import sys
sys.path.append('../dsbase/src/main')
from AdaBoostClassificationDSBase import AdaBoostClassificationDSBaseModel

## Loading the original stacked dataset and shuffling it

In [17]:
# Load the reduced stacking training set; every later cell works on this frame.
df = pd.read_csv('datasets/train_stack_reduced.csv')

In [None]:
# Draw a random 30% sample of the rows.
# NOTE(review): df_frac is not referenced by any later cell visible in this
# notebook (the following cells keep using the full `df`), and no random_state
# is passed, so the sample differs between runs.
df_frac = df.sample(frac=0.3)

In [6]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(1000, 77)

## Defining the Fold X processing 

### Defining helper functions

In [7]:
def loadColumnCategoricalOrder(df, columns_categorical):
    """Map each categorical column name to its position in ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    columns_categorical : iterable of str
        Column names to locate.

    Returns
    -------
    dict
        ``{column_name: position}``, where position is the index of the first
        label in ``df.columns`` equal to the name.

    Raises
    ------
    IndexError
        If a name does not appear in ``df.columns``.
    """
    return {
        name: np.where(df.columns == name)[0][0]
        for name in columns_categorical
    }

In [8]:
def loadColumnCategoricalVectors(fold_id, columns_categorical):
    """Load the saved vector of every categorical column for one fold.

    Each vector is read from ``models/fold<fold_id>/<column>.sav.npy``
    (path relative to the current working directory).

    Parameters
    ----------
    fold_id : int or str
        Fold identifier; it is converted with ``str`` and appended to
        ``'models/fold'`` to form the fold directory.
    columns_categorical : iterable of str
        Column names; one ``.sav.npy`` file is loaded per name.

    Returns
    -------
    dict
        ``{column_name: numpy array loaded from its file}``.

    Raises
    ------
    OSError
        Propagated from ``np.load`` when a file cannot be opened.
    """
    # Build the fold directory once and reuse it for every column file
    # (previously this variable was computed but never used).
    out_path = 'models/fold' + str(fold_id)
    # NOTE(review): np.load is called with its defaults; arrays that were saved
    # with object dtype need allow_pickle=True on recent NumPy versions --
    # confirm how the vectors were written before changing this.
    return {
        c: np.load(out_path + "/" + c + ".sav.npy")
        for c in columns_categorical
    }

In [9]:
def loadModel(fold_id):
    """Instantiate an empty ``AB2`` AdaBoost wrapper and load a fold's model.

    Parameters
    ----------
    fold_id : int or str
        Fold identifier; ``'models/fold' + str(fold_id)`` is the location
        handed to the wrapper's ``load`` method.

    Returns
    -------
    AdaBoostClassificationDSBaseModel
        The wrapper instance after ``load`` has been called on it.
    """
    print('   loading model ...')
    fold_dir = 'models/fold' + str(fold_id)
    # Only the model identifier is supplied; every other constructor argument
    # is None because the state is restored by load() below.
    fold_model = AdaBoostClassificationDSBaseModel(
        'AB2', None, None, None, None, None, None
    )
    fold_model.load(fold_dir)
    return fold_model

In [10]:
def calculateF1(x, cc, cc_o, cc_v, model):
    """Expand the categorical fields of one row and score it with ``model``.

    For each categorical column the single value in the row is replaced by the
    array returned by ``getVector(value, vector)``; the expanded row is then
    scaled with ``model.scalerX`` and passed to ``model.model.predict_proba``.

    Parameters
    ----------
    x : pandas.Series
        One row (as supplied by ``DataFrame.apply(axis=1)``); only
        ``x.values`` is used.
    cc : iterable of str
        Categorical column names. Offsets are accumulated in iteration order,
        so the names are expected in ascending column position.
    cc_o : mapping
        Column name -> position of that column in the unexpanded row.
    cc_v : mapping
        Column name -> vector passed as second argument to ``getVector``.
    model : object
        Must expose ``scalerX.transform`` and ``model.predict_proba``.

    Returns
    -------
    scalar
        Element ``[0, 1]`` of the ``predict_proba`` output -- presumably the
        probability of the positive class (TODO confirm the class order).
    """
    row = x.values
    shift = 0  # how many extra slots earlier expansions have added so far
    for name in cc:
        pos = cc_o[name] + shift
        encoded = getVector(row[pos], cc_v[name])
        # Swap the single categorical value for its expanded vector in place.
        row = np.insert(np.delete(row, pos), pos, encoded)
        shift += encoded.size - 1
    scaled = model.scalerX.transform(row.reshape(1, -1))
    return model.model.predict_proba(scaled)[0, 1]

### Calculating support variables

In [11]:
# Working frame: drop the identifier, the target and the fold marker columns.
df_w = df.drop(columns=['MachineIdentifier', 'HasDetections', 'fold'])
# Labels of the remaining object-dtype columns, in frame order.
columns_categorical = (
    df_w
    .select_dtypes(include=['object'])
    .columns
)

In [12]:
# Position of every categorical column inside df_w (name -> column index).
cc_order = loadColumnCategoricalOrder(df_w,columns_categorical)

In [13]:
# Number of folds; folds are numbered 1..N.
N = 9
for i in range(1,N+1):
    print('-------- Process Fold ',i,' -------------------')
    print('loading vectors ...')
    # Per-fold vectors for every categorical column.
    cc_values_f = loadColumnCategoricalVectors(i,columns_categorical)
    print('loading model ...')
    model_f = loadModel(i)
    print('applying folding prediction ...')
    # Score every row of df_w with this fold's model (row-wise apply) and store
    # the result as column 'f<i>' of df.
    df['f' + str(i)] = df_w.apply(func=calculateF1, axis=1, args=(columns_categorical, cc_order, cc_values_f, model_f))
    # Save a backup copy of this fold's column as soon as it is computed.
    df['f' + str(i)].to_csv('datasets/f_stack.csv.' + str(i))

-------- Process Fold  1  -------------------
loading vectors ...
loading model ...
   loading model ...
initiating empty model AB2. AdaBoostClassification
loading model: models/fold1/AdaBoostClassification_AB2.sav
applying folding prediction ...




-------- Process Fold  2  -------------------
loading vectors ...
loading model ...
   loading model ...
initiating empty model AB2. AdaBoostClassification
loading model: models/fold2/AdaBoostClassification_AB2.sav
applying folding prediction ...
-------- Process Fold  3  -------------------
loading vectors ...
loading model ...
   loading model ...
initiating empty model AB2. AdaBoostClassification
loading model: models/fold3/AdaBoostClassification_AB2.sav
applying folding prediction ...
-------- Process Fold  4  -------------------
loading vectors ...
loading model ...
   loading model ...
initiating empty model AB2. AdaBoostClassification
loading model: models/fold4/AdaBoostClassification_AB2.sav
applying folding prediction ...
-------- Process Fold  5  -------------------
loading vectors ...
loading model ...
   loading model ...
initiating empty model AB2. AdaBoostClassification
loading model: models/fold5/AdaBoostClassification_AB2.sav
applying folding prediction ...
-------- Pro

## Let's obtain the final stacked dataset

In [14]:
# Summary statistics of the target, the fold marker and the nine per-fold prediction columns.
df[['HasDetections','fold','f1','f2','f3','f4','f5','f6','f7','f8','f9']].describe()

Unnamed: 0,HasDetections,fold,f1,f2,f3,f4,f5,f6,f7,f8,f9
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.342,1.0,0.492094,0.490333,0.49545,0.492777,0.491359,0.500872,0.49919,0.489582,0.498563
std,0.474617,0.0,0.000897,0.002987,0.003312,0.000734,0.000817,0.000569,0.00081,0.000862,0.000687
min,0.0,1.0,0.488196,0.485964,0.487663,0.489597,0.487136,0.497999,0.496603,0.482904,0.494666
25%,0.0,1.0,0.491872,0.490126,0.494771,0.492324,0.491184,0.500722,0.498742,0.489307,0.498417
50%,0.0,1.0,0.491872,0.490286,0.495126,0.49295,0.491184,0.500722,0.499337,0.489766,0.498666
75%,1.0,1.0,0.492135,0.490286,0.495392,0.492985,0.491562,0.501151,0.499337,0.489766,0.498811
max,1.0,1.0,0.497794,0.579553,0.504598,0.496742,0.494488,0.503801,0.505086,0.49532,0.501162


In [15]:
# Persist the full frame, including the new f1..f9 columns.
# NOTE(review): no `index` argument is passed, so pandas' default applies and
# the row index is written as an extra first column -- confirm readers expect it.
df.to_csv('datasets/train_stack_set.csv')

# End of stack train setting!! 

Local Environment: estimated time -> 322 sec / 1000 elements
AWS EC2:  estimated time -> 322 sec / ??? elements