# Setting up the stacking phase

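In this phase we build the stacked dataset: each fold's saved model scores a sample of the training data, and its predicted probability is added as a new column.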

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from utils.trainFold import loadColumnsCategorical

In [3]:
import sys
sys.path.append('../dsbase/src/main')
from AdaBoostClassificationDSBase import AdaBoostClassificationDSBaseModel

## Loading the original stacked dataset and drawing a shuffled 30% sample

In [4]:
# Read the stacking-phase training data (path is relative to the notebook's working directory).
df = pd.read_csv('datasets/train_stack.csv')

In [5]:
# Draw a shuffled 30% sample of the rows. The fixed random_state makes the
# sample -- and every result derived from it further down -- identical on each
# Restart & Run All; without it every run would score a different subset.
df_frac = df.sample(frac=0.30, random_state=42)

In [6]:
# Show (rows, columns) of the sampled frame as the cell output.
df_frac.shape

(8149, 77)

## Defining the per-fold processing

In [7]:
def getColumnFoldX(df, fold_id):
    """Score `df` with the saved model of one fold and return the result as a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'MachineIdentifier', 'HasDetections' and
        'fold'; they are dropped before scoring. `df` itself is not modified.
    fold_id : int
        Fold number. It is forwarded to `loadColumnsCategorical`, selects the
        model file prefix 'models/fold<fold_id>' and names the output column
        'f<fold_id>'.

    Returns
    -------
    pandas.DataFrame
        A single column named 'f<fold_id>' holding column 1 of the model's
        `predict_proba` output (presumably the positive-class probability --
        confirm against the model's class order). The frame is built from the
        raw prediction array, so it does not carry over the index of `df`.
    """
    print('   dataframe to list ...')
    # Keep only the feature columns: identifier, label and fold id are removed.
    features = df.drop(columns=['MachineIdentifier', 'HasDetections', 'fold'])
    object_columns = features.select_dtypes(include=['object']).columns

    # Transform the object-typed columns with the project helper for this fold.
    encoded = loadColumnsCategorical(fold_id, features, object_columns)

    # --------------------------------------
    # Load the model saved for this fold.
    print('   loading model ...')
    fold_model = AdaBoostClassificationDSBaseModel('AB2', None, None, None, None, None, None)
    fold_model.load('models/fold' + str(fold_id))

    # Apply the scaler stored with the model, then predict probabilities.
    print('   Calculating: normalization ...')
    scaled = fold_model.scalerX.transform(encoded.values)
    print('   Calculating: probabilities ...')
    probabilities = fold_model.model.predict_proba(scaled)

    # Wrap column 1 of the predictions in a one-column DataFrame.
    print('   Creating result dataset ...')
    return pd.DataFrame(probabilities[:, 1], columns=['f' + str(fold_id)])

## Let's obtain the final stacked dataset

In [8]:
N = 9  # Number of folds (one saved model per fold, numbered 1..N)

# Start from the sampled rows with a fresh 0..n-1 index: getColumnFoldX builds
# its result without an explicit index, so the join below lines rows up by
# position only because the left-hand index has been reset here.
df_stack_set = df_frac.reset_index(drop=True)

# Drive the loop from N rather than a second hard-coded 9, so changing the
# number of folds in one place is enough.
for fold_id in range(1, N + 1):
    print('processing fold ' + str(fold_id) + " ...")
    fold_column = getColumnFoldX(df_frac, fold_id)
    df_stack_set = df_stack_set.join(fold_column)

processing fold 1 ...
   dataframe to list ...
   column "ProductName" transformation ...
   column "EngineVersion" transformation ...
   column "AppVersion" transformation ...
   column "AvSigVersion" transformation ...
   column "Platform" transformation ...
   column "Processor" transformation ...
   column "OsVer" transformation ...
   column "OsPlatformSubRelease" transformation ...
   column "OsBuildLab" transformation ...
   column "SkuEdition" transformation ...
   column "SmartScreen" transformation ...
   column "Census_MDC2FormFactor" transformation ...
   column "Census_DeviceFamily" transformation ...
   column "Census_PrimaryDiskTypeName" transformation ...
   column "Census_ChassisTypeName" transformation ...
   column "Census_PowerPlatformRoleName" transformation ...
   column "Census_OSVersion" transformation ...
   column "Census_OSArchitecture" transformation ...
   column "Census_OSBranch" transformation ...
   column "Census_OSEdition" transformation ...
   column 



   Calculating: probabilities ...
   Creating result dataset ...
processing fold 2 ...
   dataframe to list ...
   column "ProductName" transformation ...
   column "EngineVersion" transformation ...
   column "AppVersion" transformation ...
   column "AvSigVersion" transformation ...
   column "Platform" transformation ...
   column "Processor" transformation ...
   column "OsVer" transformation ...
   column "OsPlatformSubRelease" transformation ...
   column "OsBuildLab" transformation ...
   column "SkuEdition" transformation ...
   column "SmartScreen" transformation ...
   column "Census_MDC2FormFactor" transformation ...
   column "Census_DeviceFamily" transformation ...
   column "Census_PrimaryDiskTypeName" transformation ...
   column "Census_ChassisTypeName" transformation ...
   column "Census_PowerPlatformRoleName" transformation ...
   column "Census_OSVersion" transformation ...
   column "Census_OSArchitecture" transformation ...
   column "Census_OSBranch" transformati

   column "OsBuildLab" transformation ...
   column "SkuEdition" transformation ...
   column "SmartScreen" transformation ...
   column "Census_MDC2FormFactor" transformation ...
   column "Census_DeviceFamily" transformation ...
   column "Census_PrimaryDiskTypeName" transformation ...
   column "Census_ChassisTypeName" transformation ...
   column "Census_PowerPlatformRoleName" transformation ...
   column "Census_OSVersion" transformation ...
   column "Census_OSArchitecture" transformation ...
   column "Census_OSBranch" transformation ...
   column "Census_OSEdition" transformation ...
   column "Census_OSSkuName" transformation ...
   column "Census_OSInstallTypeName" transformation ...
   column "Census_OSWUAutoUpdateOptionsName" transformation ...
   column "Census_GenuineStateName" transformation ...
   column "Census_ActivationChannel" transformation ...
   column "Census_FlightRing" transformation ...
   loading model ...
initiating empty model AB2. AdaBoostClassification
l

In [9]:
# Summarize the label, the fold id and every per-fold probability column.
# The 'f<k>' names are derived from N (the number of folds) instead of being
# spelled out by hand, so the summary follows the fold count.
fold_columns = ['f' + str(k) for k in range(1, N + 1)]
df_stack_set[['HasDetections', 'fold'] + fold_columns].describe()

Unnamed: 0,HasDetections,fold,f1,f2,f3,f4,f5,f6,f7,f8,f9
count,8149.0,8149.0,8149.0,8149.0,8149.0,8149.0,8149.0,8149.0,8149.0,8149.0,8149.0
mean,0.52215,4.998282,0.492093,0.490112,0.495558,0.492545,0.489068,0.500882,0.499127,0.489572,0.498334
std,0.49954,2.582226,0.000838,0.001981,0.003637,0.000764,0.002091,0.000595,0.000768,0.000812,0.000969
min,0.0,1.0,0.488962,0.486171,0.48679,0.488261,0.484765,0.497371,0.495818,0.482445,0.492209
25%,0.0,3.0,0.491872,0.489779,0.494771,0.492096,0.487136,0.500722,0.498742,0.489307,0.498271
50%,1.0,5.0,0.491872,0.490126,0.495126,0.49295,0.487514,0.500722,0.499337,0.489766,0.498666
75%,1.0,7.0,0.492211,0.490286,0.495919,0.492985,0.491184,0.501151,0.499337,0.489766,0.498811
max,1.0,9.0,0.497454,0.583123,0.504982,0.496295,0.495822,0.503437,0.504923,0.495602,0.501204


In [10]:
# Persist the stacked dataset (original columns plus the per-fold probability columns).
# NOTE(review): to_csv writes the index by default, so the 0..n-1 row index ends
# up as an unnamed first column in the file; pass index=False if downstream
# readers do not expect it -- confirm before changing.
df_stack_set.to_csv('datasets/train_stack_set.csv')

# End of the stacked training set setup