# STEP 2 B: From categorical to One-Hot Vector and Segmentation and fold training

This notebook converts the categorical columns into one-hot vectors and segments the dataset into the different training "folds". In this section, we train each fold directly instead of exporting the fold files.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from utils.trainFoldB import train_test, saveColumnsToPartition

# Data Transformation

## Data Loading 

In [None]:
# Load the cleaned training set; rows are indexed by the MachineIdentifier column.
df = pd.read_csv(
    'datasets/train_cleaned.csv',
    index_col='MachineIdentifier',
)

## Categorical Data Analysis 

In [None]:
# Columns stored with the generic `object` dtype are the ones treated as categorical.
columns_categorical = df.select_dtypes(include=['object']).columns

In [None]:
# Cardinality threshold: a categorical column with more than this many distinct
# values is put in `columns_to_partition`; the others go to `columns_to_onehot`.
h_threshold = 350

In [None]:
# Classify every categorical column by its number of distinct values and report it.
# `total` adds up the distinct values of ALL categorical columns, whichever list
# the column ends up in.
columns_to_partition = []
columns_to_onehot = []
total = 0
for c in columns_categorical:
    values = df[c].nunique()
    exceeds_threshold = values > h_threshold
    destination = columns_to_partition if exceeds_threshold else columns_to_onehot
    destination.append(c)
    suf = ', PARTITION' if exceeds_threshold else ""
    total += values
    print(c,': ',values,suf)
print('Total new vars: ' + str(total))

## Segmentation and train

We are going to split the dataset based on the categorical variables with the most distinct values (those above the threshold).

In [None]:
# Show the high-cardinality columns that will define the groups.
columns_to_partition

In [None]:
# Number of rows in every combination of the partition columns.
# `size()` counts rows directly, so the result does not depend on an unrelated
# column being present and free of nulls (as `['ProductName'].count()` did).
asvGB = df.groupby(columns_to_partition).size()

In [None]:
# ... Maybe it is not a good idea to order it this way
#asvGB.sort_values(ascending=False, inplace=True)

In [None]:
# One-column frame ('count') with the group sizes, ordered by the group keys.
# `sort_index()` without `level` sorts on every index level, so this works for
# any number of partition columns instead of exactly three, and it returns a
# new frame rather than mutating one in place.
asvGBdf = asvGB.to_frame(name='count').sort_index()

In [None]:
# Rows per group, in the same sorted group order that is used to build the folds.
plt.plot(asvGBdf['count'].values)
plt.xlabel('group (sorted by partition columns)')
plt.ylabel('rows in group')
plt.title('Group sizes');

Now, let's assign a fold to each group and store the assignment in a dictionary. A new fold is started whenever the accumulated group sizes exceed `NE` rows, so each fold will hold roughly `NE` elements:

In [None]:
# Row budget per fold: a new fold is started once the accumulated group counts exceed NE.
NE = 364000

In [None]:
# Nested dict of the form {'count': {group_key: rows_in_group}}.
asvGBDict = asvGBdf.to_dict()

In [None]:
# Walk the groups in dictionary order and give each one a fold id (starting at 1).
foldDict = {}
f = 1    # fold id currently being filled
acc = 0  # rows accumulated since the last fold change
for key, value in asvGBDict['count'].items():
    acc = acc + value
    if acc > NE:
        # The group that crosses the NE limit opens the next fold. The counter
        # restarts at 0, so this group's own rows are NOT counted towards the
        # size of the fold it opens.
        f, acc = f + 1, 0
    foldDict[key] = f

In [None]:
def setFold(row):
    """Return the fold id assigned to the group a row belongs to.

    Parameters
    ----------
    row : pandas.Series
        One row restricted to the partition columns, in the same order that
        was used to build the keys of ``foldDict``.

    Returns
    -------
    The value stored in ``foldDict`` for the row's group key.

    Raises
    ------
    KeyError
        If the row's combination of values is not a key of ``foldDict``.
    """
    # Use every value of the row as the key, so any number of partition
    # columns works (the key used to be hard-coded to exactly three values).
    key = tuple(row.values)
    # With a single partition column the dictionary may be keyed by the bare
    # value instead of a 1-tuple; fall back to it in that case.
    if len(key) == 1 and key not in foldDict:
        key = key[0]
    return foldDict[key]

In [None]:
# Look up the fold of every row from its partition-column values.
# The keys are built column-wise (one tuple per row) instead of calling a Python
# function through DataFrame.apply(axis=1), which creates a Series for every row
# and is far slower on a large frame. The resulting column is the same.
if len(columns_to_partition) == 1:
    # A single grouping column produces scalar keys rather than 1-tuples.
    partition_keys = df[columns_to_partition[0]]
else:
    partition_keys = zip(*(df[col] for col in columns_to_partition))
df['fold'] = [foldDict[key] for key in partition_keys]

In [None]:
# Bar chart with the number of rows assigned to each fold.
sns.countplot(x='fold', data=df)

We now have all the folds to train. For each fold, the first 50% of its rows is used to train that fold's model (the ensemble block) and the remaining 50% is collected into the stacking dataset:

In [None]:
# Number of distinct fold ids. Fold ids were assigned consecutively starting at 1,
# so the loops below iterate over 1..nFolds.
nFolds = df['fold'].nunique()

In [None]:
# Train one model per fold and collect the rows reserved for the stacking stage.
# scores[k-1] / lcurves[k-1] hold whatever train_test returned for fold k.
scores = []
lcurves = []
stack_parts = []  # second half of every fold; concatenated once after the loop
for i in range(nFolds):
    fold_id = i + 1  # folds are numbered from 1
    print('processing fold ',fold_id,' ... ')
    fold_df = df[df['fold']==fold_id]
    # Local name on purpose: do not overwrite the notebook-level `columns_categorical`.
    fold_categorical = fold_df.select_dtypes(include=['object']).columns
    fold_df_num = pd.get_dummies(data=fold_df,columns=fold_categorical)
    m = fold_df.shape[0]
    tm = m // 2

    # First half (one-hot encoded, without the 'fold' column) trains the fold
    # model. `drop` returns a new frame here, instead of mutating a slice of
    # fold_df_num in place (which is a chained-assignment hazard).
    ensemble_df = fold_df_num.iloc[:tm].drop(labels=['fold'],axis=1)
    # Second half is kept in its original (not encoded) form for stacking.
    stack_df = fold_df.iloc[tm:]

    # Save columns to partition values
    saveColumnsToPartition(fold_id, fold_df, columns_to_partition)

    # ensemble process
    (score, lcurve) = train_test(fold_id,ensemble_df)
    scores.append(score)
    lcurves.append(lcurve)

    # stacking process
    stack_parts.append(stack_df)

# A single concat keeps the fold order and avoids re-copying the accumulated
# frame on every iteration.
stack_complete_df = pd.concat(stack_parts)

In [None]:
# Persist the stacking dataset (the second half of every fold) for the next step.
stack_complete_df.to_csv('datasets/train_stack.csv')

## Show the results of the training of each fold 

In [None]:
def showFoldModelInfo(fold_id):
    """Print the score and plot the learning curves of one trained fold.

    Parameters
    ----------
    fold_id : int
        1-based fold number, as used when the folds were trained.

    Raises
    ------
    IndexError
        If ``fold_id`` is outside ``1..len(scores)``.
    """
    # Reject 0 and negatives explicitly: with 1-based ids they would otherwise
    # wrap around and silently show a different fold.
    if not 1 <= fold_id <= len(scores):
        raise IndexError('fold_id out of range: ' + str(fold_id))
    # Use the requested fold. The previous code read the leftover loop variable
    # `i`, so every call displayed the same fold regardless of the argument.
    position = fold_id - 1
    print('scoring: ',str(scores[position]))
    curves = lcurves[position]
    # Row 0 is drawn in blue and row 1 in red; what each row contains is
    # decided by train_test.
    plt.plot(curves[0,:],'b',curves[1,:],'r')

In [None]:
# Show the score and learning curves of every trained fold.
# Iterating over nFolds replaces nine copy-pasted cells that only worked when
# the data happened to produce exactly nine folds.
for fold_id in range(1, nFolds + 1):
    showFoldModelInfo(fold_id)
    plt.show()  # flush the figure so each fold gets its own plot

# End of Analysis!

After this execution we have the "folded" models and the "stacking" dataset!