In [207]:
import pandas as pd
import numpy as np

from sklearn  import base
from sklearn.model_selection import train_test_split, KFold

In [208]:
########################### Build the training set ##############################
# Toy data: one categorical feature ('letter') and one numeric target ('label').
tr_letters = ['A', 'B', 'A', 'B', 'A', 'C']
tr_labels = [1, 4, 2, 0, 0, 6]

train = pd.DataFrame({'letter': tr_letters, 'label': tr_labels})

# Keep a pristine copy of the raw frame for later use
copy_train = train.copy()

train

Unnamed: 0,letter,label
0,A,1
1,B,4
2,A,2
3,B,0
4,A,0
5,C,6


In [209]:
################ Build the test set: one known category (A) and one never seen in train (D) ################
test_letters = ['A', 'D']
test_labels = [10, 8]

test = pd.DataFrame({'letter': test_letters, 'label': test_labels})

# Keep a pristine copy of the raw frame for later use
copy_test = test.copy()

test

Unnamed: 0,letter,label
0,A,10
1,D,8


## <font color='orange'>Faccio l'encoding sul Train</font>

In [210]:
# Out-of-fold target encoding of the training set.
# `random_state` is deliberately not passed: it only has an effect when shuffle=True, and
# recent scikit-learn releases raise a ValueError when it is combined with shuffle=False.
kf = KFold(n_splits=3, shuffle=False)

column_to_encode = 'letter'
target_column = 'label'

# Name the new columns after the column that is actually encoded
# (not after whatever happens to be the first column of the frame).
name_encoded_mean = column_to_encode + '_encoded_mean'
name_encoded_std = column_to_encode + '_encoded_std'

mean_of_target = train[target_column].mean()  # mean of the whole target, not grouped
std_of_target = train[target_column].std()    # std of the whole target, not grouped (pandas default: N-1 denominator)

# Start from all-NaN float columns; every row is then filled by exactly one fold below
train[name_encoded_mean] = np.nan
train[name_encoded_std] = np.nan

for fold_number, (tr_ind, val_ind) in enumerate(kf.split(train), start=1):
    # tr_ind / val_ind are the positional row indexes of the training folds and of the held-out fold
    print('\tITERATION K-FOLD{} \ntrain indexes: {}, val indexes to be filled: {} '.format(fold_number, tr_ind, val_ind))
    X_tr, X_val = train.iloc[tr_ind], train.iloc[val_ind]

    # Per-category target statistics computed on the training folds only
    fold_target = X_tr.groupby(column_to_encode)[target_column]

    # Held-out rows receive the statistics of their category; categories missing from the
    # training folds stay NaN, and so does the std of categories seen only once there.
    train.loc[train.index[val_ind], name_encoded_mean] = X_val[column_to_encode].map(fold_target.mean())
    train.loc[train.index[val_ind], name_encoded_std] = X_val[column_to_encode].map(fold_target.std())

train

	ITERATION K-FOLD1 
train indexes: [2 3 4 5], val indexes to be filled: [0 1] 
	ITERATION K-FOLD2 
train indexes: [0 1 4 5], val indexes to be filled: [2 3] 
	ITERATION K-FOLD3 
train indexes: [0 1 2 3], val indexes to be filled: [4 5] 


Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std
0,A,1,1.0,1.414214
1,B,4,0.0,
2,A,2,0.5,0.707107
3,B,0,4.0,
4,A,0,1.5,0.707107
5,C,6,,


#### NB: di default pandas usa la std campionaria (N-1 al denominatore), mentre `np.std` usa N. Nella 1ª iterazione i valori di A nelle due fold di train sono [2, 0] e la std campionaria è 1.414

In [211]:
def std_dev(array):
    """Return the population standard deviation (N in the denominator) of `array`."""
    deviations = array - np.mean(array)
    mean_squared_deviation = np.mean(deviations * deviations)
    return np.sqrt(mean_squared_deviation)

def std_dev_campionaria(array):
    """Return the sample standard deviation (N-1 in the denominator) of `array`."""
    deviations = array - np.mean(array)
    degrees_of_freedom = len(array) - 1
    sample_variance = np.sum(deviations * deviations) / degrees_of_freedom
    return np.sqrt(sample_variance)

# Compare the population and the sample standard deviation on the same two-element array
demo_array = np.array([2, 0])
print("Dato l'array {} \nla std dev (sigma) è {}, mentre S, quella campionaria (con N-1 al denominatore) è {}".format(
    demo_array,
    std_dev(demo_array),
    round(std_dev_campionaria(demo_array), 3),
))

Dato l'array [2 0] 
la std dev (sigma) è 1.0, mentre S, quella campionaria (con N-1 al denominatore) è 1.414


## Fillo i NaN rispettivamente con media e std dell'intero target (non raggruppato)

In [212]:
print("Mean and std of the whole target: ", mean_of_target, std_of_target )

# Assign the filled column back instead of calling fillna(inplace=True) on `train[col]`:
# that form is chained assignment, which newer pandas warns about and which leaves
# `train` unchanged under Copy-on-Write.
train[name_encoded_mean] = train[name_encoded_mean].fillna(mean_of_target)  # NaN -> mean of the whole target
train[name_encoded_std] = train[name_encoded_std].fillna(std_of_target)     # NaN -> std of the whole target

train

Mean and std of the whole target:  2.1666666666666665 2.401388487243717


Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std
0,A,1,1.0,1.414214
1,B,4,0.0,2.401388
2,A,2,0.5,0.707107
3,B,0,4.0,2.401388
4,A,0,1.5,0.707107
5,C,6,2.166667,2.401388


## <font color='orange'>Ora l'encoding sul Test</font>

In [213]:
# Display the test set as it stands before the encoding cell below
test

Unnamed: 0,letter,label
0,A,10
1,D,8


In [214]:
# Per-category average of the out-of-fold encodings computed on train.
# Only the two encoded columns are aggregated, once, instead of averaging every column of
# `train` twice (which would also fail on frames that carry other non-numeric columns).
category_encoding_tr = train.groupby(column_to_encode)[[name_encoded_mean, name_encoded_std]].mean()

# Map each test category to its train averages; categories unseen in train become NaN
test[name_encoded_mean] = test[column_to_encode].map(category_encoding_tr[name_encoded_mean])
test[name_encoded_std] = test[column_to_encode].map(category_encoding_tr[name_encoded_std])
test

Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std
0,A,10,1.0,0.942809
1,D,8,,


## Fillo mean e std di nuove categorie nel Test con le rispettive medie nel Train

In [215]:
# Fallback values for categories never seen in train: the plain (ungrouped) averages
# of the two encoded columns over the whole training set.
mean_letter_encoded_tr = train['letter_encoded_mean'].mean()
std_letter_encoded_tr = train['letter_encoded_std'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on `test[col]`:
# that form is chained assignment, which newer pandas warns about and which leaves
# `test` unchanged under Copy-on-Write.
test['letter_encoded_mean'] = test['letter_encoded_mean'].fillna(mean_letter_encoded_tr)  # NaN -> average encoded mean on train
test['letter_encoded_std'] = test['letter_encoded_std'].fillna(std_letter_encoded_tr)     # NaN -> average encoded std on train

test

Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std
0,A,10,1.0,0.942809
1,D,8,1.527778,1.672099


# <font color='orange'>Final Step: aggiungo a Train e Test il noise proporzionale alla std dev</font>

In [216]:
# Fix NumPy's global seed so the sampled noise is reproducible
np.random.seed(2)

# One Gaussian draw per row: centred on the row's encoded mean, with the row's encoded std as spread
train['ENCODED WITH NOISE'] = np.random.normal(
    loc=train['letter_encoded_mean'],
    scale=train['letter_encoded_std'],
    size=train.shape[0],
)

train

Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std,ENCODED WITH NOISE
0,A,1,1.0,1.414214,0.410615
1,B,4,0.0,2.401388,-0.135119
2,A,2,0.5,0.707107,-1.010519
3,B,0,4.0,2.401388,7.938927
4,A,0,1.5,0.707107,0.23185
5,C,6,2.166667,2.401388,0.145304


In [217]:
# Same noisy encoding for the test set: one Gaussian draw per row around its encoded mean
test['ENCODED WITH NOISE'] = np.random.normal(
    loc=test['letter_encoded_mean'],
    scale=test['letter_encoded_std'],
    size=test.shape[0],
)

test

Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std,ENCODED WITH NOISE
0,A,10,1.0,0.942809,1.474121
1,D,8,1.527778,1.672099,-0.554467


In [218]:
######################################################################################################################

# <center><font color='red'>Generalizzo tutto con una classe</font></center>

In [219]:
class KFoldTargetEncoderTrain_std(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold (K-fold) target encoder for one categorical column of a training frame.

    For every fold, the rows of the held-out fold receive the per-category mean and
    standard deviation of the target computed on the remaining folds only. Values that
    cannot be computed (category absent from the other folds, or fewer than two rows
    for the std) are replaced with the mean / std of the whole target column.

    Parameters
    ----------
    colnames : str
        Name of the categorical column to encode.
    targetName : str
        Name of the target column; it must be present in the frame given to `transform`.
    n_fold : int, default 5
        Number of consecutive (non-shuffled) folds.
    verbosity : bool, default True
        When True, print the fold indexes and the correlation between the encoded
        mean and the target.
    discardOriginal_col : bool, default False
        When True, the returned frame has the `targetName` column dropped.
    """

    def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):

        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """No-op: every statistic is computed inside `transform`. Returns self."""
        return self

    def transform(self,X):
        """Return a copy of `X` with `<colnames>_enc_mean` and `<colnames>_enc_std` added.

        The input frame is left untouched. Raises AssertionError when the configured
        column names are not strings or are missing from `X`.
        """
        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert self.colnames in X.columns
        assert self.targetName in X.columns

        mean_of_target = X[self.targetName].mean()  # mean of the whole target, not grouped
        std_of_target = X[self.targetName].std()    # std of the whole target, not grouped

        # No random_state: it has no effect without shuffling, and recent scikit-learn
        # releases raise a ValueError when it is passed together with shuffle=False.
        kf = KFold(n_splits = self.n_fold, shuffle = False)

        col_mean_name = self.colnames + '_' + 'enc_mean'
        col_std_name = self.colnames + '_' + 'enc_std'

        # Positional buffers, initialised to NaN. Filling by position (rather than by
        # index label) keeps the result correct even when X has a non-unique index.
        enc_mean = np.full(len(X), np.nan)
        enc_std = np.full(len(X), np.nan)

        # On each round, fill the held-out fold with statistics from the other folds
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            if self.verbosity:
                print('train and val indexes per fold: ', tr_ind, val_ind)  # positional indexes of the folds

            fold_target = X_tr.groupby(self.colnames)[self.targetName]
            enc_mean[val_ind] = X_val[self.colnames].map(fold_target.mean()).to_numpy(dtype=float, na_value=np.nan)
            enc_std[val_ind] = X_val[self.colnames].map(fold_target.std()).to_numpy(dtype=float, na_value=np.nan)

        # Remaining NaN fall back to the statistics of the whole target
        enc_mean = np.where(np.isnan(enc_mean), mean_of_target, enc_mean)
        enc_std = np.where(np.isnan(enc_std), std_of_target, enc_std)

        # assign() builds a new frame, so the caller's DataFrame is never mutated
        X = X.assign(**{col_mean_name: enc_mean, col_std_name: enc_std})

        if self.verbosity:

            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                    self.targetName,np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))

        if self.discardOriginal_col:
            # NOTE(review): despite the flag's name this drops the *target* column, not the
            # encoded categorical one. Kept as-is for backward compatibility - confirm intent.
            X = X.drop(self.targetName, axis=1)

        return X
    
    
    
    
    
    
class KFoldTargetEncoderTest_std(base.BaseEstimator, base.TransformerMixin):
    """Apply to a test frame the mean/std encodings already computed on a training frame.

    Each test row receives, for its category, the average of the encoded-mean and
    encoded-std columns over the training rows of that category. Categories that do
    not appear in the training frame receive the plain averages of those two columns
    over the whole training frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that already contains the encoded columns.
    colNames : str
        Name of the categorical column (present in both train and test).
    Train_encoded_mean : str
        Name of the encoded-mean column in `train`; also the name of the column added to the test frame.
    Train_encoded_std : str
        Name of the encoded-std column in `train`; also the name of the column added to the test frame.
    """

    def __init__(self,train,colNames,Train_encoded_mean, Train_encoded_std):

        self.train = train
        self.colNames = colNames
        self.Train_encoded_mean = Train_encoded_mean
        self.Train_encoded_std = Train_encoded_std

    def fit(self, X, y=None):
        """No-op: every statistic is computed inside `transform`. Returns self."""
        return self

    def transform(self,X):
        """Return a copy of `X` with the two encoded columns added; `X` itself is left untouched."""
        mean_col, std_col = self.Train_encoded_mean, self.Train_encoded_std

        ### Per-category averages of the encoded mean / std over the whole train.
        # Selecting the column before aggregating always yields a Series, also when train
        # holds a single category (the former DataFrame.squeeze() collapsed to a scalar there).
        train_by_category = self.train.groupby(self.colNames)
        category_mean_tr = train_by_category[mean_col].mean()
        category_std_tr = train_by_category[std_col].mean()

        ### Ungrouped averages of the encoded columns on train (fallback for new categories)
        population_mean_tr = self.train[mean_col].mean()
        population_std_tr = self.train[std_col].mean()

        ### Give each test row the train averages of its category; unseen categories become NaN
        X = X.copy()
        X[mean_col] = X[self.colNames].map(category_mean_tr)
        X[std_col] = X[self.colNames].map(category_std_tr)

        ### Fill the missing values (new categories) with the ungrouped train averages.
        # Only the two encoded columns are counted, so NaN in unrelated columns do not inflate the figure.
        n_missing = int(X[[mean_col, std_col]].isna().sum().sum())
        print('Filled {} Missing Values with Average mean and std equals to {}, {}'.format(
                                                        n_missing, population_mean_tr , population_std_tr))

        # Plain reassignment instead of chained fillna(inplace=True), which newer pandas
        # warns about and which has no effect under Copy-on-Write.
        X[mean_col] = X[mean_col].fillna(population_mean_tr)  # average of enc_mean on train
        X[std_col] = X[std_col].fillna(population_std_tr)     # average of enc_std on train

        return X

##### Verifico di ottenere lo stesso risultato sul Train usando la classe

In [220]:
# Copy of the training set taken right after it was built, before any encoding column was added
copy_train

Unnamed: 0,letter,label
0,A,1
1,B,4
2,A,2
3,B,0
4,A,0
5,C,6


In [221]:
# Encode the 'letter' column against 'label' with the same 3 folds used in the manual walkthrough
targetc = KFoldTargetEncoderTrain_std(colnames='letter', targetName='label', n_fold=3)
copy_train_enc = targetc.fit_transform(copy_train)
copy_train_enc

train and val indexes per fold:  [2 3 4 5] [0 1]
train and val indexes per fold:  [0 1 4 5] [2 3]
train and val indexes per fold:  [0 1 2 3] [4 5]
Correlation between the new feature, letter_enc_mean and, label is -0.2835991832999803.


Unnamed: 0,letter,label,letter_enc_mean,letter_enc_std
0,A,1,1.0,1.414214
1,B,4,0.0,2.401388
2,A,2,0.5,0.707107
3,B,0,4.0,2.401388
4,A,0,1.5,0.707107
5,C,6,2.166667,2.401388


### Corretto, ho ottenuto lo stesso di prima. Ora verifico per il Test

In [222]:
# Column averages of the encoded training frame. numeric_only=True skips the string
# column 'letter', which pandas >= 2.0 no longer drops silently (it raises TypeError instead).
copy_train_enc.mean(numeric_only=True)

label              2.166667
letter_enc_mean    1.527778
letter_enc_std     1.672099
dtype: float64

In [223]:
# Encode the raw test copy using the statistics stored in the class-encoded training frame
test_targetc = KFoldTargetEncoderTest_std(
    train=copy_train_enc,
    colNames='letter',
    Train_encoded_mean='letter_enc_mean',
    Train_encoded_std='letter_enc_std',
)
copy_test_enc = test_targetc.fit_transform(copy_test)
copy_test_enc

Filled 2 Missing Values with Average mean and std equals to 1.5277777777777777, 1.6720987644128902


Unnamed: 0,letter,label,letter_enc_mean,letter_enc_std
0,A,10,1.0,0.942809
1,D,8,1.527778,1.672099


In [224]:
# Test set encoded step by step in the earlier cells, displayed for comparison with the class-based output
test

Unnamed: 0,letter,label,letter_encoded_mean,letter_encoded_std,ENCODED WITH NOISE
0,A,10,1.0,0.942809,1.474121
1,D,8,1.527778,1.672099,-0.554467


### Corretto anche questo!! :)