# <font color = green> Construire son propre Transformer ( sklearn ) </font>

In [1]:
import numpy as np 
import pandas as pd
import sklearn as skl

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

In [11]:
myarray = np.array([ [ 1 , 1 , 3 , 'v' , 'a' , 'x' , 0 ] , 
                     [ 2 , 2 , 2 , 'v' , 'b' , 'y' , 1 ] ,
                     [ 4 , 5 , 1 , 'w' , 'c' , 'z' , 1 ] ,
                     [ 2 , 1 , 9 , 'w' , 'c' , 'x' , 1 ] , 
                     [ 1 , 0 , 4 , 'w' , 'b' , 'y' , 1 ] ,
                     [ 2 , 2 , 3 , 'v' , 'b' , 'x' , 0 ] ] )

colnames = [ 'one', 'two', 'three' , 'four' , 'five' , 'six' , 'target' ]

df = pd.DataFrame( myarray , columns = colnames )

myarray2 = np.array([ [ 2 , 7 , 3 , 'v' , 'a' , 'x' , 0 ] , 
                      [ 9 , 2 , 2 , 'v' , 'a' , 'y' , 0 ] ,
                      [ 4 , 5 , 1 , 'w' , 'c' , 'k' , 1 ] ]  )

colnames2 = [ 'one', 'two', 'three' , 'four' , 'five' , 'six' , 'target' ]

df2 = pd.DataFrame( myarray2 , columns = colnames2 )

In [7]:
df

Unnamed: 0,one,two,three,four,five,six,target
0,1,1,3,v,a,x,0
1,2,2,2,v,b,y,1
2,4,5,1,w,c,z,1
3,2,1,9,w,c,x,1
4,1,0,4,w,b,y,1
5,2,2,3,v,b,x,0


In [12]:
df2

Unnamed: 0,one,two,three,four,five,six,target
0,2,7,3,v,a,x,0
1,9,2,2,v,a,y,0
2,4,5,1,w,c,k,1


In [30]:
df[ 'target' ] = df[ 'target' ].astype( 'float64' )

In [6]:
df

Unnamed: 0,one,two,three,four,five,six,target
0,1,1,3,v,a,x,0.0
1,2,2,2,v,b,y,1.0
2,4,5,1,w,c,z,1.0
3,2,1,9,w,c,x,1.0
4,1,0,4,w,b,y,1.0
5,2,2,3,v,b,z,0.0


#### I.1 - Sélection de colonnes :

In [7]:
class FeatureSelector( BaseEstimator, TransformerMixin ):
 
    def __init__( self, col2keep ) :
        self._col2keep = col2keep 
      
    def fit( self, X, y = None ):
        return self 
    
    
    def transform( self, X, y = None ) :
        return X[ self._col2keep ] 

#### I.2 - Sélection de colonnes suivant leur type :

In [8]:
class TypeSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform( self , X ):
        assert isinstance( X , pd.DataFrame )
        return X.select_dtypes( include = [ self.dtype ] )

#### II.1 - Conversion de colonnes en numérique :

In [9]:
class Cast2Num( BaseEstimator , TransformerMixin ) :
    def __init__( self , col2trans ) :
        self._col2trans = col2trans
        
    def fit( self, X, y = None ):
        return self 
        
    def transform( self , X , y = None ) :
        X[ self._col2trans ] = X[ self._col2trans ].astype( 'float64' )
        return X

#### II.2 - Conversion de colonnes en catégories :

In [10]:
class Cast2Cat( BaseEstimator , TransformerMixin ) :
    
    def __init__( self , col2trans ) :
        self._col2trans = col2trans
        
    def fit( self, X, y = None ):
        return self 
        
    def transform( self , X , y = None ) :
        X[ self._col2trans ] = X[ self._col2trans ].astype( 'category' )
        return X

#### III.1 - Remplacer les modalités d'une variable catégorielle par leur ratio :

In [14]:
class Cat2Rat( BaseEstimator , TransformerMixin ) :
    
    def __init__( self , col2trans ) :
        self._col2trans = col2trans


    def fit( self , X , y = None ):
        self._dic_col_p = {}

        for col in self._col2trans :
            p = X[ col ].value_counts() / X.shape[0]
            dic = dict( [ ( i , p[i] ) for i in ( X[ col ].value_counts() ).index ] )
            self._dic_col_p.update( { col : dic } )
        return self


    def transform( self , X , y = None ) :

        for col , dic_p in self._dic_col_p.items() : 
            print( 'colonne : {0} | dictionnaire : {1}'.format( col , dic_p ) )
            X.replace( { col : dic_p } , inplace = True )
        return X

#### <font color = red> III.2 - Remplacer les modalités d'une variable catégorielle par la moyenne d'une variable cible au sein de chaque modalité : </font>

In [37]:
class Cat2TargetMean( BaseEstimator , TransformerMixin ) :
    
    """
    Replace values of a categorical variable by the mean of target variable
    
     - col2trans : list of catagorical columns to transform
     - tgt_col : target variable
    """
    
    def __init__( self , col2trans , tgt_col ) :
        self._col2trans = col2trans
        self._tgt_col = tgt_col
        
    def fit( self , X , y = None ) :
        self._dfl_val = X[ self._tgt_col ].mean()
        self._dic_col_p = {}
        
        for col in self._col2trans :
            p = X.groupby( col ).mean()[ self._tgt_col ]
            self._dic_col_p.update( { col : p.to_dict() } )
        return self 
        
    def transform( self , X , y = None ) :
        for col , dic_p in self._dic_col_p.items() : 
            #X.replace( { col : dic_p } , inplace = True )  
            print( dic_p )
            X[ col ] = X[ col ].map( dic_p ).fillna( self._dfl_val )
        return X

#### IV.1 - Remplacer les valeurs Nan par le mode :

In [13]:
class Nan2Mod( BaseEstimator , TransformerMixin ) :
    
    def __init__( self , col2trans ) :
        self._col2trans = col2trans
        
    def fit( self, X, y = None ):
        return self 
        
    def transform( self , X , y = None ) :
        for col in self._col2trans :
            X[ col ] = X[ col ].fillna( X[ col ].mode()[0] ) 
        return X

#### IV.2 - Remplacer les valeurs Nan par la moyenne :

In [14]:
class Nan2Mean( BaseEstimator , TransformerMixin ) :
    
    def __init__( self , col2trans ) :
        self._col2trans = col2trans
        
    def fit( self, X, y = None ):
        return self 
        
    def transform( self , X , y = None ) :
        for col in self._col2trans :
            X[ col ] = X[ col ].fillna( X[ col ].mean() )  
        return X

In [15]:
class Nan2Func( BaseEstimator , TransformerMixin ) :
    
    def __init__( self , col2trans , func ) :
        self._col2trans = col2trans
        self._func = func
        
    def fit( self, X, y = None ):
        return self 
        
    def transform( self , X , y = None ) :
        print( func )
        for col in self._col2trans :
            X[ col ] = X[ col ].fillna( X[ col ].func )  

### <font color = 'blue'> Transformations </font>

### <font color = 'orange'> Passage en numérique </font>

In [32]:
cast = Cast2Num( [ 'one' , 'two' , 'three' , 'target' ] )

cast.transform( df )

cast.transform( df2 )

Unnamed: 0,one,two,three,four,five,six,target
0,2.0,7.0,3.0,v,a,x,0.0
1,9.0,2.0,2.0,v,a,y,0.0
2,4.0,5.0,1.0,w,c,k,1.0


In [15]:
df2.dtypes

one       float64
two       float64
three     float64
four       object
five       object
six        object
target    float64
dtype: object

### <font color = 'orange'> Variables catégorielles : modalités remplacées par moyenne variable cible au sein modalité </font>

In [38]:
tsf = Cat2TargetMean( [ 'four' , 'five' , 'six' ] , 'target' )

tsf.fit( df )

tsf.transform( df )

{'v': 0.3333333333333333, 'w': 1.0}
{'a': 0.0, 'b': 0.6666666666666666, 'c': 1.0}
{'x': 0.5, 'y': 1.0, 'z': 0.5}


Unnamed: 0,one,two,three,four,five,six,target
0,1.0,1.0,3.0,0.333333,0.0,0.5,0.0
1,2.0,2.0,2.0,0.333333,0.666667,1.0,1.0
2,4.0,5.0,1.0,1.0,1.0,0.5,1.0
3,2.0,1.0,9.0,1.0,1.0,0.5,1.0
4,1.0,0.0,4.0,1.0,0.666667,1.0,1.0
5,2.0,2.0,3.0,0.333333,0.666667,0.5,0.0


In [36]:
df2[ 'six' ].map( {'x': 0.5, 'y': 1.0, 'z': 0.5} ).fillna( df[ 'target' ].mean() )

0    0.500000
1    1.000000
2    0.666667
Name: six, dtype: float64

In [39]:
tsf.transform( df2 )

{'v': 0.3333333333333333, 'w': 1.0}
{'a': 0.0, 'b': 0.6666666666666666, 'c': 1.0}
{'x': 0.5, 'y': 1.0, 'z': 0.5}


Unnamed: 0,one,two,three,four,five,six,target
0,2.0,7.0,3.0,0.333333,0.0,0.5,0.0
1,9.0,2.0,2.0,0.333333,0.0,1.0,0.0
2,4.0,5.0,1.0,1.0,1.0,0.666667,1.0


### <font color = 'orange'> Variables catégorielles : modalités remplacées proportion </font>

In [15]:
tsf = Cat2Rat( [ 'six' ] )

tsf.fit( df )

tsf.transform( df2 )

colonne: six | dictionnaire: {'x': 0.5, 'y': 0.3333333333333333, 'z': 0.16666666666666666}


Unnamed: 0,one,two,three,four,five,six,target
0,2,7,3,v,a,0.5,0
1,9,2,2,v,a,0.333333,0
2,4,5,1,w,c,k,1


## <font color = 'red'>Pipeline</font>

https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
    
https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/

In [19]:
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline

In [20]:
num_var = [ 'one' , 'two' , 'three' ]
cat_var = [ 'four' , 'five' , 'six' ]

tgt_var = [ 'target' ]

In [85]:
# pipeline : variables catégorielles

steps = [ ( 'cat_var_sel' , FeatureSelector( cat_var ) ) ,
          ( 'cast2cat' , Cast2Cat( cat_var ) ) ,
          ( 'cat_var_tsf' , Cat2TargetMean( cat_var , 'target' ) ) ]

pip_tsf_cat_var = Pipeline( steps )

steps = [ ( 'num_var_sel' , FeatureSelector( num_var ) ) , 
          ( 'cast2num' , Cast2Num( num_var ) ) ,
          ( 'num_var_tsf' , Nan2Mean( num_var ) ) ]

pip_tsf_num_var = Pipeline( steps )

In [86]:
# Combinaison des pipelines

steps = [ ( 'pip_tsf_cat_var' , pip_tsf_cat_var ) ,
          ( 'pip_tsf_num_var' , pip_tsf_num_var ) ]

pip_tsf = FeatureUnion( steps )

In [87]:
X_tsf = pip_tsf.transform( X )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


AttributeError: 'Cat2TargetMean' object has no attribute '_dic_col_p'

In [89]:
c2tm = Cat2TargetMean( cat_var , 'target' )

c2tm.fit( df )

Cat2TargetMean(col2trans=None, tgt_col=None)

In [79]:
X_tsf

array([['v', 'a', 1.0, 1.0, 3.0],
       ['v', 'b', 2.0, 2.0, 2.0],
       ['w', 'c', 4.0, 5.0, 1.0],
       ['w', 'c', 2.0, 1.0, 9.0],
       ['w', 'b', 1.0, 0.0, 4.0]], dtype=object)

## <font color = 'red'>Pipeline & Classification </font>

https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [68]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

In [69]:
X = df.drop( 'target' , axis = 1 )

y = df[ 'target' ].values

In [70]:
X_train, X_test, y_train, y_test = train_test_split( X , y , test_size = 0.1 , random_state = 123 )

In [71]:
steps =  [ ( 'pip_tsf' , pip_tsf ) ,
           ( 'clf' , RandomForestClassifier() ) ]

pip = Pipeline( steps )

In [72]:
pip.fit( X_train , y_train )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


DataError: No numeric types to aggregate

In [None]:
y_pred = pip.predict( X_test )

In [63]:
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    # 
    def __init__(self, col2trans ):
        self._col2trans = col2trans
        
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self

    
    #Helper function that converts values to Binary depending on input 
    def bin_col(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'
    
    #Transformer : valeur 0 remplacée par No, Yes sinon 
    def transform(self, X , y = None ):
        for col in self._col2trans :
            X.loc[ : , col ] = X[ col ].apply( self.bin_col )    
        return X.values 

In [64]:
class NumericalTransformer( BaseEstimator, TransformerMixin ) :
    
    def __init__(self, col2trans ):
        self._col2trans = col2trans
        
    def transform( self , X , y = None ) :
        for col in self._col2trans :
            new_col = 'p_' + str( col )
            X[ new_col ] = X[ col ] / X[ col ].sum()

In [68]:
ns = NumericalTransformer( [ 'one' , 'two' , 'three'] )

ns.transform( mydataframe )

In [69]:
mydataframe

Unnamed: 0,one,two,three,p_one,p_two,p_three
a,1.0,,3.0,0.2,,1.0
b,4.0,5.0,0.0,0.8,1.0,0.0


In [70]:
fs = FeatureSelector( col2keep = [ 'two' , 'three' ] )

fs.transform( mydataframe )

Unnamed: 0,two,three
a,,3.0
b,5.0,0.0
