In [2]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.preprocessing import Imputer
from sklearn.base import TransformerMixin

In [9]:
test_data =  pd.DataFrame({
    'c': ['c1', np.nan, 'c1', 'c1'],
    'd': ['c2', np.nan, 'c3', 'c3'],
})
    
expected_data =  pd.DataFrame({
    'c': ['c1', 'c1', 'c1', 'c1'],
    'd': ['c2', 'c3', 'c3', 'c3'],
})

In [13]:
test_data2 = pd.DataFrame({
    'c':[1,np.nan,2,3],
    'd':[1,np.nan,2,2]
})

In [10]:
imp = Imputer(strategy='most_frequent')

In [16]:
imp.fit_transform(test_data2)

array([[ 1.,  1.],
       [ 1.,  2.],
       [ 2.,  2.],
       [ 3.,  2.]])

In [17]:
test_data

Unnamed: 0,c,d
0,c1,c2
1,,
2,c1,c3
3,c1,c3


## Most frequent value Imputer for categorical columns

In [44]:
class TopValueImputer(TransformerMixin):
    
    def __init__(self):
        self.top_values = {} # {'nazwa_kolumny': najczęstsza wartość}
    
    def fit(self, X, y=None):
        # zapamiętanie najczęstszego poziomu dla każdej z kolumn
        for col in X.columns:
            counts = X[col].value_counts()
            if counts.shape[0] == 0:
                self.top_values[col] = 'nan'
                continue
            top_count = counts.argmax()
            self.top_values[col] = top_count            
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        # uzupełnienie danych odpowiednią wartością
        for col in X.columns:
            X[col] = X[col].fillna(self.top_values[col])
        return X


Caly preprocessing w formie pipelineow zapamietujemy wartosc imputera a potem stosujemy na nowej obserwacji, danych testowych

In [53]:
my_imputer = TopValueImputer()

In [54]:
my_imputer.fit_transform(test_data)

Unnamed: 0,c,d
0,c1,c2
1,c1,c3
2,c1,c3
3,c1,c3


## One hot encoding for categorical columns

In [45]:
class RobustOneHotEncoding(TransformerMixin):
    def __init__(self):
        self.columns = None
    
    def fit(self, X, y=None):
        # zapamiętanie, jakie kolumny mogą pojawić się z one hot encoding
        output = pd.get_dummies(X)
        self.columns = output.columns
        return self
    
    def transform(self, X, y=None):
        # zrobić pd.get_dummies i uzupełnić brakujące kolumny, ewentualnie usunąć nadmiarowe
        output = pd.get_dummies(X)

        for col in output.columns:
            if col not in self.columns:
                output.drop(col,axis=1,inplace=True)
            else:
                continue
                
        for col in self.columns:
            if col not in output.columns:
                output[col] = np.nan
                
        return output

In [46]:
fit_data = pd.DataFrame({
    'cat_column': ['c1', 'c2']
})
expected_fit_transform_data = pd.DataFrame({
    'cat_column_c1' : [1, 0],
    'cat_column_c2' : [0, 1],
})

test_data = pd.DataFrame({
    'cat_column': ['c1', 'c3']
})

expected_test_data = pd.DataFrame({
    'cat_column_c1' : [1, 0],
    'cat_column_c2' : [0, 0],
})

In [47]:
enc = RobustOneHotEncoding()

In [48]:
enc.fit(fit_data)

<__main__.RobustOneHotEncoding at 0x7f0469da7710>

In [49]:
fit_data

Unnamed: 0,cat_column
0,c1
1,c2


In [50]:
test_data

Unnamed: 0,cat_column
0,c1
1,c3


In [51]:
enc.transform(test_data)

Unnamed: 0,cat_column_c1,cat_column_c2
0,1,
1,0,


## Keep most frequent levels in categorical columns

In [106]:
class KeepFrequentLevels(TransformerMixin):
    def __init__(self, k=2):
        self.k = k
        self.frequent_values = {} # nazwa_kolumny: lista_poziomów
        
    def fit(self, X, y=None):
        # znaleźć k najczęściej występujących poziomów dla każdej kolumny
        for col in X.columns:
            self.frequent_values[col] = X[col].value_counts()[0:self.k].index

        return self
    
    def transform(self, X, y=None):
        # wszystkie poziomy, które nie występują często, zamienić na 'rare'
        X = X.copy()
        
        for col in X.columns:
            X[col] = X[col].apply(lambda x: x if x in self.frequent_values[col] else 'rare')
        
        return X


In [79]:
test_data = pd.DataFrame({
    'c': ['c1', 'c1', 'c1', 'c2', 'c2', 'c3', 'c4', 'c5']
})
## k = 2
expected_data = pd.DataFrame({
    'c': ['c1', 'c1', 'c1', 'c2', 'c2', 'rare', 'rare', 'rare']
})

In [80]:
test_data.columns

Index(['c'], dtype='object')

In [84]:
test_data['c'].value_counts()[0:2].index

Index(['c1', 'c2'], dtype='object')

In [102]:
kfl = KeepFrequentLevels()

In [103]:
kfl.fit(test_data)

<__main__.KeepFrequentLevels at 0x7f625e839a58>

In [104]:
kfl.frequent_values

{'c': Index(['c1', 'c2'], dtype='object')}

In [105]:
kfl.transform(test_data)

Unnamed: 0,c
0,c1
1,c1
2,c1
3,c2
4,c2
5,rare
6,rare
7,rare


In [108]:
from sklearn.utils import murmurhash3_32

## Hash categorical colmuns

In [109]:
class HashCategorical(TransformerMixin):
    
    def __init__(self, k=10):
        self.k = k
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        #na każdej obserwacji w X zastosować funkcję murmurhash3_32
        X_transformed = X.applymap(lambda x: murmurhash3_32(x) % self.k)
        return X_transformed  