In [1]:
%load_ext autoreload
%autoreload 2
    
import collections

import sklearn
import pandas as pd
from sklearn import preprocessing
import numpy as np

## One hot encoder sklearn

In [2]:
df_ = pd.DataFrame([{'c1':'a', 'c2':'c'},{'c1':'b', 'c2':'d'},{'c1':'a', 'c2':'e'} ])
df_

Unnamed: 0,c1,c2
0,a,c
1,b,d
2,a,e


In [3]:
oh_sk = sklearn.preprocessing.OneHotEncoder()

In [4]:
oh_sk.fit(df_)

OneHotEncoder()

In [5]:
oh_sk.categories_

[array(['a', 'b'], dtype=object), array(['c', 'd', 'e'], dtype=object)]

In [6]:
oh_sk.transform(df_)

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [7]:
oh_sk.feature_names_in_

array(['c1', 'c2'], dtype=object)

In [8]:
oh_sk.get_feature_names_out()

array(['c1_a', 'c1_b', 'c2_c', 'c2_d', 'c2_e'], dtype=object)

## One hot encoder in column transformer

In [9]:
df_ = pd.DataFrame([{'c1':'a', 'c2':'c', 'c3':2.32},
                    {'c1':'b', 'c2':'d', 'c3':1.1},
                    {'c1':'a', 'c2':'e', 'c3':1.1} ])


In [10]:
df_

Unnamed: 0,c1,c2,c3
0,a,c,2.32
1,b,d,1.1
2,a,e,1.1


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

column_trans = ColumnTransformer(
    [('onehot', OneHotEncoder(dtype='int'),['c1','c2'])],
    remainder='passthrough',
    sparse_threshold=1.)

In [12]:
column_trans.fit(df_)

ColumnTransformer(remainder='passthrough', sparse_threshold=1.0,
                  transformers=[('onehot', OneHotEncoder(dtype='int'),
                                 ['c1', 'c2'])])

In [13]:
X_encoded = column_trans.transform(df_)
X_encoded

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [14]:
column_trans.transform(df_).todense()

matrix([[1.  , 0.  , 1.  , 0.  , 0.  , 2.32],
        [0.  , 1.  , 0.  , 1.  , 0.  , 1.1 ],
        [1.  , 0.  , 0.  , 0.  , 1.  , 1.1 ]])

In [15]:
column_trans.get_feature_names_out()

array(['onehot__c1_a', 'onehot__c1_b', 'onehot__c2_c', 'onehot__c2_d',
       'onehot__c2_e', 'remainder__c3'], dtype=object)

In [16]:
column_trans.transformers_

[('onehot', OneHotEncoder(dtype='int'), ['c1', 'c2']),
 ('remainder', 'passthrough', [2])]

In [17]:

df_[['c1','c2']]

Unnamed: 0,c1,c2
0,a,c
1,b,d
2,a,e


## StreamOnehotencoder with partial fit

We could implement a `StreamOneHot` class with a `partial_fit` that keeps updating a dict with the found values.

In [18]:
categories = collections.defaultdict(set)
categories['c1'].update(['b','b','a','c'])
categories['c2'].update(['a','b','c','d','e','f'])

In [19]:
categories

defaultdict(set, {'c1': {'a', 'b', 'c'}, 'c2': {'a', 'b', 'c', 'd', 'e', 'f'}})

In [20]:
for col in categories.keys():
    categories_col = list(categories[col])
    #categories_col.sort()
    categories[col]= categories_col
    
encoders = []
for k,v in categories.items():
    encoder = OneHotEncoder()
    encoder.fit(pd.DataFrame({k:v}))
    encoders.append(encoder)

In [21]:
print(encoders[0].get_feature_names_out())
print(encoders[1].get_feature_names_out())

['c1_a' 'c1_b' 'c1_c']
['c2_a' 'c2_b' 'c2_c' 'c2_d' 'c2_e' 'c2_f']


In [22]:
print(encoders[0].categories_)
print(encoders[1].categories_)

[array(['a', 'b', 'c'], dtype=object)]
[array(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)]


In [23]:
a = 4

In [24]:
a if a>2 else 123

4

In [25]:
from sklearn.base import TransformerMixin, BaseEstimator

import scipy.sparse as sp

class StreamOneHotEncoder(TransformerMixin, BaseEstimator):
    
    def __init__(self, columns=None):
        self.columns = columns
        
    def _initialize_cols(self):
        if 'categories_per_col_' not in self.__dict__:
            self.categories_per_col_ = collections.defaultdict(set)
    
    def fit(self,X,y=None):
        self._initialize_cols()
        
        self.partial_fit(X,y)
    
    def fit_transform(self, X, y=None):
        self.fit(X,y)
        return self.transform(X)
    
    def partial_fit(self, X, y=None):
        self._initialize_cols()
        
        if self.columns is None:
            if isinstance(X, pd.DataFrame):
                self.columns = list(X.columns)
                
        for col in self.columns:
            self.categories_per_col_[col].update(X[col])
        
    def _prepare_onehotencoders(self):            
        self.onehotencoders_ = {}
        for col,values in self.categories_per_col_.items():
            encoder = OneHotEncoder()
            encoder.fit(pd.DataFrame({col: list(values)}))
            self.onehotencoders_[col] = encoder
            
    def transform(self, X):
        if 'onehotencoders_' not in self.__dict__:
            self._prepare_onehotencoders()
        
        results = []
        for col in self.columns:
            res = self.onehotencoders_[col].transform(X[[col]])
            results.append(res)
        
        return sp.hstack(results)
    
    def get_feature_names_out(self):
        if 'onehotencoders_' not in self.__dict__:
            self._prepare_onehotencoders()

        names = []
        for col in self.columns:
            names.extend(self.onehotencoders_[col].get_feature_names_out())

        return np.array(names)
            

One can use .fit

In [26]:
soh = StreamOneHotEncoder(columns=['c1','c2'])
soh.partial_fit(df_)

In [27]:
print(soh.categories_per_col_)

defaultdict(<class 'set'>, {'c1': {'a', 'b'}, 'c2': {'e', 'c', 'd'}})


In [28]:
print(soh.get_feature_names_out())

['c1_a' 'c1_b' 'c2_c' 'c2_d' 'c2_e']


In [29]:
print(soh.categories_per_col_)
print(soh.get_feature_names_out())
display(soh.__dict__)

defaultdict(<class 'set'>, {'c1': {'a', 'b'}, 'c2': {'e', 'c', 'd'}})
['c1_a' 'c1_b' 'c2_c' 'c2_d' 'c2_e']


{'columns': ['c1', 'c2'],
 'categories_per_col_': defaultdict(set,
             {'c1': {'a', 'b'}, 'c2': {'c', 'd', 'e'}}),
 'onehotencoders_': {'c1': OneHotEncoder(), 'c2': OneHotEncoder()}}

One can also use partial fit

In [30]:
soh = StreamOneHotEncoder(columns=['c1','c0'])

soh.__dict__

{'columns': ['c1', 'c0']}

In [31]:
soh = StreamOneHotEncoder(columns=['c1','c2'])

n_rows = df_.shape[0]

for i in range(n_rows):
    soh.partial_fit(df_.iloc[[i]])
    print(soh.categories_per_col_)

defaultdict(<class 'set'>, {'c1': {'a'}, 'c2': {'c'}})
defaultdict(<class 'set'>, {'c1': {'a', 'b'}, 'c2': {'c', 'd'}})
defaultdict(<class 'set'>, {'c1': {'a', 'b'}, 'c2': {'e', 'c', 'd'}})


In [32]:
soh.transform(df_)

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [33]:
soh.get_feature_names_out()

array(['c1_a', 'c1_b', 'c2_c', 'c2_d', 'c2_e'], dtype='<U4')

We can also instanciate a new object from a previous set of hyperparameters

In [34]:
soh_from_params = StreamOneHotEncoder(soh.get_params())

In [35]:
soh_from_params.get_params()

{'columns': {'columns': ['c1', 'c2']}}

In [36]:
soh_from_params.columns

{'columns': ['c1', 'c2']}

In [37]:
soh_from_params

StreamOneHotEncoder(columns={'columns': ['c1', 'c2']})

## ColumnTransformer with fit  



In [38]:
column_trans = ColumnTransformer(
    [('onehotc1', StreamOneHotEncoder(),['c1']),
     ('onehotc2', StreamOneHotEncoder(),['c2'])    
    ],
    remainder='passthrough',
    sparse_threshold=1.)

We can iterate over transformers and partial fit them in the column of data we care about, which is given by the column provided

In [39]:
print(column_trans.transformers[0])
print(column_trans.transformers[1])

('onehotc1', StreamOneHotEncoder(), ['c1'])
('onehotc2', StreamOneHotEncoder(), ['c2'])


In [40]:
#column_trans.transformers[1][1].fit_transform(df_[['c1']])

In [41]:
column_trans.fit(df_)

ColumnTransformer(remainder='passthrough', sparse_threshold=1.0,
                  transformers=[('onehotc1', StreamOneHotEncoder(), ['c1']),
                                ('onehotc2', StreamOneHotEncoder(), ['c2'])])

In [42]:
df_

Unnamed: 0,c1,c2,c3
0,a,c,2.32
1,b,d,1.1
2,a,e,1.1


In [43]:
column_trans.transform(df_)

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

#### See equivalence with OneHotEncoder

In [44]:
from sklearn.preprocessing import OneHotEncoder
column_trans = ColumnTransformer(
    [('onehotc1', OneHotEncoder(),['c1']),
     ('onehotc2', OneHotEncoder(),['c2'])    
    ],
    remainder='passthrough',
    sparse_threshold=1.)

In [45]:
column_trans.fit(df_)

ColumnTransformer(remainder='passthrough', sparse_threshold=1.0,
                  transformers=[('onehotc1', OneHotEncoder(), ['c1']),
                                ('onehotc2', OneHotEncoder(), ['c2'])])

In [46]:
column_trans.transform(df_)

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

## StreamColumnTransformer with fit  partial fit

We could implement a `StreamOneHot` class with a `partial_fit` that keeps updating a dict with the found values.

In [47]:
from stream_column_transformer import StreamColumnTransformer

In [48]:
column_trans = StreamColumnTransformer(
    [('onehotc1', StreamOneHotEncoder(),['c1']),
     ('onehotc2', StreamOneHotEncoder(),['c2'])    
    ],
    remainder='passthrough',
    sparse_threshold=1.)

In [49]:

n_rows = df_.shape[0]

for i in range(n_rows):
    print(f'{display(df_.iloc[[i]])} ')
    column_trans.partial_fit(df_.iloc[[i]])
    print(column_trans.transformers[0][1].categories_per_col_)
    print(column_trans.transformers[1][1].categories_per_col_)

Unnamed: 0,c1,c2,c3
0,a,c,2.32


None 
defaultdict(<class 'set'>, {'c1': {'a'}})
defaultdict(<class 'set'>, {'c2': {'c'}})


Unnamed: 0,c1,c2,c3
1,b,d,1.1


None 
defaultdict(<class 'set'>, {'c1': {'a', 'b'}})
defaultdict(<class 'set'>, {'c2': {'c', 'd'}})


Unnamed: 0,c1,c2,c3
2,a,e,1.1


None 
defaultdict(<class 'set'>, {'c1': {'a', 'b'}})
defaultdict(<class 'set'>, {'c2': {'e', 'c', 'd'}})
