In [345]:
import collections

import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing

## One hot encoder sklearn

In [408]:
# Toy frame with two categorical columns, written column-wise.
df_ = pd.DataFrame({
    'c1': ['a', 'b', 'a'],
    'c2': ['c', 'd', 'e'],
})
df_

Unnamed: 0,c1,c2
0,a,c
1,b,d
2,a,e


In [409]:
# Encoder with all-default settings, reached through the `preprocessing`
# submodule imported at the top of the notebook.
oh_sk = preprocessing.OneHotEncoder()

In [410]:
# Learn the distinct values of every column of df_; fit() hands back the
# encoder, which is why its repr is displayed below.
oh_sk.fit(df_)

OneHotEncoder()

In [411]:
# One array of learned categories per input column, in column order.
oh_sk.categories_

[array(['a', 'b'], dtype=object), array(['c', 'd', 'e'], dtype=object)]

In [412]:
# The encoded result is a sparse matrix: 3 rows x (2 + 3) indicator columns,
# with one stored entry per (row, input column) pair.
oh_sk.transform(df_)

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [413]:
# Names of the input columns the encoder was fitted on.
oh_sk.feature_names_in_

array(['c1', 'c2'], dtype=object)

In [414]:
# Output column names follow the pattern <input column>_<category>.
oh_sk.get_feature_names_out()

array(['c1_a', 'c1_b', 'c2_c', 'c2_d', 'c2_e'], dtype=object)

## One hot encoder in column transformer

In [426]:
# Same two categorical columns as before, plus a numeric column c3 that
# should not be one-hot encoded.
df_ = pd.DataFrame({
    'c1': ['a', 'b', 'a'],
    'c2': ['c', 'd', 'e'],
    'c3': [2.32, 1.1, 1.1],
})


In [427]:
# Show the three-column frame (two categorical columns and the numeric c3).
df_

Unnamed: 0,c1,c2,c3
0,a,c,2.32
1,b,d,1.1
2,a,e,1.1


In [417]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route c1 and c2 through a single one-hot step and let every remaining
# column pass through untouched.
column_trans = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(dtype='int'), ['c1', 'c2'])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

In [418]:
# Fit the one-hot step on c1/c2; fit() returns the transformer, hence the
# repr shown below.
column_trans.fit(df_)

ColumnTransformer(remainder='passthrough', sparse_threshold=1.0,
                  transformers=[('onehot', OneHotEncoder(dtype='int'),
                                 ['c1', 'c2'])])

In [420]:
# Encode the frame: 5 indicator columns plus the passthrough column c3.
# NOTE(review): the result is float64 although the encoder was created with
# dtype='int' -- presumably the float passthrough column forces a common
# dtype when the pieces are stacked; confirm against the sklearn docs.
X_encoded = column_trans.transform(df_)
X_encoded

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [91]:
# Densify only for inspection: the last column is the untouched c3 values.
column_trans.transform(df_).todense()

matrix([[1.  , 0.  , 1.  , 0.  , 0.  , 2.32],
        [0.  , 1.  , 0.  , 1.  , 0.  , 1.1 ],
        [1.  , 0.  , 0.  , 0.  , 1.  , 1.1 ]])

In [89]:
# Output names are prefixed with the step name ("onehot__", "remainder__").
column_trans.get_feature_names_out()

array(['onehot__c1_a', 'onehot__c1_b', 'onehot__c2_c', 'onehot__c2_d',
       'onehot__c2_e', 'remainder__c3'], dtype=object)

In [92]:
# Fitted steps; the remainder is recorded with the positional index of c3.
column_trans.transformers_

[('onehot', OneHotEncoder(dtype='int'), ['c1', 'c2']),
 ('remainder', 'passthrough', [2])]

In [96]:

# Select only the two categorical columns (the list selector keeps the
# result a DataFrame).
df_[['c1','c2']]

Unnamed: 0,c1,c2
0,a,c
1,b,d
2,a,e


## StreamOnehotencoder with partial fit

We could implement a `StreamOneHotEncoding` class with a `partial_fit` method that keeps updating a dict with the values found so far in each column.

In [365]:
# Stand-in for what a streaming encoder would accumulate: one set of seen
# values per column (the repeated 'b' collapses automatically).
categories = collections.defaultdict(
    set,
    {
        'c1': set(['b', 'b', 'a', 'c']),
        'c2': set(['a', 'b', 'c', 'd', 'e', 'f']),
    },
)

In [366]:
# Each column maps to the set of its distinct values.
categories

defaultdict(set, {'c1': {'a', 'b', 'c'}, 'c2': {'a', 'b', 'c', 'd', 'e', 'f'}})

In [367]:
# Freeze every set of seen values into a list so it can become a DataFrame
# column. Only existing keys are reassigned, so the dict keeps its size
# while it is being iterated.
for col in categories.keys():
    categories[col] = list(categories[col])

# One independent encoder per column, fitted on that column's distinct
# values; fit() returns the encoder itself, so it can be collected directly.
encoders = [
    OneHotEncoder().fit(pd.DataFrame({name: values}))
    for name, values in categories.items()
]

In [368]:
# Feature names of the two per-column encoders, one line per encoder.
for position in (0, 1):
    print(encoders[position].get_feature_names_out())

['c1_a' 'c1_b' 'c1_c']
['c2_a' 'c2_b' 'c2_c' 'c2_d' 'c2_e' 'c2_f']


In [371]:
# Learned categories of the two per-column encoders, one line per encoder.
for position in (0, 1):
    print(encoders[position].categories_)

[array(['a', 'b', 'c'], dtype=object)]
[array(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)]


In [481]:
from sklearn.base import TransformerMixin
import scipy.sparse as sp

class StreamOneHotEncoding(TransformerMixin):
    """One-hot encoder whose categories are learned incrementally.

    Every call to ``partial_fit`` adds the values found in the given batch to
    a per-column set of known categories. The actual ``OneHotEncoder``
    objects (one per column) are built lazily from those sets the first time
    they are needed, and rebuilt whenever new data has been seen since.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode. When omitted, the columns of the first DataFrame
        passed to ``partial_fit`` are used.

    Attributes
    ----------
    categories_per_col : collections.defaultdict(set)
        Distinct values seen so far, keyed by column name.
    onehotencoders_ : dict
        Fitted ``OneHotEncoder`` per column; only present once the encoders
        have been built from the current categories.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.categories_per_col = collections.defaultdict(set)

    def _initialize_cols(self, X):
        """Resolve ``self.columns`` and make sure each column has a category set.

        Raises
        ------
        TypeError
            If no columns were configured and ``X`` is not a DataFrame, so
            there is nothing to infer them from.
        """
        if self.columns is None:
            if not isinstance(X, pd.DataFrame):
                raise TypeError(
                    "columns must be given explicitly when X is not a "
                    "pandas DataFrame"
                )
            self.columns = list(X.columns)

        for col in self.columns:
            self.categories_per_col.setdefault(col, set())

    def partial_fit(self, X, y=None):
        """Add the values found in ``X`` to the known categories.

        Parameters
        ----------
        X : pandas.DataFrame (or any mapping of column name -> iterable)
            Batch of data; only ``self.columns`` are read.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self._initialize_cols(X)

        for col in self.columns:
            self.categories_per_col[col].update(X[col])

        # Encoders built before this batch do not know the new values; drop
        # them so they are rebuilt on the next transform.
        self.__dict__.pop('onehotencoders_', None)
        return self

    def fit(self, X, y=None):
        """Forget everything learned so far and learn the categories of ``X``.

        Provided so that the ``fit_transform`` inherited from
        ``TransformerMixin`` works.
        """
        self.categories_per_col = collections.defaultdict(set)
        return self.partial_fit(X, y)

    def _prepare_onehotencoders(self):
        """Fit one ``OneHotEncoder`` per column on the categories seen so far."""
        self.onehotencoders_ = {}
        for col, values in self.categories_per_col.items():
            encoder = OneHotEncoder()
            encoder.fit(pd.DataFrame({col: list(values)}))
            self.onehotencoders_[col] = encoder

    def _fitted_encoders(self):
        """Return the per-column encoders, building them first if needed."""
        if 'onehotencoders_' not in self.__dict__:
            self._prepare_onehotencoders()
        return self.onehotencoders_

    def transform(self, X):
        """One-hot encode ``self.columns`` of ``X``.

        Returns the horizontally stacked sparse encodings, one group of
        indicator columns per input column, in the order of ``self.columns``.
        Values never seen by ``partial_fit`` are handled by the default
        unknown-category policy of ``OneHotEncoder``.
        """
        encoders = self._fitted_encoders()
        blocks = [encoders[col].transform(X[[col]]) for col in self.columns]
        return sp.hstack(blocks)

    def get_feature_names_out(self):
        """Return the output feature names as a NumPy array.

        Names are the concatenation of each column encoder's own feature
        names, in the order of ``self.columns``. The encoders are built on
        demand, so this can be called before ``transform``.
        """
        encoders = self._fitted_encoders()
        names = []
        for col in self.columns:
            names.extend(encoders[col].get_feature_names_out())

        return np.array(names)
            

In [482]:
soh = StreamOneHotEncoding(columns=['c1', 'c2'])

# Feed the frame one row at a time to mimic a stream, printing the
# categories accumulated after each row.
n_rows = len(df_)
for row_pos in range(n_rows):
    single_row = df_.iloc[row_pos:row_pos + 1]
    soh.partial_fit(single_row)
    print(soh.categories_per_col)


defaultdict(<class 'set'>, {'c1': {'a'}, 'c2': {'c'}})
defaultdict(<class 'set'>, {'c1': {'b', 'a'}, 'c2': {'d', 'c'}})
defaultdict(<class 'set'>, {'c1': {'b', 'a'}, 'c2': {'e', 'd', 'c'}})


In [483]:
# Same shape and number of stored entries as the plain OneHotEncoder
# produced for the c1/c2 columns earlier in the notebook.
soh.transform(df_)

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [484]:
# Distinct values accumulated per column over all partial_fit calls.
soh.categories_per_col

defaultdict(set, {'c1': {'a', 'b'}, 'c2': {'c', 'd', 'e'}})

In [486]:
# Names follow the <column>_<category> pattern, columns in constructor order.
soh.get_feature_names_out()

array(['c1_a', 'c1_b', 'c2_c', 'c2_d', 'c2_e'], dtype='<U4')

## StreamColumnTransformer with partial fit

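We could implement a `StreamColumnTransformer` class with a `partial_fit` that forwards each batch to its transformers (for example the `StreamOneHot` encoder above), so that each of them keeps updating its dict with the found values.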

In [297]:
# One separate one-hot step per categorical column, so each step can be
# addressed (and fitted) on its own; all other columns pass through.
column_trans = ColumnTransformer(
    transformers=[
        ('onehotc1', OneHotEncoder(dtype='int'), ['c1']),
        ('onehotc2', OneHotEncoder(dtype='int'), ['c2']),
    ],
    remainder='passthrough',
    sparse_threshold=1.0,
)

We can iterate over the transformers and partial-fit each one on the columns it is responsible for, which are given by the third element of each `(name, transformer, columns)` tuple.

In [299]:
# First step as passed to the constructor: a (name, transformer, columns) tuple.
column_trans.transformers[0]

('onehotc1', OneHotEncoder(dtype='int'), ['c1'])

In [300]:
# Second step as passed to the constructor: a (name, transformer, columns) tuple.
column_trans.transformers[1]

('onehotc2', OneHotEncoder(dtype='int'), ['c2'])