In [29]:
import os
from collections import OrderedDict
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

In [24]:
def convert_to_base10(vector):
    """Convert a vector of binary digits to its base-10 (decimal) integer.

    The first element is treated as the most significant bit, so
    ``[1, 0, 1]`` maps to ``5``.

    Parameters
    ----------
    vector : array-like
        One-dimensional sequence (list, tuple or NumPy array) of binary
        digits. Integer, boolean, whole-valued float and ``'0'``/``'1'``
        string elements are accepted.

    Returns
    -------
    int
        The decimal value of the bit string. A Python ``int`` is returned,
        so arbitrarily long vectors do not overflow.

    Raises
    ------
    ValueError
        If the vector is empty or holds anything other than binary digits.
    """
    bits = np.asarray(vector)

    # Booleans and floats would stringify as 'True' / '1.0', which int(..., 2)
    # cannot parse; normalise them to plain integers first, but refuse values
    # such as 0.5 that are not whole numbers.
    if bits.dtype.kind in 'bf':
        as_int = bits.astype(int)
        if not np.array_equal(as_int, bits):
            raise ValueError(
                'vector must contain only binary digits (0 or 1)'
            )
        bits = as_int

    # int(..., 2) itself rejects empty strings and digits other than 0/1.
    return int(''.join(bits.astype(str)), 2)

# np.apply_along_axis(convert_to_base10, 1, np.array(arr))

In [34]:
class featureXformer(TransformerMixin):
    """Build pairwise interaction ("synthetic") features from categorical domains.

    Every input column (a *domain*) is expected to hold integer entity ids in
    ``[0, dim)``. For each unordered pair of domains ``(a, b)``, taken in the
    order given by ``domain_dims``, a synthetic column named ``'a_b'`` is
    produced. Its value ``x_a * dim_b + x_b`` uniquely identifies the pair of
    ids and lies in ``[0, dim_a * dim_b)``.

    Parameters
    ----------
    domain_dims : mapping or iterable of (name, dim) pairs
        Domain (column) names and their cardinalities. The iteration order
        defines the order of the generated pairs. Names are expected to be
        strings.

    Attributes
    ----------
    domain_dims : OrderedDict
        Domain name -> cardinality.
    num_domains : int
        Number of base domains.
    base_domain_names : list
        Domain names in order.
    syn_domain_dims : dict
        Synthetic column name -> cardinality (``dim_a * dim_b``).
    syn_feature_ohEncoders : ColumnTransformer
        One ``OneHotEncoder`` per synthetic column, each with a fixed
        category list ``0 .. dim_a * dim_b - 1``.

    Raises
    ------
    ValueError
        If fewer than two domains are given, or if two different pairs would
        produce the same synthetic column name.
    """

    def __init__(
        self,
        domain_dims
    ):
        domain_dims = OrderedDict(domain_dims)
        if len(domain_dims) < 2:
            raise ValueError(
                'featureXformer needs at least two domains to build '
                'pairwise features'
            )

        self.domain_dims = domain_dims
        self.num_domains = len(domain_dims)
        self.base_domain_names = list(domain_dims.keys())

        self.syn_domain_dims = {}
        encoders = []
        for dom_i, dom_j in combinations(self.base_domain_names, 2):
            domain_ij = '_'.join([dom_i, dom_j])
            if domain_ij in self.syn_domain_dims:
                raise ValueError(
                    'ambiguous synthetic feature name: {}'.format(domain_ij)
                )
            # Number of distinct (x_i, x_j) combinations.
            dim_ij = int(domain_dims[dom_i]) * int(domain_dims[dom_j])
            self.syn_domain_dims[domain_ij] = dim_ij
            # ColumnTransformer entries are (unique name, transformer, columns);
            # `categories` takes one array of allowed values per input column.
            encoders.append((
                'xform_{}'.format(domain_ij),
                OneHotEncoder(categories=[np.arange(dim_ij)], drop=None),
                [domain_ij],
            ))
        self.syn_feature_ohEncoders = ColumnTransformer(encoders)

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so that ``fit_transform`` works.

        All category lists are fixed at construction time, so nothing is
        learned from ``X``.
        """
        return self

    def transform(self, df_X, one_hot=False):
        """Compute the pairwise synthetic features for ``df_X``.

        Parameters
        ----------
        df_X : pandas.DataFrame
            Must contain one column per base domain, holding integer ids in
            ``[0, dim)``. The frame is not modified.
        one_hot : bool, default False
            If true, one-hot encode every synthetic column and return the
            combined encoding instead of the integer-coded frame.

        Returns
        -------
        pandas.DataFrame or array
            With ``one_hot=False``: an integer DataFrame with one column per
            domain pair (named ``'a_b'``) and a fresh default index.
            With ``one_hot=True``: whatever ``ColumnTransformer.fit_transform``
            returns for those columns (this may be a sparse matrix).

        Raises
        ------
        KeyError
            If a base domain column is missing from ``df_X``.
        ValueError
            If a column holds ids outside ``[0, dim)``.
        """
        ids = {}
        for dom in self.base_domain_names:
            dim = int(self.domain_dims[dom])
            values = np.asarray(df_X[dom]).astype(np.int64)
            if values.size and (values.min() < 0 or values.max() >= dim):
                raise ValueError(
                    'values of domain {!r} must lie in [0, {})'.format(dom, dim)
                )
            ids[dom] = values

        syn_columns = OrderedDict()
        for dom_i, dom_j in combinations(self.base_domain_names, 2):
            # Mixed-radix index: unique for every (x_i, x_j) combination.
            dim_j = int(self.domain_dims[dom_j])
            syn_columns['_'.join([dom_i, dom_j])] = ids[dom_i] * dim_j + ids[dom_j]

        df_x_ij = pd.DataFrame(syn_columns)

        if one_hot:
            # The encoders carry fixed category lists, so fitting here learns
            # nothing data-dependent; it only lets transform() be used
            # without a prior fit() call.
            return self.syn_feature_ohEncoders.fit_transform(df_x_ij)
        return df_x_ij
    
     