# lightgbm categorical problem

The real dataset has around 300k rows, with around 3% positives in the binary target class. It consists of regular numeric columns, ordered categoricals encoded as 1, 2, 3, and factor variables which are dummy-coded. Only the factor variables which do not have too many levels are dummy-coded. The other ones are transformed into a numeric space, e.g. by calculating the percentage of the target class contained in each level of the factor variable. In total (after the dummy coding) there are slightly fewer than 10k columns.

Below I tried to generate a sample DataFrame; however, I did not (yet) succeed.

### Skip the imports - please see the problem below. I was finally able to reproduce it

In [1]:
import logging
logger = logging.getLogger(__name__)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from pylightgbm.models import GBMClassifier
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder

_seed=49

def transformToXy(data):
    """Split a frame into features and the ``NEVERPAYER`` label column.

    Returns a ``(X, y)`` tuple where ``X`` is ``data`` without the
    ``NEVERPAYER`` column and ``y`` is that column on its own.
    """
    label_name = 'NEVERPAYER'
    labels = data[label_name]
    features = data.drop(columns=[label_name])
    return features, labels

def labelEncodeCategoricalData(df):
    """Return a copy of ``df`` with every ``category`` column replaced by its integer codes.

    Columns of any other dtype are copied through untouched; the input
    frame itself is not modified.
    """
    encoded = df.copy()
    is_categorical = encoded.dtypes == 'category'
    for name in encoded.columns[is_categorical]:
        # ``.cat.codes`` is the integer position of each value within the categories.
        encoded[name] = encoded[name].cat.codes
    return encoded

class ColumnExtractor(TransformerMixin):
    """Pipeline step that passes on only a fixed selection of columns."""

    def __init__(self, columns):
        # Column label(s) used to index the input in ``transform``.
        self.columns = columns

    def fit(self, X, *_):
        """Nothing is learned from the data; returns ``self`` for chaining."""
        return self

    def transform(self, X, *_):
        """Return ``X[self.columns]``; any extra positional arguments are ignored."""
        selection = X[self.columns]
        return selection
    
#########################################################
# The percentage transformers perform basic effects coding of categorical variables

class PercentageTransformer(TransformerMixin):
    """Encode one categorical column as a share of positive-target rows per level.

    ``fit`` builds a lookup table with one row per level of ``colname`` and
    ``transform`` left-joins that table onto the data as a new column.

    Parameters
    ----------
    colname : label of the column to encode.
    typePercentage : with ``'totalTarget'`` the count of rows per
        (level, target) pair is divided by the sum of the target column and
        the new column is named ``pre_<colname>``; with any other value the
        count is divided by the number of rows of that level and the new
        column is named ``pre2_<colname>``.
    _target : name of the target column. The ``y`` passed to ``fit`` must
        carry this name, because it is looked up by name after being
        concatenated with ``X``.
    _dropOriginal : when true, ``colname`` is removed from the output of
        ``transform``.
    """

    def __init__(self, colname, typePercentage='totalTarget', _target='NEVERPAYER', _dropOriginal=True):
        self.colname = colname
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage

    def fit(self, X, y, *_):
        """Compute the per-level lookup table from ``X[colname]`` and ``y``.

        ``y`` and ``X`` are aligned on their index. Returns ``self``.
        """
        # Only the encoded column and the target take part in the counting,
        # so there is no need to copy every column of X.
        original = pd.concat([y, X[[self.colname]]], axis=1)
        grouped = original.groupby([self.colname, self._target]).size()
        if self.typePercentage == 'totalTarget':
            shares = grouped / original[self._target].sum()
            self.nameCol = "pre_" + self.colname
        else:
            shares = grouped / grouped.groupby(level=0).sum()
            self.nameCol = "pre2_" + self.colname

        lookup = shares.reset_index(name=self.nameCol)
        # Keep the rows describing the positive class only: one row per level.
        lookup = lookup[lookup[self._target] == 1]
        # ``axis`` must be passed by keyword; pandas 2.0 no longer accepts it positionally.
        self.result = lookup.drop(self._target, axis=1)
        return self

    def transform(self, dataF):
        """Return ``dataF`` with the fitted share column appended.

        Levels that are absent from the lookup table get the value 0.
        """
        mergedThing = pd.merge(dataF, self.result, on=self.colname, how='left')
        # pd.merge hands back a fresh RangeIndex. The lookup table has at most
        # one row per level, so the left join keeps the rows of dataF one to
        # one and in order; restoring the caller's index keeps the output
        # aligned with y when a later step concatenates on the index again.
        mergedThing.index = dataF.index
        mergedThing[self.nameCol] = mergedThing[self.nameCol].fillna(0)
        if self._dropOriginal:
            mergedThing = mergedThing.drop(self.colname, axis=1)
        return mergedThing


class PercentageAllTransformer(TransformerMixin):
    """Apply one ``PercentageTransformer`` per column to several columns.

    Parameters
    ----------
    columnsToTransform : columns for which an encoded column is appended in
        ``transform``. When ``_colBiasDrop`` is given this object must offer
        a ``union`` method (e.g. a pandas ``Index``), as ``fit`` calls it.
    _colBiasDrop : optional columns that are dropped from the output of
        ``transform``. ``fit`` also fits a transformer for each of them.
    typePercentage, _target : forwarded to every ``PercentageTransformer``.
    _dropOriginal : when true, ``columnsToTransform`` are dropped from the
        output of ``transform``.
    """

    def __init__(self, columnsToTransform, _colBiasDrop=None, typePercentage='totalTarget', _target='NEVERPAYER',
                 _dropOriginal=True):
        print(
            'WARN: mutually exclusive options: either _colBiasDrop is None and then _dropOriginal can be True or _colBiasDrop is set and _dropOriginal must be false')
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage
        self.transformers = {}
        self.colsToBias = columnsToTransform
        self._colBiasDrop = _colBiasDrop

        logger.debug("colums to bias %s", self.colsToBias)
        logger.debug("colums to drop %s", self._colBiasDrop)

    def fit(self, X, y, *_):
        """Fit one ``PercentageTransformer`` per column and return ``self``."""
        if self._colBiasDrop is not None:
            colToIterate = self.colsToBias.union(self._colBiasDrop)
        else:
            colToIterate = self.colsToBias
        for col in colToIterate:
            myTransf = PercentageTransformer(col, typePercentage=self.typePercentage, _target=self._target,
                                             _dropOriginal=False)  # deliberately set to False to enable multi-drop
            self.transformers[col] = myTransf.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` extended by one encoded column per entry of ``columnsToTransform``.

        The configured drops are applied afterwards. With no column to
        transform, a copy of ``X`` (minus the configured drops) is returned.
        """
        logger.info('percentage transforming cols')
        pieces = []
        for col in self.colsToBias:
            encoder = self.transformers[col]
            encoded = encoder.transform(X)
            # The first result carries all columns of X plus its new column;
            # from every further result only the new column is needed. The
            # column name is taken from the fitted transformer itself.
            pieces.append(encoded if not pieces else encoded[[encoder.nameCol]])
        # Concatenate once instead of growing the frame inside the loop.
        transformed = pd.concat(pieces, axis=1) if pieces else X.copy()
        if self._dropOriginal:
            transformed = transformed.drop(self.colsToBias, axis=1)
        if self._colBiasDrop is not None:
            transformed = transformed.drop(self._colBiasDrop, axis=1)
        return transformed

In [2]:
# Synthetic stand-in for the real data: a random binary target plus ten random
# integer columns. Both frames are built with the same number of rows, so the
# inner join below does not discard generated data.
N_ROWS = 100000
targetdf = pd.DataFrame(np.random.randint(0, 2, (N_ROWS, 1)), columns=['NEVERPAYER'])
XDF = pd.DataFrame(np.random.randint(0, 100, (N_ROWS, 10)), columns=list('ABCDEFGHIJ'))
mydf = pd.concat([targetdf, XDF], join='inner', axis=1)

# A-D play the role of the factor variables.
for factor_col in ['A', 'B', 'C', 'D']:
    mydf[factor_col] = mydf[factor_col].astype('category')

# Binary indicator column stored as float: 1.0 where E < 50, else 0.0.
mydf['E_smaller50_only01'] = (mydf.E < 50).astype('float64')

#mydf = pd.get_dummies(mydf, sparse=False)
X, y = transformToXy(mydf)

CONTINUOUS_FIELDS = X.select_dtypes(include=['number']).columns  # includes ordered factors
FACTOR_FIELDS = X.select_dtypes(include=['category']).columns
# Chosen randomly here. NOTE(review): in the real dataset these are presumably
# the factors with too many levels to dummy-code - confirm.
columnsToDrop = ['C']
columnsToBias_keep = FACTOR_FIELDS[~FACTOR_FIELDS.isin(columnsToDrop)]
#these columns must be bias-coded. Otherwise the dataframe is blown up too big by dummy coding

X.shape

(100000, 11)

In [3]:
# Keep an untouched copy of the features (categorical dtypes intact) for later inspection.
X_original = X.copy()
# Replace every categorical column of X by its integer category codes.
X = labelEncodeCategoricalData(X) 

In [4]:
# Location handed to the pylightgbm wrapper as ``exec_path``.
pathToLightGBM = '~/neverpayer/lightgbm'
# LightGBM classifier used as the final step of the pipeline below.
clf = GBMClassifier(exec_path=pathToLightGBM, 
                              num_iterations=100, learning_rate=0.1,
                              num_leaves=2000, min_data_in_leaf=100, metric='binary_logloss',
                              feature_fraction=0.7, bagging_fraction=0.7, bagging_freq=0,
                              metric_freq=1, early_stopping_round=20,
                             tree_learner="serial", num_threads=4,
                              is_unbalance=False)

# Steps, in order: keep only the factor columns, append the 'totalTarget'
# percentage columns, append the 'groupwise' percentage columns (dropping
# columnsToDrop), one-hot encode, then fit the classifier.
# NOTE(review): OneHotEncoder is given no column selection here, so it receives
# every column produced by the preceding steps, including the float
# pre_*/pre2_* columns. This is possibly the source of the
# "only contains one value" messages reported below - confirm.
prediction_pipe = Pipeline([
        ('extract', ColumnExtractor(FACTOR_FIELDS)),
        ('bias1', PercentageAllTransformer(columnsToBias_keep.union(columnsToDrop), _colBiasDrop=None,
                                          _dropOriginal=False, typePercentage='totalTarget')),
         ('bias2',
         PercentageAllTransformer(columnsToBias_keep, _colBiasDrop=columnsToDrop, _dropOriginal=False,
                                  typePercentage='groupwise')),
        ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
        ('estimator', clf)
    ])

WARN: mutually exclusive options: either _colBiasDrop is None and then _dropOriginal can be True or _colBiasDrop is set and _dropOriginal must be false
WARN: mutually exclusive options: either _colBiasDrop is None and then _dropOriginal can be True or _colBiasDrop is set and _dropOriginal must be false


## Error
As you can see below, this results in columns which contain only a few distinct values, as they are binary encoded.


In [5]:
# A single stratified 80/20 split, seeded for repeatability.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=_seed)
# n_splits is 1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Fit the full pipeline on the training part and predict the held-out part.
prediction_pipe.fit(X_train, y_train)
y_predicted = prediction_pipe.predict(X_test)
print('found x number of positive class labels ', y_predicted.sum())

[LightGBM] [Info] Loading parameters .. finished
[LightGBM] [Error] Feature Column_300 only contains one value, will be ignored
[LightGBM] [Error] Feature Column_301 only contains one value, will be ignored
[LightGBM] [Error] Feature Column_302 only contains one value, will be ignored
[LightGBM] [Error] Feature Column_303 only contains one value, will be ignored
[LightGBM] [Error] Feature Column_304 only contains one value, will be ignored
[LightGBM] [Error] Feature Column_305 only contains one value, will be ignored
[LightGBM] [Error] Feature Column_306 only contains one value, will be ignored
[LightGBM] [Info] Finish loading data, use 0.569019 seconds
[LightGBM] [Info] Number of postive:40188,  number of negative:39812
[LightGBM] [Info] Number of data:80000, Number of features:300
[LightGBM] [Info] Finish training initilization.
[LightGBM] [Info] Start train ...
[LightGBM] [Info] cannot find more split with gain = -inf , current #leaves=199
[LightGBM] [Info] 0.065842 seconds elapsed,

# A deeper look at the columns
Why are features ignored?
 
> [LightGBM] [Error] Feature Column_300 only contains one value, will be ignored
> [LightGBM] [Error] Feature Column_301 only contains one value, will be ignored
> [LightGBM] [Error] Feature Column_302 only contains one value, will be ignored
> [LightGBM] [Error] Feature Column_303 only contains one value, will be ignored
> [LightGBM] [Error] Feature Column_304 only contains one value, will be ignored
> [LightGBM] [Error] Feature Column_305 only contains one value, will be ignored
> [LightGBM] [Error] Feature Column_306 only contains one value, will be ignored

## Look at the data

In [6]:
X.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'E_smaller50_only01'], dtype='object')

In [7]:
X_original.dtypes

A                     category
B                     category
C                     category
D                     category
E                        int64
F                        int64
G                        int64
H                        int64
I                        int64
J                        int64
E_smaller50_only01     float64
dtype: object

## Bias coding
As some factors have a huge number of levels (> 100), I use percentage-based bias coding to reduce the number of levels.

In [8]:
FACTOR_FIELDS

Index(['A', 'B', 'C', 'D'], dtype='object')

In [9]:
columnsToBias_keep

Index(['A', 'B', 'D'], dtype='object')

In [10]:
columnsToDrop

['C']

In [11]:
# The two percentage-coding steps of the prediction pipeline on their own,
# so that their output can be inspected.
codingPipe = Pipeline([
    (
        'bias1',
        PercentageAllTransformer(
            columnsToBias_keep.union(columnsToDrop),
            _colBiasDrop=None,
            typePercentage='totalTarget',
            _dropOriginal=False,
        ),
    ),
    (
        'bias2',
        PercentageAllTransformer(
            columnsToBias_keep,
            _colBiasDrop=columnsToDrop,
            typePercentage='groupwise',
            _dropOriginal=False,
        ),
    ),
])
# Fit on the complete data, then replace X by its coded version.
X = codingPipe.fit(X, y).transform(X)

WARN: mutually exclusive options: either _colBiasDrop is None and then _dropOriginal can be True or _colBiasDrop is set and _dropOriginal must be false
WARN: mutually exclusive options: either _colBiasDrop is None and then _dropOriginal can be True or _colBiasDrop is set and _dropOriginal must be false


As you can see, some additional columns are generated:
- one per biasCol for type 1 (`pre_*`)
- one per biasCol for type 2 (`pre2_*`)
- as the biasCols have a maximum of X levels << 1k, we additionally dummy-code these variables
- but there are variables like `C` which have > 1k levels. These should be retained, as `pre_C` shows, but the initial `C` column is dropped

In [12]:
X.head()

Unnamed: 0,A,B,D,E,F,G,H,I,J,E_smaller50_only01,pre_A,pre_B,pre_C,pre_D,pre2_A,pre2_B,pre2_D
0,86,86,28,1,9,48,19,54,48,1.0,0.010093,0.009894,0.010212,0.010212,0.491756,0.501514,0.504425
1,28,28,45,30,48,35,82,71,35,1.0,0.009436,0.010749,0.010272,0.010909,0.486653,0.515759,0.536729
2,17,85,12,5,12,27,65,39,8,1.0,0.010093,0.009356,0.011008,0.010013,0.511604,0.5,0.521244
3,26,33,22,44,55,18,48,60,49,1.0,0.010232,0.009913,0.009834,0.009814,0.501463,0.503539,0.518947
4,25,87,10,61,53,33,80,47,58,0.0,0.010192,0.009615,0.009038,0.009794,0.51927,0.479167,0.502041


Now dummy coding should be performed on the remaining categorical variables A, B and D.

In [13]:
# One-hot encode the two label-encoded factor columns A and B in isolation.
ab_columns = X[['A', 'B']]
onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehotres = onehot.fit(ab_columns).transform(ab_columns)
print(onehotres)
# Wrap the dense array in a frame to count the values of the first dummy column.
onehotres = pd.DataFrame(onehotres)
onehotres[0].value_counts()

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


0.0    99003
1.0      997
Name: 0, dtype: int64

which results in only 0s and 1s being present in the column.