# percentage coding speedup
A nicer example for http://stackoverflow.com/questions/43771213/

In [1]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

## create a minimal sample dataset

In [2]:
# Five-row toy frame: an id, three string-valued factors, a binary target
# and an integer age bucket.
raw_data = dict(
    subject_id=['1', '2', '3', '4', '5'],
    name=['A', 'B', 'C', 'D', 'E'],
    nationality=['DE', 'AUT', 'US', 'US', 'US'],
    alotdifferent=['x', 'y', 'z', 'x', 'a'],
    target=[0, 0, 0, 1, 1],
    age_group=[1, 2, 1, 3, 1],
)
df_a = pd.DataFrame(
    raw_data,
    columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'],
)
# The three factor columns are stored as pandas categoricals; the later cells
# pick the columns to encode by that dtype.
df_a = df_a.astype({
    'nationality': 'category',
    'alotdifferent': 'category',
    'name': 'category',
})
df_a

Unnamed: 0,subject_id,name,nationality,alotdifferent,target,age_group
0,1,A,DE,x,0,1
1,2,B,AUT,y,0,2
2,3,C,US,z,0,1
3,4,D,US,x,1,3
4,5,E,US,a,1,1


## define transformer

In [3]:
class PercentageTransformer(TransformerMixin):
    """Replace one factor column by a target-derived percentage.

    During ``fit`` the rows are counted per (level, target value) pair and the
    counts for ``target == 1`` are turned into a per-level number:

    * ``typePercentage == 'totalTarget'``: count of rows with this level and
      ``target == 1`` divided by the sum of the target column. The result is
      stored in a column named ``"pre_" + colname``.
    * any other value: count of rows with this level and ``target == 1``
      divided by the count of all rows with this level. The result is stored
      in a column named ``"pre2_" + colname``.

    During ``transform`` that number is attached to every row by a left join
    on the factor column; levels without a fitted value get ``0``.

    Parameters
    ----------
    colname : str
        Name of the factor column to encode.
    typePercentage : str
        ``'totalTarget'`` or anything else (see above).
    _target : str
        Name under which the target appears once ``y`` is concatenated to
        ``X``; ``y.name`` therefore has to equal this value.
    _dropOriginal : bool
        Whether ``transform`` removes ``colname`` from its output.

    Attributes set by ``fit``
    -------------------------
    nameCol : str
        Name of the generated column.
    result : pandas.DataFrame
        Lookup table with the columns ``colname`` and ``nameCol``.
    """

    def __init__(self, colname, typePercentage='totalTarget', _target='target', _dropOriginal=True):
        self.colname = colname
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage

    def fit(self, X, y, **kwargs):
        """Build the level -> percentage lookup table from ``X`` and ``y``.

        ``y`` is joined to ``X`` column-wise (aligned on the index), so both
        must share the same index and ``y.name`` must equal ``_target``.
        Returns ``self``.
        """
        original = pd.concat([y, X], axis=1)
        # observed=False pins the behavior this code was written against
        # (every category level appears, unobserved ones with a count of 0)
        # instead of relying on a pandas default that is being changed.
        counts = original.groupby([self.colname, self._target], observed=False).size()
        if self.typePercentage == 'totalTarget':
            shares = counts / original[self._target].sum()
            nameCol = "pre_" + self.colname
        else:
            shares = counts / counts.groupby(level=0, observed=False).sum()
            nameCol = "pre2_" + self.colname
        self.nameCol = nameCol

        table = shares.reset_index(name=nameCol)
        positives = table[table[self._target] == 1]
        # Keyword form: passing the axis positionally (``drop(label, 1)``) is
        # rejected by pandas >= 2.0.
        self.result = positives.drop(columns=self._target)
        return self

    def transform(self, dataF):
        """Return ``dataF`` with the fitted percentage column appended.

        The row order and the index of ``dataF`` are kept. Levels that have
        no entry in the lookup table receive ``0``. The factor column itself
        is removed when ``_dropOriginal`` is true.
        """
        merged = pd.merge(dataF, self.result, on=self.colname, how='left')
        # ``merge`` hands back a fresh 0..n-1 index. The lookup table holds
        # each level at most once, so the left join keeps the rows of
        # ``dataF`` one-to-one and in order; restoring the index keeps the
        # output alignable with the original ``y`` (a following ``fit`` joins
        # on the index).
        merged.index = dataF.index
        merged[self.nameCol] = merged[self.nameCol].fillna(0)
        if self._dropOriginal:
            merged = merged.drop(columns=self.colname)
        return merged


class PercentageAllTransformer(TransformerMixin):
    """Apply ``PercentageTransformer`` to several factor columns at once.

    Parameters
    ----------
    columnsToTransform : iterable of str
        Factor columns to encode (a list or a pandas ``Index`` both work).
    _colBiasDrop : labels or None
        Extra columns removed from the output of ``transform``; they are not
        encoded by this instance.
    typePercentage : str
        Forwarded to every ``PercentageTransformer``.
    _target : str
        Forwarded to every ``PercentageTransformer``.
    _dropOriginal : bool
        Whether ``transform`` removes the encoded factor columns themselves.

    ``_dropOriginal`` and ``_colBiasDrop`` are applied independently of each
    other in ``transform``.

    Attributes
    ----------
    transformers : dict
        Maps each column of ``columnsToTransform`` to its fitted
        ``PercentageTransformer``; rebuilt on every ``fit``.
    """

    def __init__(self, columnsToTransform, _colBiasDrop=None, typePercentage='totalTarget', _target='target',
                 _dropOriginal=True):
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage
        self.transformers = {}
        self.colsToBias = columnsToTransform
        self._colBiasDrop = _colBiasDrop

    def fit(self, X, y, **kwargs):
        """Fit one ``PercentageTransformer`` per column to encode.

        Only the columns in ``columnsToTransform`` are fitted: ``transform``
        never reads an encoder for a ``_colBiasDrop`` column, so fitting one
        would be wasted work. Returns ``self``.
        """
        self.transformers = {}  # start clean so a refit leaves no stale encoders
        for col in self.colsToBias:
            # _dropOriginal=False on purpose: the originals are dropped once,
            # in transform(), after all columns have been encoded.
            encoder = PercentageTransformer(col, typePercentage=self.typePercentage, _target=self._target,
                                            _dropOriginal=False)
            # The encoder only reads its own column and the target, so hand
            # it just that column instead of the whole frame.
            self.transformers[col] = encoder.fit(X[[col]], y)
        return self

    def transform(self, X):
        """Return ``X`` with one percentage column appended per encoded column.

        The new columns follow the original ones in the order of
        ``columnsToTransform``. The index of ``X`` is kept. Afterwards the
        encoded originals (if ``_dropOriginal``) and the ``_colBiasDrop``
        columns (if set) are removed.
        """
        pieces = [X]
        for col in self.colsToBias:
            encoder = self.transformers[col]
            # Encode a one-column frame rather than re-merging all of X for
            # every column; only the generated column is needed here.
            encoded = encoder.transform(X[[col]])[encoder.nameCol]
            # The encoder keeps row order, so the values can be re-attached
            # to X's own index positionally.
            pieces.append(pd.Series(encoded.values, index=X.index, name=encoder.nameCol))
        # With nothing to encode this is simply a copy of X.
        transformed = pd.concat(pieces, axis=1)
        if self._dropOriginal:
            transformed = transformed.drop(columns=self.colsToBias)
        if self._colBiasDrop is not None:
            transformed = transformed.drop(columns=self._colBiasDrop)
        return transformed

In [4]:
# All categorical columns of the sample frame are candidates for encoding.
FACTOR_FIELDS = df_a.select_dtypes(include=['category']).columns
# Columns that should not survive into the final output.
columnsToDrop = ['alotdifferent']
# Categorical columns that are encoded by both stages.
columnsToBias_keep = FACTOR_FIELDS[~FACTOR_FIELDS.isin(columnsToDrop)]
print(columnsToBias_keep)

# Stage 'contrast1' adds the 'totalTarget' columns for every factor
# (including the ones dropped later) and keeps the originals, so stage
# 'contrast2' can still read them. Stage 'contrast2' adds the 'groupwise'
# columns for the kept factors only, then removes the originals and the
# columns listed in columnsToDrop.
factors = Pipeline(steps=[
    (
        'contrast1',
        PercentageAllTransformer(
            columnsToBias_keep.union(columnsToDrop),
            _colBiasDrop=None,
            typePercentage='totalTarget',
            _dropOriginal=False,
        ),
    ),
    (
        'contrast2',
        PercentageAllTransformer(
            columnsToBias_keep,
            _colBiasDrop=columnsToDrop,
            typePercentage='groupwise',
            _dropOriginal=True,
        ),
    ),
])

Index(['name', 'nationality'], dtype='object')


## view result

In [7]:
# Fit both stages on the features (everything except the target column) and
# display the encoded frame.
factors.fit_transform(df_a.drop(columns=['target']), df_a['target'])

Unnamed: 0,subject_id,age_group,pre_alotdifferent,pre_name,pre_nationality,pre2_name,pre2_nationality
0,1,1,0.5,0.0,0.0,0.0,0.0
1,2,2,0.0,0.0,0.0,0.0,0.0
2,3,1,0.0,0.0,1.0,0.0,0.666667
3,4,3,0.5,0.5,1.0,1.0,0.666667
4,5,1,0.5,0.5,1.0,1.0,0.666667


## open questions
- how to speed up?
- how to parallelize columns? (or is there a better way to speed up?)