# percentage coding speedup
A nicer example for the question at http://stackoverflow.com/questions/43771213/

In [2]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from dask import dataframe as dd

## create a minimal sample dataset

In [None]:
# original one: every subject has its own `name` level
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'name': ['A', 'B', 'C', 'D', 'E'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a'],
    'target': [0, 0, 0, 1, 1],
    'age_group': [1, 2, 1, 3, 1],
}
# Build the frame with an explicit column order, then cast the three
# factor columns to the categorical dtype in one go.
df_a = pd.DataFrame(
    raw_data,
    columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'],
).astype({
    'nationality': 'category',
    'alotdifferent': 'category',
    'name': 'category',
})

df_a

In [41]:
# to illustrate the question: `name` now repeats (A, A, C, A, C)
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'name': ['A', 'A', 'C', 'A', 'C'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a'],
    'target': [0, 0, 0, 1, 1],
    'age_group': [1, 2, 1, 3, 1],
}
# Build the frame with an explicit column order, then cast the three
# factor columns to the categorical dtype in one go.
df_a = pd.DataFrame(
    raw_data,
    columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'],
).astype({
    'nationality': 'category',
    'alotdifferent': 'category',
    'name': 'category',
})

df_a

Unnamed: 0,subject_id,name,nationality,alotdifferent,target,age_group
0,1,A,DE,x,0,1
1,2,A,AUT,y,0,2
2,3,C,US,z,0,1
3,4,A,US,x,1,3
4,5,C,US,a,1,1


## define transformer

### TODO - refactoring for dask - maybe? if it turns out to be more efficient
1. define simple dask transformer replacement / drop-in
2. create full pipeline in dask
2.1 calculate *percentage1* and *percentage2* in a single pass
3. automatically determine parallelism level

In [4]:
class DaskPercentageTransformer(TransformerMixin):
    """Dask-backed percentage encoder for a single column.

    ``fit`` counts the rows per ``(colname, target)`` pair with a dask groupby,
    turns the counts into a rate and keeps only the rows where the target
    equals 1. ``transform`` left-joins that rate onto the data as a new column.

    Parameters
    ----------
    colname : str
        Column to encode.
    typePercentage : str, default 'totalTarget'
        ``'totalTarget'`` divides each count by the sum of the target column
        and names the new column ``pre_<colname>``. Any other value divides
        each count by the number of rows sharing the same ``colname`` value
        and names the new column ``pre2_<colname>``.
    _target : str, default 'target'
        Name of the target column; the ``y`` passed to ``fit`` must carry it.
    _dropOriginal : bool, default True
        Whether ``transform`` drops ``colname`` from its result.
    parallelism : int, default 8
        Number of dask partitions used for the aggregation in ``fit``.
    """

    def __init__(self, colname, typePercentage='totalTarget', _target='target', _dropOriginal=True, parallelism=8):
        self.colname = colname
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage
        self.parallelism = parallelism

    def fit(self, X, y, **kwargs):
        """Learn the rate table for ``colname``.

        Stores the new column name in ``self.nameCol`` and the table
        (columns ``colname`` and ``nameCol``, rows with target == 1 only) as a
        pandas DataFrame in ``self.result``. Returns ``self``.
        """
        original = dd.from_pandas(pd.concat([y, X], axis=1), npartitions=self.parallelism)
        # Only the counting runs in dask. The count table has one row per
        # (category, target value) pair, so it is materialised immediately and
        # the remaining steps use pandas; the dask reset_index used before
        # raised "unexpected keyword argument 'name'".
        counts = original.groupby([self.colname, self._target]).size().compute()
        if self.typePercentage == 'totalTarget':
            nameCol = "pre_" + self.colname
            rates = counts / original[self._target].sum().compute()
        else:
            nameCol = "pre2_" + self.colname
            rates = counts / counts.groupby(level=0).sum()
        self.nameCol = nameCol

        table = rates.reset_index(name=nameCol)
        positives = table[table[self._target] == 1]
        self.result = positives.drop(self._target, axis=1)
        return self

    def transform(self, dataF):
        """Left-join the fitted rates onto ``dataF`` and return a pandas DataFrame.

        Values of ``colname`` without a fitted rate get 0. ``colname`` itself
        is dropped when ``_dropOriginal`` is true.
        """
        mergedThing = pd.merge(dataF, self.result, on=self.colname, how='left')
        # The join is already a pandas frame, so the NaN fill is applied to it
        # directly instead of wrapping it in dask and computing it back.
        mergedThing[self.nameCol] = mergedThing[self.nameCol].fillna(0)
        if self._dropOriginal:
            mergedThing = mergedThing.drop(self.colname, axis=1)
        return mergedThing

In [8]:
class PercentageTransformer(TransformerMixin):
    """Percentage encoder for a single column, implemented in pandas.

    ``fit`` counts the rows per ``(colname, target)`` pair, turns the counts
    into a rate and keeps only the rows where the target equals 1.
    ``transform`` left-joins that rate onto the data as a new column.

    Parameters
    ----------
    colname : str
        Column to encode.
    typePercentage : str, default 'totalTarget'
        ``'totalTarget'`` divides each count by the sum of the target column
        and names the new column ``pre_<colname>``. Any other value divides
        each count by the number of rows sharing the same ``colname`` value
        and names the new column ``pre2_<colname>``.
    _target : str, default 'target'
        Name of the target column; the ``y`` passed to ``fit`` must carry it.
    _dropOriginal : bool, default True
        Whether ``transform`` drops ``colname`` from its result.
    """

    def __init__(self, colname, typePercentage='totalTarget', _target='target', _dropOriginal=True):
        self.colname = colname
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage

    def fit(self, X, y, **kwargs):
        """Learn the rate table for ``colname``.

        Stores the new column name in ``self.nameCol`` and the table
        (columns ``colname`` and ``nameCol``, rows with target == 1 only) in
        ``self.result``. Returns ``self``.
        """
        original = pd.concat([y, X], axis=1)
        grouped = original.groupby([self.colname, self._target]).size()
        # Column name and denominator are both decided by typePercentage.
        if self.typePercentage == 'totalTarget':
            nameCol = "pre_" + self.colname
            rates = grouped / original[self._target].sum()
        else:
            nameCol = "pre2_" + self.colname
            rates = grouped / grouped.groupby(level=0).sum()
        self.nameCol = nameCol

        table = rates.reset_index(name=nameCol)
        positives = table[table[self._target] == 1]
        # axis is passed by keyword: pandas 2.x no longer accepts it positionally.
        self.result = positives.drop(self._target, axis=1)
        return self

    def transform(self, dataF):
        """Left-join the fitted rates onto ``dataF``.

        Values of ``colname`` without a fitted rate get 0. ``colname`` itself
        is dropped when ``_dropOriginal`` is true. The result comes from
        ``pd.merge`` on a column, so it does not keep the index of ``dataF``.
        """
        mergedThing = pd.merge(dataF, self.result, on=self.colname, how='left')
        mergedThing[self.nameCol] = mergedThing[self.nameCol].fillna(0)
        if self._dropOriginal:
            mergedThing = mergedThing.drop(self.colname, axis=1)
        return mergedThing


class PercentageAllTransformer(TransformerMixin):
    """Apply a per-column percentage transformer to several columns at once.

    Parameters
    ----------
    columnsToTransform : pandas.Index or list of str
        Columns that receive an encoded ``pre_<col>`` / ``pre2_<col>`` column.
    _colBiasDrop : list of str or None, default None
        Extra columns that are fitted as well and removed from the output of
        ``transform``.
    typePercentage : str, default 'totalTarget'
        Forwarded to the per-column transformers; ``'totalTarget'`` yields
        ``pre_<col>`` columns, any other value ``pre2_<col>`` columns.
    _target : str, default 'target'
        Name of the target column, forwarded to the per-column transformers.
    _dropOriginal : bool, default True
        Drop ``columnsToTransform`` from the output of ``transform``.
    variant : {'pd', 'dask'}, default 'pd'
        Which per-column implementation to use.

    Notes
    -----
    The options are meant to be mutually exclusive: either ``_colBiasDrop`` is
    None and ``_dropOriginal`` may be True, or ``_colBiasDrop`` is set and
    ``_dropOriginal`` must be False.
    """

    def __init__(self, columnsToTransform, _colBiasDrop=None, typePercentage='totalTarget', _target='target',
                 _dropOriginal=True, variant='pd'):
        self._target = _target
        self._dropOriginal = _dropOriginal
        self.typePercentage = typePercentage
        self.transformers = {}
        self.colsToBias = columnsToTransform
        self._colBiasDrop = _colBiasDrop
        self.variant = variant

    def fit(self, X, y, **kwargs):
        """Fit one per-column transformer for every column to encode or drop.

        Raises
        ------
        ValueError
            If ``variant`` is neither ``'pd'`` nor ``'dask'``.
        """
        if self.variant == 'pd':
            transformerClass = PercentageTransformer
        elif self.variant == 'dask':
            transformerClass = DaskPercentageTransformer
        else:
            # Previously an unknown variant left the transformer variable unbound.
            raise ValueError("unknown variant %r, expected 'pd' or 'dask'" % (self.variant,))

        # pd.Index lets plain lists be passed for columnsToTransform as well.
        colToIterate = pd.Index(self.colsToBias)
        if self._colBiasDrop is not None:
            # NOTE(review): transform() never uses the transformers fitted for
            # the _colBiasDrop columns; they are still fitted to keep
            # self.transformers unchanged.
            colToIterate = colToIterate.union(self._colBiasDrop)

        # Start from a clean mapping so a refit cannot keep stale entries.
        self.transformers = {}
        for col in colToIterate:
            # _dropOriginal is deliberately False here; the originals are
            # dropped once, after all columns are encoded, in transform().
            myTransf = transformerClass(col, typePercentage=self.typePercentage, _target=self._target,
                                        _dropOriginal=False)
            self.transformers[col] = myTransf.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` with one encoded column appended per transformed column.

        The first column's transformer contributes its full output (all of
        ``X`` plus the new column); every further transformer contributes only
        its new column. Originals and ``_colBiasDrop`` columns are dropped
        afterwards according to the constructor flags.
        """
        pieces = []
        for col in self.colsToBias:
            encoded = self.transformers[col].transform(X)
            if pieces:
                # X is already contained in the first piece.
                encoded = encoded[[self.transformers[col].nameCol]]
            pieces.append(encoded)

        if not pieces:
            # Nothing to encode: pass the data through, the drops below still apply.
            transformed = X.copy()
        elif len(pieces) == 1:
            transformed = pieces[0]
        else:
            # One concat instead of one per column.
            transformed = pd.concat(pieces, axis=1)

        if self._dropOriginal:
            transformed = transformed.drop(self.colsToBias, axis=1)
        if self._colBiasDrop is not None:
            transformed = transformed.drop(self._colBiasDrop, axis=1)
        return transformed

In [39]:
FACTOR_FIELDS = df_a.select_dtypes(include=['category']).columns
columnsToDrop = ['alotdifferent']
columnsToBias_keep = FACTOR_FIELDS[~FACTOR_FIELDS.isin(columnsToDrop)]
print(columnsToBias_keep)


def build_factor_pipeline(variant='pd'):
    """Build the two-step percentage-encoding pipeline for one backend.

    Step ``contrast1`` adds ``pre_<col>`` columns ('totalTarget') for all
    factor columns. Step ``contrast2`` adds ``pre2_<col>`` columns
    ('groupwise') for the kept factor columns and removes ``columnsToDrop``.

    ``variant`` is forwarded to ``PercentageAllTransformer`` ('pd' or 'dask').
    """
    return Pipeline([
        ('contrast1', PercentageAllTransformer(columnsToBias_keep.union(columnsToDrop), _colBiasDrop=None,
                                               _dropOriginal=False, typePercentage='totalTarget',
                                               variant=variant)),
        # _dropOriginal stays False whenever _colBiasDrop is set. The dask
        # pipeline used True here, so it differed from the pandas one.
        ('contrast2', PercentageAllTransformer(columnsToBias_keep, _colBiasDrop=columnsToDrop,
                                               _dropOriginal=False, typePercentage='groupwise',
                                               variant=variant)),
    ])


factors = build_factor_pipeline('pd')
factorsDask = build_factor_pipeline('dask')

Index(['name', 'nationality'], dtype='object')


## view result

In [42]:
# Time one fit + transform of the pandas pipeline; `target` is removed from X and passed as y.
%time factors.fit_transform(df_a.drop(['target'], axis=1), df_a['target'])

CPU times: user 49.9 ms, sys: 1.93 ms, total: 51.8 ms
Wall time: 50.4 ms


Unnamed: 0,subject_id,name,nationality,age_group,pre_alotdifferent,pre_name,pre_nationality,pre2_name,pre2_nationality
0,1,A,DE,1,0.5,0.5,0.0,0.333333,0.0
1,2,A,AUT,2,0.0,0.5,0.0,0.333333,0.0
2,3,C,US,1,0.0,0.5,1.0,0.5,0.666667
3,4,A,US,3,0.5,0.5,1.0,0.333333,0.666667
4,5,C,US,1,0.5,0.5,1.0,0.5,0.666667


In [12]:
# does not yet work, as outlined in the Stack Overflow question
#%time factorsDask.fit_transform(df_a.drop(['target'], axis=1), df_a['target'])

## open questions
- how to speed up?
- how to parallelize columns? (or is there a better way to speed up?)

## dask playground

In [31]:
# Playground: the 'totalTarget' rate for `name`, step by step in plain pandas.
original = df_a
colname, target = 'name', 'target'
nameCol = "pre_" + colname

# rows per (name, target) pair, divided by the sum of the target column
grouped = original.groupby([colname, target]).size()
df = grouped / original[target].sum()
result = df.reset_index(name=nameCol)
print(result)

# `result` is not filtered by target here, so it brings its own `target`
# column into the join.
mergedThing = pd.merge(df_a, result, on=colname, how='left')
mergedThing[nameCol] = mergedThing[nameCol].fillna(0)
mergedThing

  name  target  pre_name
0    A       0       0.5
1    B       0       0.5
2    C       0       0.5
3    D       1       0.5
4    E       1       0.5


Unnamed: 0,subject_id,name,nationality,alotdifferent,target_x,age_group,target_y,pre_name
0,1,A,DE,x,0,1,0,0.5
1,2,B,AUT,y,0,2,0,0.5
2,3,C,US,z,0,1,0,0.5
3,4,D,US,x,1,3,1,0.5
4,5,E,US,a,1,1,1,0.5


In [34]:
# Playground: the same 'totalTarget' computation, now for `nationality`.
original = df_a
colname, target = 'nationality', 'target'
nameCol = "pre_" + colname

# rows per (nationality, target) pair, divided by the sum of the target column
grouped = original.groupby([colname, target]).size()
df = grouped / original[target].sum()
result = df.reset_index(name=nameCol)
print(result)

print(df_a.shape)
# `result` holds one row per (nationality, target) pair, so a key that occurs
# with several target values matches several rows in the join below.
# The raw join is displayed; storing it and zero-filling NaNs stays disabled:
#   mergedThing.loc[(mergedThing[nameCol].isnull()), nameCol] = 0
pd.merge(df_a, result, on=colname, how='left')

  nationality  target  pre_nationality
0         AUT       0              0.5
1          DE       0              0.5
2          US       0              0.5
3          US       1              1.0
(5, 6)


Unnamed: 0,subject_id,name,nationality,alotdifferent,target_x,age_group,target_y,pre_nationality
0,1,A,DE,x,0,1,0,0.5
1,2,B,AUT,y,0,2,0,0.5
2,3,C,US,z,0,1,0,0.5
3,3,C,US,z,0,1,1,1.0
4,4,D,US,x,1,3,0,0.5
5,4,D,US,x,1,3,1,1.0
6,5,E,US,a,1,1,0,0.5
7,5,E,US,a,1,1,1,1.0


In [19]:
# Same ratio as in the pandas playground, built on a dask frame with 8 partitions.
nameCol = "pre_" + 'name'
original = dd.from_pandas(df_a, npartitions=8)
grouped = original.groupby(['name', 'target']).size()
df = grouped / original['target'].sum()

In [None]:
https://jcrist.github.io/dask-sklearn-part-3.html single pass

calculate both aggregations in single pass
http://stackoverflow.com/questions/12589481/python-pandas-multiple-aggregations-of-the-same-column

In [24]:
# NOTE(review): nameCol is passed positionally, so it presumably binds to reset_index's
# first parameter instead of naming the value column; the recorded output has a plain
# 0..4 index and no name. Confirm against the dask API.
df.reset_index(nameCol).compute()

0    0.5
1    0.5
2    0.5
3    0.5
4    0.5
dtype: float64

In [20]:
# Naming the value column the pandas way; the recorded output of this cell is a
# TypeError because this reset_index does not accept a `name` keyword.
df.reset_index(name=nameCol)

TypeError: reset_index() got an unexpected keyword argument 'name'

In [13]:

# Scratch copy of the tail of DaskPercentageTransformer.fit followed by its transform body.
# NOTE(review): `self` and `dataF` are not defined at top level in any cell shown here,
# so this cell presumably cannot run as written.
grouped = df.reset_index(name=nameCol)
groupedOnly = grouped[grouped[self._target] == 1]
groupedOnly = groupedOnly.drop(self._target, 1)

self.result = groupedOnly.compute()

mergedThing = dd.from_pandas(pd.merge(dataF, self.result, on=self.colname, how='left'), self.parallelism)
mergedThing.loc[(mergedThing[self.nameCol].isnull()), self.nameCol] = 0
if self._dropOriginal:
    mergedThing = mergedThing.drop(self.colname, 1)
mergedThing.compute()

In [15]:
# NOTE(review): `myTransf` is only assigned inside PercentageAllTransformer.fit in the cells
# shown here; it was presumably created by hand in a cell that no longer exists.
# The recorded output is the reset_index(name=...) TypeError.
myTransf.fit(df_a.drop(['target'], axis=1), df_a['target'])

TypeError: reset_index() got an unexpected keyword argument 'name'

In [None]:
# Wrap the sample frame in a dask DataFrame with 8 partitions and display it.
df = dd.from_pandas(df_a, npartitions=8)
df

In [None]:
# Render the task graph of the target-column sum.
df['target'].sum().visualize()

In [None]:
# Display the pandas sample frame for comparison with the dask results.
df_a

In [None]:
# Number of rows per target value, computed through dask.
df.groupby('target').size().compute()

In [None]:
# Rows per (name, target) pair divided by the sum of the target column, computed through dask.
grouped = df.groupby(['name', 'target']).size()
(grouped / df['target'].sum()).compute()

In [None]:
original = pd.concat([y, X], axis=1)
        grouped = original.groupby([self.colname, self._target]).size()
        if self.typePercentage == 'totalTarget':
            df = grouped / original[self._target].sum()
        else:
            df = (grouped / grouped.groupby(level=0).sum())

        if self.typePercentage == 'totalTarget':
            nameCol = "pre_" + self.colname
        else:
            nameCol = "pre2_" + self.colname
        self.nameCol = nameCol
        grouped = df.reset_index(name=nameCol)
        groupedOnly = grouped[grouped[self._target] == 1]
        groupedOnly = groupedOnly.drop(self._target, 1)

        self.result = groupedOnly
        return self

    def transform(self, dataF):
        mergedThing = pd.merge(dataF, self.result, on=self.colname, how='left')
        mergedThing.loc[(mergedThing[self.nameCol].isnull()), self.nameCol] = 0
        if self._dropOriginal:
            mergedThing = mergedThing.drop(self.colname, 1)