This notebook is a playground for me to test my custom transformers. Nothing important here.

In [1]:
import sys
sys.path.append('../scripts')

import numpy as np
import pandas as pd
import helpers_models as hm
import transforms
from sklearn.pipeline import Pipeline

In [2]:
# Draw a fixed-seed sample of 200 rows so that the outputs below are
# reproducible across kernel restarts (an unseeded sample picks different
# users on every run).
data = hm.load_pivot_numbers().sample(200, random_state=42)

In [3]:
# The largest label among the 'tweets' columns is handed to make_target.
# NOTE(review): `data` is rebound here, so re-running this cell feeds the
# already-processed frame back into make_target.
last_week = data['tweets'].columns.max()
data = hm.make_target(data, last_week)

Test the class balancer. Try fitting it on the data and then transforming the data and the target.

In [4]:
# Create the custom class balancer with its default arguments.
balancer = transforms.ClassBalancer()

In [5]:
# Fit on the feature columns only: the top-level 'target' column group is
# dropped from X and passed separately as y.
balancer.fit(
    data.drop(columns='target', level=0),
    data['target'],
)

ClassBalancer()

In [6]:
# Run the features (everything except the 'target' column group) through the
# fitted balancer and show the resulting shape.
data_transformed = balancer.transform(
    data.drop(columns='target', level=0)
)
data_transformed.shape

(48, 52)

In [7]:
# Run the target column through the same fitted balancer and show its shape,
# so it can be compared with the shape of the transformed features.
target_transformed = balancer.transform(data['target'])
target_transformed.shape

(48,)

In [8]:
# Check that features and target kept the same rows in the same order.
# Index.equals returns False on a length mismatch, whereas an elementwise
# `==` between two indexes of different length raises ValueError.
data_transformed.index.equals(target_transformed.index)

True

Good, class balancer works. Now let's test a simple transform pipeline that transforms the data and the target.

In [9]:
# Two-step pipeline: normalize first, then balance the classes.
transform_pipe = Pipeline(steps=[
    ('normalize', transforms.Normalizer()),
    ('balance', transforms.ClassBalancer()),
])
transform_pipe

Pipeline(steps=[('normalize', Normalizer()), ('balance', ClassBalancer())])

In [10]:
# Fit every pipeline step on the features, with the target supplied as y.
transform_pipe.fit(
    data.drop(columns='target', level=0),
    data['target'],
)

Pipeline(steps=[('normalize', Normalizer()), ('balance', ClassBalancer())])

In [11]:
# Transform the features (without the 'target' column group) with the fitted
# pipeline and show the resulting shape.
data_transformed = transform_pipe.transform(
    data.drop(columns='target', level=0)
)
data_transformed.shape

(48, 52)

In [12]:
# Preview the first rows of the pipeline-transformed features.
data_transformed.head()

Unnamed: 0_level_0,tweets,tweets,tweets,tweets,tweets,tweets,tweets,tweets,tweets,tweets,...,urls,urls,urls,urls,urls,urls,urls,urls,urls,urls
week,23,24,25,26,27,28,29,30,31,32,...,26,27,28,29,30,31,32,33,34,35
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
alymae,0.0,0.014706,0.037736,0.034247,0.035714,0.018519,0.015267,0.008065,0.020979,0.026027,...,0.02381,0.020833,0.027027,0.01,0.0,0.005376,0.029915,0.013761,0.015625,0.015504
adfm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015625,0.023256
doug_dvojack,0.0,0.0,0.0,0.0,0.0,0.0,0.022901,0.016129,0.01049,0.017808,...,0.0,0.0,0.0,0.02,0.015873,0.018817,0.008547,0.009174,0.0,0.031008
nonemaker,0.0,0.0,0.0,0.0,0.0,0.0,0.007634,0.0,0.008159,0.005479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
whateverjones,0.0,0.058824,0.081761,0.089041,0.028571,0.064815,0.076336,0.108871,0.061772,0.09726,...,0.047619,0.010417,0.0,0.06,0.079365,0.043011,0.038462,0.03211,0.072917,0.054264


In [13]:
# Push the target through the full pipeline. The 'normalize' step is still
# active at this point, so it is applied to the target as well.
target_transformed = transform_pipe.transform(data['target'])
target_transformed.shape

(48,)

In [14]:
# Inspect the values of the transformed target.
target_transformed.head()

user
alymae           0.041667
adfm             0.041667
doug_dvojack     0.041667
nonemaker        0.041667
whateverjones    0.041667
Name: target, dtype: float64

The pipeline also applies the normalizer to the boolean target (the `True` values above came out as 0.041667), so we want to skip the normalize step when transforming the target.

In [15]:
# Disable the 'normalize' step by replacing it with None.
# NOTE: set_params changes transform_pipe itself, so every later call to
# transform_pipe.transform (features included) skips normalization too.
transform_pipe.set_params(normalize=None)

Pipeline(steps=[('normalize', None), ('balance', ClassBalancer())])

In [16]:
# Transform the target again, now that the 'normalize' step is set to None.
target_transformed = transform_pipe.transform(data['target'])
target_transformed.shape

(48,)

In [17]:
# Inspect the values of the target after transforming without normalization.
target_transformed.head()

user
alymae           True
adfm             True
doug_dvojack     True
nonemaker        True
whateverjones    True
Name: target, dtype: bool

Nice!

In [18]:
# Check that the pipeline-transformed features and target share the same
# rows in the same order. Index.equals returns False on a length mismatch,
# whereas an elementwise `==` between indexes of different length raises
# ValueError.
data_transformed.index.equals(target_transformed.index)

True

Finally, let's test the weeks limiter transformer. We will change its parameters and see if it performs as expected.

In [19]:
# Limit the frame to a window of weeks, then apply the time decay; both steps
# are configured with target week 28.
pipe_weeks = Pipeline(steps=[
    ('weeks', transforms.WeeksLimiter(first_week=25, target_week=28)),
    ('decay', transforms.TimeDecayApplier(target_week=28)),
])

In [21]:
# Fit and apply the weeks pipeline on the full frame (the 'target' column is
# still part of `data` here) and preview the first rows.
pipe_weeks.fit_transform(data).head()

Unnamed: 0_level_0,tweets,tweets,tweets,hashtags,hashtags,hashtags,mentions,mentions,mentions,urls,urls,urls,target
week,25,26,27,25,26,27,25,26,27,25,26,27,Unnamed: 13_level_1
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
_sabsk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
mrsfalaise25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
tiim_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
junidi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
ivana22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


Now change `first_week` from 25 to 23 and check that weeks 23 and 24 show up in the output.

In [25]:
# `<step>__<param>` addresses a parameter of a named pipeline step: this sets
# first_week on the 'weeks' step to 23. set_params changes pipe_weeks itself.
pipe_weeks.set_params(weeks__first_week=23)

Pipeline(steps=[('weeks', WeeksLimiter(first_week=23, target_week=28)), ('decay', TimeDecayApplier(target_week=28))])

In [26]:
# Re-run the pipeline with the updated first_week and preview the first rows.
pipe_weeks.fit_transform(data).head()

Unnamed: 0_level_0,tweets,tweets,tweets,tweets,tweets,hashtags,hashtags,hashtags,hashtags,hashtags,...,mentions,mentions,mentions,mentions,urls,urls,urls,urls,urls,target
week,23,24,25,26,27,23,24,25,26,27,...,24,25,26,27,23,24,25,26,27,Unnamed: 21_level_1
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
_sabsk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
mrsfalaise25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
tiim_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
junidi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
ivana22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


It works!