In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the project
# modules exercised below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [44]:
import pandas as pd
import numpy as np
import importlib
from datanooblol.experimenter.experiment_generator import ExperimentGenerator
from datanooblol.experimenter.imputer import SimpleImputer, StatisticsImputer, GroupImputer
from datanooblol.experimenter.scaler import Scaler
from datanooblol.experimenter.encoder import BinaryEncoder, OneHotEncoder, LabelEncoder

In [3]:
# Smoke-test dynamic loading: resolve the calculator module from its dotted
# path and call its private `_mean` helper on a small list.
# The module path is a plain string literal; it contains no placeholders, so
# it must not carry an f-string prefix.
importlib.import_module("datanooblol.utils.calculator")._mean([1, 2, 3])

2.0

In [4]:
# Four-row toy frame for the imputers: "a" is a complete grouping column,
# "b" is numeric with two gaps, "c" is a string column with one gap.
test_imputed = pd.DataFrame(
    dict(
        a=list("aabb"),
        b=[np.nan, 1, 2, np.nan],
        c=["x", np.nan, "x", "y"],
    )
)

In [5]:
# Display the raw fixture so the missing cells are visible before any imputation.
test_imputed

Unnamed: 0,a,b,c
0,a,,x
1,a,1.0,
2,b,2.0,x
3,b,,y


In [6]:
# Constant-fill imputer: targets column "b" and uses 9 as the replacement value.
simImp = SimpleImputer(
    imputed_feature="b",
    imputed_value=9,
)

In [7]:
# Fit the constant-fill imputer on the toy frame and display the imputed result.
simImp.fit_transform(test_imputed)

Unnamed: 0,a,b,c
0,a,9.0,x
1,a,1.0,
2,b,2.0,x
3,b,9.0,y


In [8]:
# Fill the missing entries of numeric column "b" using the "mean" statistic.
statImp = StatisticsImputer(
    imputed_feature="b",
    statistics="mean",
)
statImp.fit_transform(test_imputed)

Unnamed: 0,a,b,c
0,a,1.5,x
1,a,1.0,
2,b,2.0,x
3,b,1.5,y


In [9]:
# Fill the missing entry of string column "c" using the "mode" statistic.
statImp = StatisticsImputer(
    imputed_feature="c",
    statistics="mode",
)
statImp.fit_transform(test_imputed)

Unnamed: 0,a,b,c
0,a,,x
1,a,1.0,x
2,b,2.0,x
3,b,,y


In [10]:
# Group-wise imputation: fill "b" with the "mean" statistic, grouped by "a".
grpImp = GroupImputer(
    group_features=["a"],
    imputed_feature="b",
    statistics="mean",
)
grpImp.fit_transform(test_imputed)

Unnamed: 0,a,b,c
0,a,1.0,x
1,a,1.0,
2,b,2.0,x
3,b,2.0,y


In [11]:
# Call transform on the already-fitted group imputer (no refit in this cell)
# and display the result.
grpImp.transform(test_imputed)

Unnamed: 0,a,b,c
0,a,1.0,x
1,a,1.0,
2,b,2.0,x
3,b,2.0,y


In [12]:
# Inspect the fill values held by the fitted group imputer; displayed as a
# small table keyed by the grouping column "a".
grpImp.imputed_value

Unnamed: 0,a,b
0,a,1.0
1,b,2.0


In [13]:
# Inspect which column the group imputer was configured to fill.
grpImp.imputed_feature

'b'

In [14]:
# Load the imputer module dynamically from its dotted path.
# The path is a plain string literal; it contains no placeholders, so it must
# not carry an f-string prefix.
imp = importlib.import_module("datanooblol.experimenter.imputer")

In [15]:
# Group-wise mean imputation of "b" by "a", with the class reached through the
# dynamically imported `imp` module instead of the top-level import.
imp.GroupImputer(
    group_features=["a"],
    imputed_feature="b",
    statistics="mean",
).fit_transform(test_imputed)

Unnamed: 0,a,b,c
0,a,1.0,x
1,a,1.0,
2,b,2.0,x
3,b,2.0,y


In [16]:
# Reproducible 100-row fixture for the scalers. The global NumPy seed is fixed
# first; the integer column "a" is drawn before the float column "b", so the
# draw order (and therefore the data) stays stable between runs.
np.random.seed(555)
test_scaled = pd.DataFrame(
    dict(
        a=np.random.randint(low=1, high=100, size=100),  # integers in [1, 100)
        b=np.random.rand(100),                           # floats in [0, 1)
        c=["c" for _ in range(100)],                     # constant string column
    )
)
test_scaled

Unnamed: 0,a,b,c
0,27,0.159694,c
1,47,0.146141,c
2,34,0.882788,c
3,69,0.189545,c
4,39,0.334775,c
...,...,...,...
95,44,0.524644,c
96,45,0.688837,c
97,79,0.243586,c
98,14,0.890887,c


In [17]:
# Scale columns "a" and "b" with the "min-max" scaler option.
transformer = Scaler(
    scaled_features=["a", "b"],
    scaler="min-max",
)
transformer.fit_transform(test_scaled)

Unnamed: 0,a,b,c
0,0.244681,0.155591,c
1,0.457447,0.141807,c
2,0.319149,0.891085,c
3,0.691489,0.185955,c
4,0.372340,0.333675,c
...,...,...,...
95,0.425532,0.526800,c
96,0.436170,0.693809,c
97,0.797872,0.240923,c
98,0.106383,0.899323,c


In [18]:
# Scale columns "a" and "b" with the "standard" scaler option.
transformer = Scaler(
    scaled_features=["a", "b"],
    scaler="standard",
)
transformer.fit_transform(test_scaled)

Unnamed: 0,a,b,c
0,-0.826026,-1.304343,c
1,-0.104606,-1.350572,c
2,-0.573529,1.162256,c
3,0.688956,-1.202515,c
4,-0.393174,-0.707110,c
...,...,...,...
95,-0.212819,-0.059433,c
96,-0.176748,0.500658,c
97,1.049666,-1.018170,c
98,-1.294949,1.189884,c


In [39]:
# Fixture for the encoders: three string columns ("a", "b", "c") that all hold
# the same two-category sequence.
test_encoded = pd.DataFrame(
    {column: ["a", "a", "b", "b"] for column in ("a", "b", "c")}
)

In [40]:
# Encode column "a" through an explicit lookup: "a" -> 0, "b" -> 1.
binaryE = BinaryEncoder(
    encoded_feature="a",
    value_map={"a": 0, "b": 1},
)
binaryE.fit_transform(test_encoded)

Unnamed: 0,a,b,c
0,0,a,a
1,0,a,a
2,1,b,b
3,1,b,b


In [43]:
# One-hot encode all three columns of the encoder fixture.
ohE = OneHotEncoder(
    encoded_features=["a", "b", "c"],
)
ohE.fit_transform(test_encoded)

Unnamed: 0,a,b,c,a_a,a_b,b_a,b_b,c_a,c_b
0,a,a,a,1.0,0.0,1.0,0.0,1.0,0.0
1,a,a,a,1.0,0.0,1.0,0.0,1.0,0.0
2,b,b,b,0.0,1.0,0.0,1.0,0.0,1.0
3,b,b,b,0.0,1.0,0.0,1.0,0.0,1.0


In [51]:
# Label-encode column "a" of the encoder fixture.
# NOTE(review): `encoded_features` is given a bare string here, while the
# OneHotEncoder call in this notebook passes a list — confirm which form
# LabelEncoder expects.
labelE = LabelEncoder(
    encoded_features="a",
)
labelE.fit_transform(test_encoded)

Unnamed: 0,a,b,c
0,0,a,a
1,0,a,a
2,1,b,b
3,1,b,b
