# Preprocessing data

[Documentation](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-data)

In [11]:
import pandas as pd
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Location of the training CSV; the relative path is resolved against the
# notebook's working directory.
train_path = '../data/users_train.csv'
# Read the whole file into a DataFrame using pandas' default parsing options.
train_data = pd.read_csv(train_path)

# Numeric Transformation

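$$f(x) = \frac{x-\bar x}{\sigma}$$

where $\bar x$ is the mean and $\sigma$ the standard deviation of the feature, both estimated from the training data.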

In [19]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series,
# which is the shape the scaler is fitted on below.
x_numeric = train_data[['cnt_user_engagement']]
x_numeric

Unnamed: 0,cnt_user_engagement
0,1
1,172
2,74
3,30
4,20
...,...
7185,13
7186,6
7187,39
7188,9


In [12]:
# Standardizer with default settings; it holds no statistics until it is fitted.
scaler = preprocessing.StandardScaler()

StandardScaler()

In [15]:
# Learn the column's mean and variance from the training data. fit() returns
# the scaler itself, which is why its repr is displayed as the cell output.
scaler.fit(x_numeric)

StandardScaler()

In [16]:
# Standardize the column with the statistics learned by fit(); the result is a
# NumPy array with one row per input row, not a DataFrame.
scaler.transform(x_numeric)

array([[-0.55610442],
       [ 2.60049074],
       [ 0.7914479 ],
       ...,
       [ 0.14536117],
       [-0.40842746],
       [-0.13153314]])

In [18]:
# Statistics learned by fit(). Note that var_ is the variance (sigma squared);
# the sigma used in the formula above is its square root.
scaler.mean_, scaler.var_

(array([31.12545202]), array([2934.63739114]))

In [None]:
# Preprocessing pipeline: impute + scale the numeric columns, impute + one-hot
# encode the categorical columns, and drop the columns in IGNORE_COLUMNS.
#
# NOTE(review): `numeric`, `categorical`, `most_common`, `MostCommonCategories`,
# NUMERICAL_COLUMNS, CATEGORICAL_COLUMNS and IGNORE_COLUMNS are not defined in
# the cells above -- define or import them before running this cell.

# Numeric branch: missing values become 0, then each column is standardized.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: missing values take the column's most frequent value.
categorical_steps = [("imputer", SimpleImputer(strategy="most_frequent"))]
if most_common:
    # `most_common` acts both as the on/off switch for this step and as the
    # `thr` argument handed to MostCommonCategories.
    categorical_steps.append(
        ("most_common", MostCommonCategories(thr=most_common))
    )
# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
categorical_steps.append(
    ("onehot", OneHotEncoder(categories="auto", handle_unknown="ignore"))
)

categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch. When a branch is switched off, the
# string "drop" takes the transformer's place and those columns are removed
# from the output.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "numeric_features",
            numeric_transformer if numeric else "drop",
            NUMERICAL_COLUMNS,
        ),
        (
            "categorical_features",
            # Fixed: the flag was misspelled `categical`, a NameError at run time.
            categorical_transformer if categorical else "drop",
            CATEGORICAL_COLUMNS,
        ),
        (
            "ignore_features",
            "drop",
            IGNORE_COLUMNS,
        ),
    ],
    # n_jobs=-1,
)