# Scaling categorical features

Some features are categorical. Don't scale them.

**Unless I'm doing something wrong, it actually does not seem to make any difference**

In [106]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np

# First 2 features are informative. `weights` controls class balance.
X, y = make_classification(n_samples=10000,
                           n_features=4, n_classes=2,
                           n_clusters_per_class=1,
                           n_informative=2, n_redundant=0,
                           weights=None, random_state=42)

X[:, 0] = np.digitize(X[:, 0], bins=np.linspace(-4, 4, 9))

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [107]:
X[:25, 0]

array([6., 6., 6., 6., 6., 5., 4., 6., 3., 5., 3., 3., 5., 6., 5., 5., 6.,
       6., 3., 5., 5., 5., 4., 6., 4.])

Make another version where we make x_0 purely categorical, with no order.

I can't think of a way of doing this in NumPy, so Pandas it is.

In [108]:
import pandas as pd

shuf = {
    0: 3,
    1: 9,
    2: 1,
    3: 8,
    4: 0,
    5: 5,
    6: 2,
    7: 4,
    8: 6,
    9: 7,
}

s = pd.Series(X[:, 0])

X[:, 0] = s.replace(shuf)

In [109]:
X_shuf_train, X_shuf_test, y_shuf_train, y_shuf_test = train_test_split(X_shuf, y)

## Train models on these datasets

- What happens if I train on this scaled dataset? Presumably it's not too bad, since the variable is ordinal.
- What if I mix the categories (i.e. make them non-ordinal) and train on that? Presumably it's bad.

Train on scaled data.

In [110]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [111]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_sc, y_train)
y_hat = model.predict(X_test_sc)

In [112]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1236
           1       0.94      0.96      0.95      1264

    accuracy                           0.95      2500
   macro avg       0.95      0.95      0.95      2500
weighted avg       0.95      0.95      0.95      2500



Now same thing but with the shuffled categories.

In [113]:
scaler.fit(X_shuf_train)
X_shuf_train_sc = scaler.transform(X_shuf_train)
X_shuf_test_sc = scaler.transform(X_shuf_test)

model = LogisticRegression()
model.fit(X_shuf_train_sc, y_shuf_train)
y_hat = model.predict(X_shuf_test_sc)

print(classification_report(y_shuf_test, y_hat))

              precision    recall  f1-score   support

           0       0.49      0.52      0.51      1244
           1       0.49      0.46      0.48      1256

    accuracy                           0.49      2500
   macro avg       0.49      0.49      0.49      2500
weighted avg       0.49      0.49      0.49      2500



## And do it properly

We should dummy encode these things instead.

In [114]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

numeric_features = [1, 2, 3]
numeric_transformer = make_pipeline(StandardScaler())

categorical_features = [0]
categorical_transformer = make_pipeline(OneHotEncoder(drop='first'))

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipe = make_pipeline(preprocessor, LogisticRegression())

Now try datasets

In [115]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95      1236
           1       0.93      0.98      0.96      1264

    accuracy                           0.95      2500
   macro avg       0.96      0.95      0.95      2500
weighted avg       0.96      0.95      0.95      2500



In [116]:
pipe.fit(X_shuf_train, y_shuf_train)
y_hat = pipe.predict(X_shuf_test)

print(classification_report(y_shuf_test, y_hat))

ValueError: all features must be in [0, 1] or [-2, 0]