# Scaling categorical features

Some features are categorical. Don't scale them.

**Unless I'm doing something wrong, it actually does not seem to make any difference**

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np

X, y = make_classification(n_samples=10000, n_features=4, weights=[0.85])  # First 2 features are informative.

X[:, 0] = np.digitize(X[:, 0], bins=np.linspace(-4, 4, 9))

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [2]:
X[:25, 0]

array([4., 6., 7., 3., 4., 5., 5., 4., 7., 4., 5., 4., 5., 5., 4., 4., 4.,
       6., 2., 1., 3., 5., 4., 3., 5.])

Make another version where we make x_0 purely categorical, with no order.

In [22]:
X_shuf = np.copy(X)
X_shuf[:, 0] = (X_shuf[:, 0] + 7) % 9

X_shuf_train, X_shuf_test, y_shuf_train, y_shuf_test = train_test_split(X_shuf, y)

## Train models on these datasets

- What happens if I train on this scaled dataset? Presumably it's not too bad, since the variable is ordinal.
- What if I mix the categories (i.e. make them non-ordinal) and train on that? Presumably it's bad.

Train on scaled data.

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [24]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_sc, y_train)
y_hat = model.predict(X_test_sc)

In [25]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2132
           1       0.84      0.68      0.75       368

    accuracy                           0.93      2500
   macro avg       0.89      0.83      0.86      2500
weighted avg       0.93      0.93      0.93      2500



Now same thing but mix the categories first.

In [27]:
scaler.fit(X_shuf_train)
X_shuf_train_sc = scaler.transform(X_shuf_train)
X_shuf_test_sc = scaler.transform(X_shuf_test)

model = LogisticRegression()
model.fit(X_shuf_train_sc, y_shuf_train)
y_hat = model.predict(X_shuf_test_sc)

print(classification_report(y_shuf_test, y_hat))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2154
           1       0.84      0.66      0.74       346

    accuracy                           0.94      2500
   macro avg       0.89      0.82      0.85      2500
weighted avg       0.93      0.94      0.93      2500



## And do it properly

We should dummy encode these things instead.

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

numeric_features = [1, 2, 3]
numeric_transformer = make_pipeline(StandardScaler())

categorical_features = [0]
categorical_transformer = make_pipeline(OneHotEncoder(drop='first'))

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipe = make_pipeline(preprocessor, LogisticRegression())

Now try datasets

In [30]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2132
           1       0.88      0.79      0.83       368

    accuracy                           0.95      2500
   macro avg       0.92      0.88      0.90      2500
weighted avg       0.95      0.95      0.95      2500



In [31]:
pipe.fit(X_shuf_train, y_shuf_train)
y_hat = pipe.predict(X_shuf_test)

print(classification_report(y_shuf_test, y_hat))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2154
           1       0.87      0.73      0.80       346

    accuracy                           0.95      2500
   macro avg       0.92      0.86      0.88      2500
weighted avg       0.95      0.95      0.95      2500

