In this notebook you will find other transformations for preprocessing.

In [56]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

In [4]:
# Load the toy grades dataset used throughout this notebook and display it.
df = pd.read_csv(filepath_or_buffer="data/quiz2-grade-toy-col-transformer.csv")
df

Unnamed: 0,enjoy_course,ml_experience,major,class_attendance,university_years,lab1,lab2,lab3,lab4,quiz1,quiz2
0,yes,1,Computer Science,Excellent,3,92,93.0,84,91,92,A+
1,yes,1,Mechanical Engineering,Average,2,94,90.0,80,83,91,not A+
2,yes,0,Mathematics,Poor,3,78,85.0,83,80,80,not A+
3,no,0,Mathematics,Excellent,3,91,,92,91,89,A+
4,yes,0,Psychology,Good,4,77,83.0,90,92,85,A+
5,no,1,Economics,Good,5,70,73.0,68,74,71,not A+
6,yes,1,Computer Science,Excellent,4,80,88.0,89,88,91,A+
7,no,0,Mechanical Engineering,Poor,3,95,93.0,69,79,75,not A+
8,no,0,Linguistics,Average,2,97,90.0,94,82,80,not A+
9,yes,1,Mathematics,Average,4,95,82.0,94,94,85,not A+


In [5]:
# Summarise each column's dtype and non-null count (useful for spotting missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   enjoy_course      21 non-null     object 
 1   ml_experience     21 non-null     int64  
 2   major             21 non-null     object 
 3   class_attendance  21 non-null     object 
 4   university_years  21 non-null     int64  
 5   lab1              21 non-null     int64  
 6   lab2              19 non-null     float64
 7   lab3              21 non-null     int64  
 8   lab4              21 non-null     int64  
 9   quiz1             21 non-null     int64  
 10  quiz2             21 non-null     object 
dtypes: float64(1), int64(6), object(4)
memory usage: 1.9+ KB


In [6]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,enjoy_course,ml_experience,major,class_attendance,university_years,lab1,lab2,lab3,lab4,quiz1,quiz2
0,yes,1,Computer Science,Excellent,3,92,93.0,84,91,92,A+
1,yes,1,Mechanical Engineering,Average,2,94,90.0,80,83,91,not A+
2,yes,0,Mathematics,Poor,3,78,85.0,83,80,80,not A+
3,no,0,Mathematics,Excellent,3,91,,92,91,89,A+
4,yes,0,Psychology,Good,4,77,83.0,90,92,85,A+


We would like to apply the following transformations:

- Scaling on numeric features
- One-hot encoding on the categorical feature `major` and the binary feature `enjoy_course`
- Ordinal encoding on the ordinal feature `class_attendance`
- Imputation on the `lab2` feature
- No transformation on the `ml_experience` feature

### `ColumnTransformer` example

#### Data

In [7]:
# `quiz2` is the target; every other column is a feature.
y = df["quiz2"]
X = df.drop(columns=["quiz2"])
X.columns

Index(['enjoy_course', 'ml_experience', 'major', 'class_attendance',
       'university_years', 'lab1', 'lab2', 'lab3', 'lab4', 'quiz1'],
      dtype='object')

#### Identify the transformations we want to apply

In [8]:
# Preview the feature columns before deciding how each one should be transformed.
X.head()

Unnamed: 0,enjoy_course,ml_experience,major,class_attendance,university_years,lab1,lab2,lab3,lab4,quiz1
0,yes,1,Computer Science,Excellent,3,92,93.0,84,91,92
1,yes,1,Mechanical Engineering,Average,2,94,90.0,80,83,91
2,yes,0,Mathematics,Poor,3,78,85.0,83,80,80
3,no,0,Mathematics,Excellent,3,91,,92,91,89
4,yes,0,Psychology,Good,4,77,83.0,90,92,85


In [9]:
# Group the columns by the transformation each group will receive.

# apply scaling
numeric_feats = [
    "university_years",
    "lab1",
    "lab3",
    "lab4",
    "quiz1",
]
# apply one-hot encoding
categorical_feats = ["major"]
# do not apply any transformation
passthrough_feats = ["ml_experience"]
# do not include these features in modeling
drop_feats = ["lab2", "class_attendance", "enjoy_course"]

#### Create a column transformer

- Each transformation is specified by a name, a transformer object, and the columns this transformer should be applied to.

In [10]:
from sklearn.compose import ColumnTransformer

In [15]:
# Explicit syntax: each entry is (name, transformer, columns).
ct = ColumnTransformer(
    transformers=[
        ("scaling", StandardScaler(), numeric_feats),
        ("onehot", OneHotEncoder(sparse_output=False), categorical_feats),
    ],
)

#### Convenient `make_column_transformer` syntax

- Similar to `make_pipeline` syntax, there is convenient `make_column_transformer` syntax.
- The syntax automatically names each step based on its class.
- We'll be mostly using this syntax.

In [16]:
from sklearn.compose import make_column_transformer

# Build the column transformer; step names are generated automatically
# (later cells look up the encoder under the name "onehotencoder").
ct = make_column_transformer(
    (StandardScaler(), numeric_feats),  # scaling on numeric features
    ("passthrough", passthrough_feats),  # leave `ml_experience` untouched
    (OneHotEncoder(), categorical_feats),  # one-hot encoding on `major`
    ("drop", drop_feats),  # exclude these columns from the output
)

In [17]:
# Display the column transformer.
ct

In [18]:
# Fit each transformer on its columns of X and collect the transformed output.
transformed = ct.fit_transform(X)

- When we `fit_transform`, each transformer is applied to the specified columns and the results of the transformations are concatenated horizontally.
- A big advantage here is that we build all our transformations together into one object, and that way we're sure we do the same operations to all splits of the data.
- Otherwise we might, for example, do the OHE on both train and test but forget to scale the test data.

#### Let's examine the transformed data

In [19]:
# Check what kind of container the transformed data comes back as.
type(transformed[:2])

numpy.ndarray

In [20]:
# Inspect the raw transformed values (no column names are attached).
transformed

array([[-0.09345386,  0.3589134 , -0.21733442,  0.36269995,  0.84002795,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [-1.07471942,  0.59082668, -0.61420598, -0.85597188,  0.71219761,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [-0.09345386, -1.26447953, -0.31655231, -1.31297381, -0.69393613,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ],
       [-0.09345386,  0.24295676,  0.57640869,  0.36269995,  0.45653693,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ],
       [ 0.8878117 , -1.38043616,  0.37797291,  0.51503393, -0.05478443,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.  

#### Viewing the transformed data as a dataframe

- How can we view our transformed data as a dataframe?
- We are adding more columns.
- So the original columns won't directly map to the transformed data.
- Let's create column names for the transformed data.

In [21]:
# Column names listed in the same order as the transformers were passed to `ct`:
# scaled numeric columns, the passthrough column, then the one-hot columns.
column_names = [
    *numeric_feats,
    *passthrough_feats,
    *ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist(),
]
column_names

['university_years',
 'lab1',
 'lab3',
 'lab4',
 'quiz1',
 'ml_experience',
 'major_Biology',
 'major_Computer Science',
 'major_Economics',
 'major_Linguistics',
 'major_Mathematics',
 'major_Mechanical Engineering',
 'major_Physics',
 'major_Psychology']

In [22]:
# Fitted transformers, keyed by their automatically generated step names.
ct.named_transformers_

{'standardscaler': StandardScaler(),
 'passthrough': FunctionTransformer(accept_sparse=True, check_inverse=False,
                     feature_names_out='one-to-one'),
 'onehotencoder': OneHotEncoder(),
 'drop': 'drop'}

```{note}
Note that the order of the columns in the transformed data depends upon the order of the features we pass to the `ColumnTransformer` and can be different than the order of the features in the original dataframe.
```

In [23]:
# Attach the column names to the transformed array for easier reading.
pd.DataFrame(data=transformed, columns=column_names)

Unnamed: 0,university_years,lab1,lab3,lab4,quiz1,ml_experience,major_Biology,major_Computer Science,major_Economics,major_Linguistics,major_Mathematics,major_Mechanical Engineering,major_Physics,major_Psychology
0,-0.093454,0.358913,-0.217334,0.3627,0.840028,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.074719,0.590827,-0.614206,-0.855972,0.712198,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.093454,-1.26448,-0.316552,-1.312974,-0.693936,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.093454,0.242957,0.576409,0.3627,0.456537,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.887812,-1.380436,0.377973,0.515034,-0.054784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,1.869077,-2.192133,-1.804821,-2.226978,-1.844409,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.887812,-1.032566,0.278755,-0.094302,0.712198,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.093454,0.706783,-1.705603,-1.465308,-1.333088,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,-1.074719,0.938697,0.774844,-1.008306,-0.693936,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.887812,0.706783,0.774844,0.819702,-0.054784,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### `ColumnTransformer`: Transformed data

<br>

<img src='./img/column-transformer.png' width="1500">

[Adapted from here.](https://amueller.github.io/COMS4995-s20/slides/aml-04-preprocessing/#37)

#### Training models with transformed data
- We can now pass the `ColumnTransformer` object as a step in a pipeline.

In [26]:
# Chain the column transformer with an SVC, fit on (X, y), and predict on the same X.
pipe = make_pipeline(ct, SVC()).fit(X, y)
pipe.predict(X)

array(['A+', 'not A+', 'not A+', 'A+', 'A+', 'not A+', 'A+', 'not A+',
       'not A+', 'A+', 'A+', 'A+', 'A+', 'A+', 'not A+', 'not A+', 'A+',
       'not A+', 'not A+', 'not A+', 'A+'], dtype=object)

In [27]:
# Display the fitted pipeline.
pipe

## More on feature transformations

### `sklearn` `set_config`

- With multiple transformations in a column transformer, it can get tricky to keep track of everything happening inside it.
- We can use `set_config` to display a diagram of this.

In [28]:
from sklearn import set_config

# Ask scikit-learn to display estimators as diagrams in the notebook.
set_config(display="diagram")

In [29]:
# Display the column transformer with the diagram setting in effect.
ct

In [30]:
# print() shows the plain-text representation of the column transformer.
print(ct)

ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                 ['university_years', 'lab1', 'lab3', 'lab4',
                                  'quiz1']),
                                ('passthrough', 'passthrough',
                                 ['ml_experience']),
                                ('onehotencoder', OneHotEncoder(), ['major']),
                                ('drop', 'drop',
                                 ['lab2', 'class_attendance', 'enjoy_course'])])


### Multiple transformations in a transformer

- Recall that `lab2` has missing values.


In [31]:
# Show the first ten rows; row 3 has no value for `lab2`.
X.head(10)

Unnamed: 0,enjoy_course,ml_experience,major,class_attendance,university_years,lab1,lab2,lab3,lab4,quiz1
0,yes,1,Computer Science,Excellent,3,92,93.0,84,91,92
1,yes,1,Mechanical Engineering,Average,2,94,90.0,80,83,91
2,yes,0,Mathematics,Poor,3,78,85.0,83,80,80
3,no,0,Mathematics,Excellent,3,91,,92,91,89
4,yes,0,Psychology,Good,4,77,83.0,90,92,85
5,no,1,Economics,Good,5,70,73.0,68,74,71
6,yes,1,Computer Science,Excellent,4,80,88.0,89,88,91
7,no,0,Mechanical Engineering,Poor,3,95,93.0,69,79,75
8,no,0,Linguistics,Average,2,97,90.0,94,82,80
9,yes,1,Mathematics,Average,4,95,82.0,94,94,85


- So we would like to apply more than one transformation to it: imputation and scaling.
- We can treat `lab2` separately, but we can also include it in `numeric_feats` and apply both transformations to all numeric columns.

In [32]:
# `lab2` now joins the numeric group instead of being dropped.

# apply imputation and scaling
numeric_feats = ["university_years", "lab1", "lab2", "lab3", "lab4", "quiz1"]
# apply one-hot encoding
categorical_feats = ["major"]
# do not apply any transformation
passthrough_feats = ["ml_experience"]
# do not include these features in modeling
drop_feats = [
    "class_attendance",
    "enjoy_course",
]

- To apply more than one transformation to the same columns, we can define a pipeline inside a column transformer to chain the transformations.

In [35]:
# Same column transformer as before, except the numeric columns now go through
# a two-step pipeline (imputer, then scaler).
ct = make_column_transformer(
    (
        make_pipeline(SimpleImputer(), StandardScaler()),
        numeric_feats,
    ),  # imputation, then scaling, on numeric features
    ("passthrough", passthrough_feats),  # leave `ml_experience` untouched
    (OneHotEncoder(), categorical_feats),  # one-hot encoding on `major`
    ("drop", drop_feats),  # exclude these columns from the output
)

In [36]:
# Display the column transformer that contains the nested pipeline.
ct

In [37]:
# Fit the column transformer on X and keep the transformed output.
X_transformed = ct.fit_transform(X)

In [38]:
# Column names in transformer order: numeric columns (now including `lab2`),
# the passthrough column, then the one-hot columns.
column_names = [
    *numeric_feats,
    *passthrough_feats,
    *ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist(),
]
column_names

['university_years',
 'lab1',
 'lab2',
 'lab3',
 'lab4',
 'quiz1',
 'ml_experience',
 'major_Biology',
 'major_Computer Science',
 'major_Economics',
 'major_Linguistics',
 'major_Mathematics',
 'major_Mechanical Engineering',
 'major_Physics',
 'major_Psychology']

In [39]:
# View the transformed data with column names attached.
pd.DataFrame(data=X_transformed, columns=column_names)

Unnamed: 0,university_years,lab1,lab2,lab3,lab4,quiz1,ml_experience,major_Biology,major_Computer Science,major_Economics,major_Linguistics,major_Mathematics,major_Mechanical Engineering,major_Physics,major_Psychology
0,-0.093454,0.358913,0.89326,-0.217334,0.3627,0.840028,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.074719,0.590827,0.294251,-0.614206,-0.855972,0.712198,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.093454,-1.26448,-0.704099,-0.316552,-1.312974,-0.693936,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.093454,0.242957,0.0,0.576409,0.3627,0.456537,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.887812,-1.380436,-1.103439,0.377973,0.515034,-0.054784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,1.869077,-2.192133,-3.100139,-1.804821,-2.226978,-1.844409,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.887812,-1.032566,-0.105089,0.278755,-0.094302,0.712198,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.093454,0.706783,0.89326,-1.705603,-1.465308,-1.333088,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,-1.074719,0.938697,0.294251,0.774844,-1.008306,-0.693936,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.887812,0.706783,-1.303109,0.774844,0.819702,-0.054784,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Incorporating ordinal feature `class_attendance`

- The `class_attendance` column is different from the `major` column in that there is some ordering of the values.
    - Excellent > Good > Average > Poor

In [40]:
# Look at the `class_attendance` values alongside the other features.
X.head()

Unnamed: 0,enjoy_course,ml_experience,major,class_attendance,university_years,lab1,lab2,lab3,lab4,quiz1
0,yes,1,Computer Science,Excellent,3,92,93.0,84,91,92
1,yes,1,Mechanical Engineering,Average,2,94,90.0,80,83,91
2,yes,0,Mathematics,Poor,3,78,85.0,83,80,80
3,no,0,Mathematics,Excellent,3,91,,92,91,89
4,yes,0,Psychology,Good,4,77,83.0,90,92,85


Let's try applying `OrdinalEncoder` on this column.

In [41]:
X_toy = X[["class_attendance"]]
enc = OrdinalEncoder()
enc.fit(X_toy)
X_toy_ord = enc.transform(X_toy)
df = pd.DataFrame(
    data=X_toy_ord,
    columns=["class_attendance_enc"],
    index=X_toy.index,
)

In [42]:
pd.concat([X_toy, df], axis=1).head(10)

Unnamed: 0,class_attendance,class_attendance_enc
0,Excellent,1.0
1,Average,0.0
2,Poor,3.0
3,Excellent,1.0
4,Good,2.0
5,Good,2.0
6,Excellent,1.0
7,Poor,3.0
8,Average,0.0
9,Average,0.0


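- What's the problem here?
    - The encoder doesn't know the order. By default it sorts the categories alphabetically (Average=0, Excellent=1, Good=2, Poor=3), so the encoded values do not reflect the attendance ranking.
- We can examine the unique categories manually, order them based on our intuitions, and then provide this human knowledge to the transformer.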

What are the unique categories of `class_attendance`?

In [43]:
# List the distinct attendance levels present in the data.
X_toy["class_attendance"].unique()

array(['Excellent', 'Average', 'Poor', 'Good'], dtype=object)

Let's order them manually.

In [44]:
# Attendance levels listed from lowest to highest.
class_attendance_levels = [
    "Poor",
    "Average",
    "Good",
    "Excellent",
]

```{note}
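Note that using the reverse order of the categories (`["Excellent", "Good", "Average", "Poor"]`) would work equally well: what matters is that the encoding preserves the relative order of the categories, not which end of the scale gets the largest number.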
```

Let's make sure that we have included all categories in our manual ordering.

In [45]:
# The manual ordering must cover exactly the categories found in the data.
assert set(X_toy["class_attendance"].unique()) == set(class_attendance_levels)

In [46]:
# Encode `class_attendance` again, this time supplying the manual ordering so
# that the integer codes follow Poor < Average < Good < Excellent.
oe = OrdinalEncoder(categories=[class_attendance_levels], dtype=int)
# `X_toy` already contains only the `class_attendance` column, so it is passed
# directly; fit and transform happen in one step.
ca_transformed = oe.fit_transform(X_toy)
# Dedicated name instead of `df`, so the raw dataset name is not rebound here.
attendance_ordered_enc_df = pd.DataFrame(
    data=ca_transformed, columns=["class_attendance_enc"], index=X_toy.index
)
print(oe.categories_)
pd.concat([X_toy, attendance_ordered_enc_df], axis=1).head(10)

[array(['Poor', 'Average', 'Good', 'Excellent'], dtype=object)]


Unnamed: 0,class_attendance,class_attendance_enc
0,Excellent,3
1,Average,1
2,Poor,0
3,Excellent,3
4,Good,2
5,Good,2
6,Excellent,3
7,Poor,0
8,Average,1
9,Average,1


The encoded categories are looking better now!

#### More than one ordinal column?

- We can pass the manually ordered categories when we create an `OrdinalEncoder` object as a list of lists.
- If you have more than one ordinal column
    - manually create a list of ordered categories for each column
    - pass a list of lists to `OrdinalEncoder`, where each inner list corresponds to manually created list of ordered categories for a corresponding ordinal column.


Now let's incorporate ordinal encoding of `class_attendance` in our column transformer.

In [47]:
# Display the feature frame again before regrouping the columns.
X

Unnamed: 0,enjoy_course,ml_experience,major,class_attendance,university_years,lab1,lab2,lab3,lab4,quiz1
0,yes,1,Computer Science,Excellent,3,92,93.0,84,91,92
1,yes,1,Mechanical Engineering,Average,2,94,90.0,80,83,91
2,yes,0,Mathematics,Poor,3,78,85.0,83,80,80
3,no,0,Mathematics,Excellent,3,91,,92,91,89
4,yes,0,Psychology,Good,4,77,83.0,90,92,85
5,no,1,Economics,Good,5,70,73.0,68,74,71
6,yes,1,Computer Science,Excellent,4,80,88.0,89,88,91
7,no,0,Mechanical Engineering,Poor,3,95,93.0,69,79,75
8,no,0,Linguistics,Average,2,97,90.0,94,82,80
9,yes,1,Mathematics,Average,4,95,82.0,94,94,85


In [48]:
# `class_attendance` moves from the dropped columns into its own ordinal group.

# apply imputation and scaling
numeric_feats = ["university_years", "lab1", "lab2", "lab3", "lab4", "quiz1"]
# apply one-hot encoding
categorical_feats = ["major"]
# apply ordinal encoding
ordinal_feats = ["class_attendance"]
# do not apply any transformation
passthrough_feats = ["ml_experience"]
# do not include these features
drop_feats = ["enjoy_course"]

In [49]:
# Column transformer that now also ordinal-encodes `class_attendance` using the
# manually ordered levels. The OneHotEncoder still uses its default settings.
ct = make_column_transformer(
    (
        make_pipeline(SimpleImputer(), StandardScaler()),
        numeric_feats,
    ),  # imputation, then scaling, on numeric features
    (
        OrdinalEncoder(categories=[class_attendance_levels], dtype=int),
        ordinal_feats,
    ),  # ordinal encoding with the manual category order
    ("passthrough", passthrough_feats),  # leave `ml_experience` untouched
    (OneHotEncoder(), categorical_feats),  # one-hot encoding on `major`
    ("drop", drop_feats),  # exclude these columns from the output
)

In [50]:
# Display the column transformer including the ordinal encoder.
ct

In [51]:
# Fit the updated column transformer on X and keep the transformed output.
X_transformed = ct.fit_transform(X)

In [52]:
# Column names in transformer order: numeric, ordinal, passthrough, one-hot.
column_names = [
    *numeric_feats,
    *ordinal_feats,
    *passthrough_feats,
    *ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist(),
]
column_names

['university_years',
 'lab1',
 'lab2',
 'lab3',
 'lab4',
 'quiz1',
 'class_attendance',
 'ml_experience',
 'major_Biology',
 'major_Computer Science',
 'major_Economics',
 'major_Linguistics',
 'major_Mathematics',
 'major_Mechanical Engineering',
 'major_Physics',
 'major_Psychology']

In [53]:
# View the transformed data, now with the encoded `class_attendance` column.
pd.DataFrame(data=X_transformed, columns=column_names)

Unnamed: 0,university_years,lab1,lab2,lab3,lab4,quiz1,class_attendance,ml_experience,major_Biology,major_Computer Science,major_Economics,major_Linguistics,major_Mathematics,major_Mechanical Engineering,major_Physics,major_Psychology
0,-0.093454,0.358913,0.89326,-0.217334,0.3627,0.840028,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.074719,0.590827,0.294251,-0.614206,-0.855972,0.712198,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.093454,-1.26448,-0.704099,-0.316552,-1.312974,-0.693936,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.093454,0.242957,0.0,0.576409,0.3627,0.456537,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.887812,-1.380436,-1.103439,0.377973,0.515034,-0.054784,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,1.869077,-2.192133,-3.100139,-1.804821,-2.226978,-1.844409,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.887812,-1.032566,-0.105089,0.278755,-0.094302,0.712198,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.093454,0.706783,0.89326,-1.705603,-1.465308,-1.333088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,-1.074719,0.938697,0.294251,0.774844,-1.008306,-0.693936,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.887812,0.706783,-1.303109,0.774844,0.819702,-0.054784,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Dealing with unknown categories

Let's create a pipeline with the column transformer and pass it to `cross_validate`.

In [54]:
# Pipeline: the column transformer followed by an SVC classifier.
pipe = make_pipeline(ct, SVC())

In [57]:
# Cross-validate the pipeline whose OneHotEncoder still uses its default
# settings. One fold is expected to come back with a NaN test score; that
# failure is what this cell demonstrates.
# cv=5 is spelled out so that this call matches the later cross_validate calls
# and the results can be compared directly.
scores = cross_validate(pipe, X, y, cv=5, return_train_score=True)
pd.DataFrame(scores)

Traceback (most recent call last):
  File "/Users/gabrielasoares/.local/share/virtualenvs/cda-m6uw-1w4/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gabrielasoares/.local/share/virtualenvs/cda-m6uw-1w4/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 415, in __call__
    return estimator.score(*args, **kwargs)
  File "/Users/gabrielasoares/.local/share/virtualenvs/cda-m6uw-1w4/lib/python3.9/site-packages/sklearn/pipeline.py", line 993, in score
    Xt = transform.transform(Xt)
  File "/Users/gabrielasoares/.local/share/virtualenvs/cda-m6uw-1w4/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/gabrielasoares/.local/share/virtualenvs/cda-m6uw-1w4/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 1014, in transform
    Xs = self._call_func_on_

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.01567,0.002804,1.0,0.9375
1,0.004265,0.002053,1.0,0.941176
2,0.003682,0.001873,0.5,1.0
3,0.00359,0.001777,0.75,0.941176
4,0.0035,0.042072,,1.0


- What's going on here?? The test score of the last fold is `NaN`.
- Let's look at the error message: `ValueError: Found unknown categories ['Biology'] in column 0 during transform`

In [58]:
# Count how many rows each major has.
X["major"].value_counts()

major
Computer Science          4
Mathematics               4
Mechanical Engineering    3
Psychology                3
Economics                 2
Linguistics               2
Physics                   2
Biology                   1
Name: count, dtype: int64

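- There is only one instance of Biology.
- During cross-validation, this single example ends up in the validation split of one fold, so the `OneHotEncoder` fitted on that fold's training split has never seen the category.
- By default, `OneHotEncoder` throws an error on unknown categories because you might want to know about this.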

Simplest fix:
- Pass the `handle_unknown="ignore"` argument to `OneHotEncoder`.
- An unknown category is then encoded as all zeros in the one-hot columns of that feature.

In [59]:
# Same column transformer as before, except the OneHotEncoder is told to ignore
# categories it did not see during fit instead of raising an error.
ct = make_column_transformer(
    (
        make_pipeline(SimpleImputer(), StandardScaler()),
        numeric_feats,
    ),  # imputation, then scaling, on numeric features
    (
        OrdinalEncoder(categories=[class_attendance_levels], dtype=int),
        ordinal_feats,
    ),  # ordinal encoding with the manual category order
    ("passthrough", passthrough_feats),  # leave `ml_experience` untouched
    (
        OneHotEncoder(handle_unknown="ignore"),
        categorical_feats,
    ),  # one-hot encoding on `major`, tolerating unknown categories
    ("drop", drop_feats),  # exclude these columns from the output
)

In [60]:
# Display the column transformer with the updated OneHotEncoder.
ct

In [61]:
# Rebuild the pipeline around the updated column transformer.
pipe = make_pipeline(ct, SVC())

In [62]:
# Five-fold cross-validation of the updated pipeline, keeping the train scores.
scores = cross_validate(
    estimator=pipe,
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.007331,0.006623,1.0,0.9375
1,0.004217,0.002107,1.0,0.941176
2,0.004507,0.00275,0.5,1.0
3,0.008862,0.003792,0.75,0.941176
4,0.008322,0.00945,0.75,1.0


- With this approach, all unknown categories will be represented with all zeros and cross-validation is running OK now.

Ask yourself the following questions when you work with categorical variables
- Do you want this behaviour?
- Are you expecting to get many unknown categories? Do you want to be able to distinguish between them?

### Categorical features with only two possible categories

- Sometimes you have features with only two possible categories.
- If we apply `OneHotEncoder` on such columns, it'll create two columns, which seems wasteful, as we could represent all the information in just one column of 0's and 1's indicating the presence or absence of one of the categories.
- You can pass `drop="if_binary"` argument to `OneHotEncoder` in order to create only one column in such scenario.

In [63]:
# Preview the `enjoy_course` column, which holds yes/no answers.
X["enjoy_course"].head()

0    yes
1    yes
2    yes
3     no
4    yes
Name: enjoy_course, dtype: object

In [64]:
# One-hot encode the two-category `enjoy_course` column. drop="if_binary" keeps
# a single output column; dense integer output is requested so it fits a DataFrame.
ohe_enc = OneHotEncoder(drop="if_binary", dtype=int, sparse_output=False)
transformed = ohe_enc.fit_transform(X[["enjoy_course"]])
# Dedicated name instead of `df`, so the raw dataset name is not rebound here.
enjoy_course_enc_df = pd.DataFrame(
    data=transformed, columns=["enjoy_course_enc"], index=X.index
)
# Show the original answers next to their encoded value.
pd.concat([X[["enjoy_course"]], enjoy_course_enc_df], axis=1).head(10)

Unnamed: 0,enjoy_course,enjoy_course_enc
0,yes,1
1,yes,1
2,yes,1
3,no,0
4,yes,1
5,no,0
6,yes,1
7,no,0
8,no,0
9,yes,1


In [65]:
# Final grouping: every column is now used, so nothing is dropped.

# apply imputation and scaling
numeric_feats = ["university_years", "lab1", "lab2", "lab3", "lab4", "quiz1"]
# apply one-hot encoding
categorical_feats = ["major"]
# apply ordinal encoding
ordinal_feats = ["class_attendance"]
# apply one-hot encoding with drop="if_binary"
binary_feats = ["enjoy_course"]
# do not apply any transformation
passthrough_feats = ["ml_experience"]
drop_feats = []

In [66]:
# Final column transformer: `enjoy_course` gets its own one-hot encoder with
# drop="if_binary"; no "drop" entry is passed here.
ct = make_column_transformer(
    (
        make_pipeline(SimpleImputer(), StandardScaler()),
        numeric_feats,
    ),  # imputation, then scaling, on numeric features
    (
        OrdinalEncoder(categories=[class_attendance_levels], dtype=int),
        ordinal_feats,
    ),  # ordinal encoding with the manual category order
    (
        OneHotEncoder(drop="if_binary", dtype=int),
        binary_feats,
    ),  # one-hot encoding with drop="if_binary" on the binary feature
    ("passthrough", passthrough_feats),  # leave `ml_experience` untouched
    (
        OneHotEncoder(handle_unknown="ignore"),
        categorical_feats,
    ),  # one-hot encoding on `major`, tolerating unknown categories
)

In [67]:
# Display the final column transformer.
ct

In [68]:
# Rebuild the pipeline around the final column transformer.
pipe = make_pipeline(ct, SVC())

In [69]:
# Five-fold cross-validation of the final pipeline, keeping the train scores.
scores = cross_validate(
    estimator=pipe,
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.008055,0.002908,1.0,1.0
1,0.005196,0.003317,1.0,0.941176
2,0.004834,0.002542,0.5,1.0
3,0.005452,0.003579,1.0,0.941176
4,0.006211,0.004199,0.75,1.0
