# Applying ColumnTransformer to mixed column data types, and using pipelines

Reference: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

We could handle transformations directly when training the model. With ```ColumnTransform``` we can apply ```OneHotEncoder``` to categorical columns and ```RobustScaler``` (which is more rebust to outliers than other transformations) to numerical columns. Let's go back to the original dataset to practice this.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data_prep/hot_plus_random.csv')

In [3]:
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,success
0,0.565,0.547,0.0,-7.722,1.0,0.0347,0.579,0.0,0.194,0.252,75.018,audio_features,1MOqMyQ7CULmWWjovkFY5B,spotify:track:1MOqMyQ7CULmWWjovkFY5B,https://api.spotify.com/v1/tracks/1MOqMyQ7CULm...,https://api.spotify.com/v1/audio-analysis/1MOq...,209320.0,4.0,1.0
1,0.578,0.894,10.0,-5.42,1.0,0.0296,0.0103,3e-06,0.216,0.741,165.98,audio_features,2dwhMQsFeHr2S787WxqAqW,spotify:track:2dwhMQsFeHr2S787WxqAqW,https://api.spotify.com/v1/tracks/2dwhMQsFeHr2...,https://api.spotify.com/v1/audio-analysis/2dwh...,347107.0,4.0,1.0
2,0.529,0.496,7.0,-9.007,1.0,0.029,0.173,0.0,0.251,0.278,136.859,audio_features,3y4LxiYMgDl4RethdzpmNe,spotify:track:3y4LxiYMgDl4RethdzpmNe,https://api.spotify.com/v1/tracks/3y4LxiYMgDl4...,https://api.spotify.com/v1/audio-analysis/3y4L...,250547.0,4.0,1.0
3,0.488,0.923,2.0,-3.697,1.0,0.103,0.129,0.0,0.158,0.818,183.891,audio_features,296XGtH5MeGisqD3uAz6Q6,spotify:track:296XGtH5MeGisqD3uAz6Q6,https://api.spotify.com/v1/tracks/296XGtH5MeGi...,https://api.spotify.com/v1/audio-analysis/296X...,202253.0,4.0,1.0
4,0.753,0.45,9.0,-6.909,1.0,0.0924,0.274,2e-06,0.321,0.56,109.405,audio_features,6kD36kVRn5leDDbjXpHQY0,spotify:track:6kD36kVRn5leDDbjXpHQY0,https://api.spotify.com/v1/tracks/6kD36kVRn5le...,https://api.spotify.com/v1/audio-analysis/6kD3...,288933.0,4.0,1.0


In [4]:
df.shape

(15714, 19)

In [5]:
df.dtypes

danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
id                   object
uri                  object
track_href           object
analysis_url         object
duration_ms         float64
time_signature      float64
success             float64
dtype: object

In [6]:
df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'success'],
      dtype='object')

# Logistic Regression

### Using StandardScaler and OneHotEncoder (dropping the first column to avoid collinearity in logistic reg.)

In [44]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']
numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("accuracy score: %.3f" % clf.score(X_test, y_test))
print("cross_val score: %.3f" % cross_val_score(clf,X,y,cv=5,scoring="accuracy", n_jobs=-1).mean())
print("\n", classification_report(y_test,clf.predict(X_test)))

accuracy score: 0.885
cross_val score: 0.880

               precision    recall  f1-score   support

         0.0       0.93      0.82      0.88      1536
         1.0       0.85      0.95      0.89      1607

    accuracy                           0.89      3143
   macro avg       0.89      0.88      0.88      3143
weighted avg       0.89      0.89      0.88      3143



### Using StandardScaler and OneHotEncoder (all columns)

In [45]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']
numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='error'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("accuracy score: %.3f" % clf.score(X_test, y_test))
print("cross_val score: %.3f" % cross_val_score(clf,X,y,cv=5,scoring="accuracy", n_jobs=-1).mean())
print("\n", classification_report(y_test,clf.predict(X_test)))

accuracy score: 0.876
cross_val score: 0.880

               precision    recall  f1-score   support

         0.0       0.92      0.82      0.87      1551
         1.0       0.84      0.93      0.88      1592

    accuracy                           0.88      3143
   macro avg       0.88      0.87      0.88      3143
weighted avg       0.88      0.88      0.88      3143



### RobustScaler and OneHotEncoder (dropping the first column to avoid collinearity)

In [46]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']
numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("accuracy score: %.3f" % clf.score(X_test, y_test))
print("\n", classification_report(y_test,clf.predict(X_test)))

accuracy score: 0.886

               precision    recall  f1-score   support

         0.0       0.93      0.83      0.88      1545
         1.0       0.85      0.94      0.89      1598

    accuracy                           0.89      3143
   macro avg       0.89      0.89      0.89      3143
weighted avg       0.89      0.89      0.89      3143



### Using RobustScaler (which handles better the outliers) and OneHotEncoder() maintaining all columns

In [47]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']

numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("accuracy score: %.3f" % clf.score(X_test, y_test))
print("cross_val score: %.3f" % cross_val_score(clf,X,y,cv=5,scoring="accuracy", n_jobs=-1).mean())
print("\n", classification_report(y_test,clf.predict(X_test)))

accuracy score: 0.879
cross_val score: 0.880

               precision    recall  f1-score   support

         0.0       0.93      0.82      0.87      1570
         1.0       0.84      0.94      0.89      1573

    accuracy                           0.88      3143
   macro avg       0.88      0.88      0.88      3143
weighted avg       0.88      0.88      0.88      3143



### For testing purposes: using OrdinalEncoding on categorical features (instead of OneHotEncoding) and StandardScaler in the rest

In [49]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']
numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('ordinal', OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("accuracy score: %.3f" % clf.score(X_test, y_test))
print("cross_val score: %.3f" % cross_val_score(clf,X,y,cv=5,scoring="accuracy", n_jobs=-1).mean())
print("\n", classification_report(y_test,clf.predict(X_test)))

accuracy score: 0.884
cross_val score: 0.879

               precision    recall  f1-score   support

         0.0       0.93      0.83      0.88      1586
         1.0       0.84      0.94      0.89      1557

    accuracy                           0.88      3143
   macro avg       0.89      0.88      0.88      3143
weighted avg       0.89      0.88      0.88      3143



# K-nearest neighbors Classifier

### RobustScaler and OneHotEncoder (dropping the first column to avoid collinearity)

In [54]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']
numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline
# Now we have a full prediction pipeline

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier(n_neighbors=15, n_jobs=-1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("accuracy score: %.3f" % clf.score(X_test, y_test))
print("cross_val score: %.3f" % cross_val_score(clf,X,y,cv=5,scoring="accuracy", n_jobs=-1).mean())
print("\n", classification_report(y_test,clf.predict(X_test)))

accuracy score: 0.877
cross_val score: 0.878

               precision    recall  f1-score   support

         0.0       0.94      0.81      0.87      1623
         1.0       0.82      0.95      0.88      1520

    accuracy                           0.88      3143
   macro avg       0.88      0.88      0.88      3143
weighted avg       0.89      0.88      0.88      3143



### Using pipelines with GridSearchCV

Reference: https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

In [57]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

X = df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'success'], axis=1)
y = df['success']
numeric_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())])

categorical_features = ['key', 'mode', 'time_signature']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


param_grid = {'classifier__n_neighbors': np.arange(4,100)}


CV = GridSearchCV(clf, param_grid, n_jobs=-1)

CV.fit(X_train, y_train)
print(CV.best_params_)    
print(CV.best_score_)

print("accuracy score: %.3f" % CV.score(X_test, y_test))
print("cross_val score: %.3f" % cross_val_score(CV,X,y,cv=5,scoring="accuracy", n_jobs=-1).mean())
print("\n", classification_report(y_test,CV.predict(X_test)))

{'classifier__n_neighbors': 16}
0.8788482470333132
accuracy score: 0.883
cross_val score: 0.879

               precision    recall  f1-score   support

         0.0       0.95      0.81      0.88      1597
         1.0       0.83      0.95      0.89      1546

    accuracy                           0.88      3143
   macro avg       0.89      0.88      0.88      3143
weighted avg       0.89      0.88      0.88      3143

