# Building pipelines for ML

Proceso para construir un Pipeline en el que se aplican transformaciones y, por último, se entrena un clasificador/regresor.

In [1]:
# Reproducibility: seed the stdlib and the NumPy global random generators
# with the same value before anything stochastic runs.
from random import seed as seed1
from numpy.random import seed as seed2
random_seed = 123
for _seed_fn in (seed1, seed2):
    _seed_fn(random_seed)

Cargar datos

In [2]:
def list_datasets():
    """Print a summary of every dataset that seaborn can load.

    For each name reported by ``seaborn.get_dataset_names`` the dataset is
    loaded and a header line (the name followed by 50 asterisks), the
    frame's ``info()`` summary and a blank separator are written to stdout.

    Returns
    -------
    None
    """
    from seaborn import get_dataset_names, load_dataset
    for name in get_dataset_names():
        df = load_dataset(name=name)
        print(name, "*"*50)
        # info() writes its summary to stdout itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        df.info()
        print("\n\n")

# list_datasets()

In [3]:
def load_dataset(name:str):
    """Load a seaborn dataset by name, print a quick overview and return it.

    The overview consists of the frame's shape followed by the transposed
    ``describe()`` table rendered as plain text.

    Parameters
    ----------
    name : str
        Dataset name, passed to ``seaborn.load_dataset``.

    Returns
    -------
    The object returned by ``seaborn.load_dataset`` for ``name``.
    """
    # Imported under an alias so the seaborn loader is not confused with
    # this wrapper, which shares its name.
    import seaborn as sns

    frame = sns.load_dataset(name=name)
    print(f"{frame.shape}")
    summary = frame.describe().T
    print(summary.to_string())
    return frame

In [4]:
# Load the Titanic dataset (this also prints its shape and summary stats),
# then preview the first rows.
df = load_dataset("titanic")
df.head()

(891, 15)
          count       mean        std   min      25%      50%   75%       max
survived  891.0   0.383838   0.486592  0.00   0.0000   0.0000   1.0    1.0000
pclass    891.0   2.308642   0.836071  1.00   2.0000   3.0000   3.0    3.0000
age       714.0  29.699118  14.526497  0.42  20.1250  28.0000  38.0   80.0000
sibsp     891.0   0.523008   1.102743  0.00   0.0000   0.0000   1.0    8.0000
parch     891.0   0.381594   0.806057  0.00   0.0000   0.0000   0.0    6.0000
fare      891.0  32.204208  49.693429  0.00   7.9104  14.4542  31.0  512.3292


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


Definiendo variables de target

In [5]:
# `target` is the numeric outcome column; `targetLabel` is its text
# counterpart. Both are excluded from the feature set later on.
target = "survived"
targetLabel = "alive"
print(f"target columns:\n\t{', '.join([target, targetLabel])}")
# Cross-tabulate the two columns (NaN groups kept) to check they agree.
df.groupby([target, targetLabel], dropna=False).size()

target columns:
	survived, alive


survived  alive
0         no       549
1         yes      342
dtype: int64

Definiendo features

In [6]:
# Categorical features: every non-target column with an object/category/string dtype.
_cols_cat = (
    df.drop(columns=[target, targetLabel])
      .select_dtypes(include=["object", "category", "string"])
      .columns
      .tolist()
)
# Everything else that is not a target column is handled as numeric.
_cols_num = [
    col for col in df.columns
    if col not in (target, targetLabel) and col not in _cols_cat
]
print(f"categorical columns: {len(_cols_cat)}\n\t{', '.join(_cols_cat)}")
print(f"numeric columns: {len(_cols_num)}\n\t{', '.join(_cols_num)}")
# Keep the features in the frame's original column order.
features = [col for col in df.columns if col in _cols_cat or col in _cols_num]
print(f"\n{features=}")

categorical columns: 6
	sex, embarked, class, who, deck, embark_town
numeric columns: 7
	pclass, age, sibsp, parch, fare, adult_male, alone

features=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alone']


Split into Train and Test

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Separate the feature matrix from the target and hold out 20% of the rows
# for testing.
X, y = df[features], df[target]
# random_state pins the shuffle to the notebook's seed, so re-running this
# cell alone yields the same split instead of depending on how much of the
# global NumPy RNG state has already been consumed.
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
print(f"Train:\t{x_train.shape=}\t{y_train.shape=}\nTest:\t{x_test.shape=}\t{y_test.shape=}")

Train:	x_train.shape=(712, 13)	y_train.shape=(712,)
Test:	x_test.shape=(179, 13)	y_test.shape=(179,)


Pipeline

In [39]:
# Set scikit-learn's display mode to "diagram" so that pipelines are
# visualized when they are shown in the notebook.
from sklearn import set_config
set_config(display="diagram")

from sklearn.pipeline import Pipeline

In [40]:
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

## classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [41]:
# Numeric branch: replace NaN with the column mean, then rescale to [0, 1].
process_num = Pipeline(steps=[
                        ("impute_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
                        # feature_range is documented as a tuple; a list is
                        # rejected by scikit-learn's parameter validation in
                        # recent releases, so pass (0, 1) explicitly.
                        ("scale_minmax", MinMaxScaler(feature_range=(0, 1)))
                    ]
                )

# Categorical branch: fill gaps with the literal "missing", then one-hot
# encode; categories unseen during fit are ignored at transform time.
process_cat = Pipeline(steps=[
                        ("impute_constant", SimpleImputer(fill_value="missing", strategy="constant")),
                        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
                    ]
                )

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
                    [
                        ("categorical", process_cat, _cols_cat)
                        , ("numerical", process_num, _cols_num)
                    ]
            )

# pipe = Pipeline(
#     [
#         ("preprocessor", preprocessor)
#         , ("clf", LogisticRegression())
#     ]
# )


In [42]:
from sklearn.base import clone

# Candidate classifiers keyed by a short display name. The random forest is
# seeded explicitly so its result does not depend on global RNG state.
clfs_ = {"LR": LogisticRegression(), "KNN": KNeighborsClassifier(), "RF": RandomForestClassifier(random_state=random_seed)}

# One full pipeline (preprocessing + classifier) per candidate.
pipes = dict()
for name,clf in clfs_.items():
    pipes[name] = Pipeline(
                        [
                            # Each pipeline gets its own copy of the
                            # preprocessor: sharing one instance would let
                            # fitting any pipeline overwrite the fitted
                            # transformer state used by the others.
                            ("preprocessor", clone(preprocessor))
                            , ("clf", clf)
                        ]
                    )


Fit with x_train, y_train

In [44]:
def get_report(y_true, y_pred):
    """Print and collect evaluation metrics for a set of predictions.

    Prints scikit-learn's ``classification_report`` followed by a separator
    line of 50 asterisks, then computes, prints and stores each scalar metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    dict
        Maps "mean_squared_error", "r2_score", "accuracy_score" and
        "roc_auc_score" (in that order) to the value returned by the
        scikit-learn function of the same name. The classification report
        is only printed, not stored.
    """
    from sklearn.metrics import classification_report, mean_squared_error, r2_score, accuracy_score, roc_auc_score

    print("classification_report", ":\n", classification_report(y_true, y_pred))
    print("*"*50)

    # Dispatch through the function objects themselves rather than eval() on
    # strings built at runtime: same output, no dynamic code execution.
    # NOTE(review): roc_auc_score receives y_pred exactly as passed in; if the
    # caller supplies hard class labels rather than scores, confirm that this
    # is the intended AUC.
    scorers = {
        "mean_squared_error": mean_squared_error,
        "r2_score": r2_score,
        "accuracy_score": accuracy_score,
        "roc_auc_score": roc_auc_score,
    }
    metrics = dict()
    for op, scorer in scorers.items():
        metrics[op] = scorer(y_true, y_pred)
        print(f"{op}: {metrics[op]}")
    return metrics


In [50]:
# Fit every candidate pipeline on the training split and evaluate it on the
# held-out test split. Metrics are stored per model in `all_metrics` so the
# classifiers can be compared after the loop; `metrics` alone would only
# retain the values of the last model.
all_metrics = dict()
for name,pipe in pipes.items():
    print(f"{name}", "*"*50)
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    metrics = get_report(y_true=y_test, y_pred=y_pred)
    all_metrics[name] = metrics
    print("\n")

LR **************************************************
classification_report :
               precision    recall  f1-score   support

           0       0.88      0.88      0.88       114
           1       0.79      0.80      0.79        65

    accuracy                           0.85       179
   macro avg       0.84      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179

**************************************************
mean_squared_error: 0.15083798882681565
r2_score: 0.3477732793522268
accuracy_score: 0.8491620111731844
roc_auc_score: 0.8385964912280703


KNN **************************************************
classification_report :
               precision    recall  f1-score   support

           0       0.85      0.84      0.85       114
           1       0.73      0.74      0.73        65

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.81      0.80      0.80       1