In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import janitor as jn

In [5]:
import yellowbrick.features as ybf
import yellowbrick.regressor as ybr
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.linear_model import *
from sklearn.datasets import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.decomposition import *
from sklearn.cluster import *
from sklearn.base import *
from sklearn.pipeline import Pipeline

In [3]:
import warnings
warnings.filterwarnings("ignore")

## Classfication Pipeline

In [15]:
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "Name",
            "Ticket",
            "Cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

class TitanicTransformer(BaseEstimator, TransformerMixin):
    
    def transform(self, X):
        X = tweak_titanic(X)
        X = X.drop(columns="Survived")
        return X
    
    def fit(self, X, y):
        return self

pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", IterativeImputer()),
        ("std", StandardScaler()),
        ("rf", RandomForestClassifier()),
    ]
)

In [13]:
df = pd.read_csv('../data/titanic_train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df,
    df.Survived,
    test_size=0.3,
    random_state=42,
)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.8134328358208955

In [19]:
params = {
    "rf__max_features": [0.4, "auto"],
    "rf__n_estimators": [15, 200],
}
grid = GridSearchCV(pipe, cv=3, param_grid=params)
grid.fit(df, df.Survived)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('titan', TitanicTransformer()),
                                       ('impute', IterativeImputer()),
                                       ('std', StandardScaler()),
                                       ('rf', RandomForestClassifier())]),
             param_grid={'rf__max_features': [0.4, 'auto'],
                         'rf__n_estimators': [15, 200]})

In [20]:
grid.best_params_
pipe.set_params(**grid.best_params_)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.8134328358208955

In [21]:
roc_auc_score(y_test2, pipe.predict(X_test2))

0.7932518505766913

## Regression Pipeline

In [6]:
from sklearn.datasets import load_boston

b = load_boston()
bos_X = pd.DataFrame(b.data, columns=b.feature_names)
bos_y = b.target
bos_X_train, bos_X_test, bos_y_train, bos_y_test = train_test_split(
    bos_X,
    bos_y,
    test_size=0.3,
    random_state=42,
)
bos_sX = StandardScaler().fit_transform(bos_X)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = train_test_split(
    bos_sX,
    bos_y,
    test_size=0.3,
    random_state=42,
)

In [7]:
reg_pipe = Pipeline(
    [
        ("std", StandardScaler()),
        ("lr", LinearRegression()),
    ]
)
reg_pipe.fit(bos_X_train, bos_y_train)
reg_pipe.score(bos_X_test, bos_y_test)

0.7112260057484934

In [8]:
reg_pipe.named_steps["lr"].intercept_
reg_pipe.named_steps["lr"].coef_

array([-1.10834602,  0.80843998,  0.34313466,  0.81386426, -1.79804295,
        2.913858  , -0.29893918, -2.94251148,  2.09419303, -1.44706731,
       -2.05232232,  1.02375187, -3.88579002])

In [9]:
mean_squared_error(bos_y_test, reg_pipe.predict(bos_X_test))

21.517444231177194

## PCA Pipeline

In [26]:
from sklearn.decomposition import PCA

pca_pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", IterativeImputer()),
        ("std", StandardScaler()),
        ("pca", PCA()),
    ]
)

X_pca = pca_pipe.fit_transform(df, df.Survived)

In [27]:
pca_pipe.named_steps["pca"].explained_variance_ratio_
pca_pipe.named_steps["pca"].components_[0]

array([-0.07962469,  0.20777443, -0.5163684 ,  0.56809914,  0.52205988,
        0.09708877, -0.27408641, -0.01339046,  0.06197844])