### Ejercicio elegir pipeline en clasficacion
https://towardsdatascience.com/simple-way-to-find-a-suitable-algorithm-for-your-data-in-scikit-learn-python-9a9710c7c0fe

In [4]:
# Set seed
seed = 8

# Data manipulation
import numpy as np
import pandas as pd
from seaborn import load_dataset

# Machine learning pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# from xgboost.sklearn import XGBClassifier

# Import data
exclude = ['pclass', 'embarked', 'who', 'adult_male', 'alive', 'alone']
df = load_dataset('titanic').drop(columns=exclude)

# Inspect shape of the data and top rows
print(f"{df.shape[0]} rows, {df.shape[1]} columns")
df.head()

891 rows, 9 columns


Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,deck,embark_town
0,0,male,22.0,1,0,7.25,Third,,Southampton
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,,Southampton
3,1,female,35.0,1,0,53.1,First,C,Southampton
4,0,male,35.0,0,0,8.05,Third,,Southampton


In [5]:
# Set target
target = 'survived'
features = df.drop(columns=target).columns

# Split data into train & test
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=.2, random_state=seed, 
                                                    stratify=df[target])

# Inspect data
print(f"Training data ({X_train.shape[0]} rows): Target distribution")
print(y_train.value_counts(normalize=True))
print(f"\nTest data ({X_test.shape[0]} rows): Target distribution")
print(y_train.value_counts(normalize=True))

# Define feature groups
numerical = X_train.select_dtypes(['number']).columns
print(f'\nNumerical: {numerical}')
categorical = X_train.columns.difference(numerical)
X_train[categorical] = X_train[categorical].astype('object')
print(f'Categorical: {categorical}')

Training data (712 rows): Target distribution
0    0.616573
1    0.383427
Name: survived, dtype: float64

Test data (179 rows): Target distribution
0    0.616573
1    0.383427
Name: survived, dtype: float64

Numerical: Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')
Categorical: Index(['class', 'deck', 'embark_town', 'sex'], dtype='object')


In [6]:
class Imputer(BaseEstimator, TransformerMixin):
    """A custom transformer that imputes with a constant value in place.
    
    Parameters
    ----------
    value: (optional) A value to impute with
    """
    def __init__(self, value='missing'):
        self.value = value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.fillna(self.value, inplace=True)
        return X
    
class CardinalityReducer(BaseEstimator, TransformerMixin):
    """A custom transformer that encodes infrequent labels into 'other' in place.
    
    Parameters
    ----------
    threshold: (optional) An integer for minimum threshold frequency count or 
    a float for threshold of frequency proportion to keep the category. If 
    category frequency doesn't surpass the threshold, its value will be 
    overwritten with 'other'.  
    """
    def __init__(self, threshold=.01):
        self.threshold = threshold

    def fit(self, X, y=None):
        self.top_categories = {}
        for feature in X.columns:
            frequencies = pd.Series(X[feature].value_counts(normalize=True))
            if isinstance(self.threshold, int):
                top_categories = frequencies.head(self.threshold).index
            elif isinstance(self.threshold, float):   
                top_categories = frequencies[frequencies>self.threshold].index
            self.top_categories[feature] = list(top_categories)
        return self

    def transform(self, X):
        for feature in X.columns:
            X[feature] = np.where(X[feature].isin(self.top_categories[feature]), 
                                  X[feature], 'other')
        return X


In [7]:
# Build preprocessing pipeline
categorical_pipe = Pipeline([('imputer', Imputer()),
                             ('cardinality_reducer', CardinalityReducer()),
                             ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                           ('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(transformers=[('cat', categorical_pipe, categorical),
                                               ('num', numerical_pipe, numerical)])
# Fit and transform training data
preprocessor.fit(X_train)
cat = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(categorical)
columns = np.append(cat, numerical)
X_train_transformed = pd.DataFrame(preprocessor.transform(X_train), columns=columns)
X_train_transformed.head()


Unnamed: 0,class_First,class_Second,class_Third,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_missing,...,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_other,sex_female,sex_male,age,sibsp,parch,fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,0.031425
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.363052,0.0,0.0,0.013565
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,0.016461
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.363052,0.0,0.0,0.015835
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.22091,0.125,0.0,0.034743
