# Various Families

In [1]:
!pip install yellowbrick --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [3]:
df = pd.read_csv('titanic.csv')
df.sample(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
801,3,0,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q,,,
467,2,1,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24.0,1,0,244367,26.0,,S,12.0,,"Moscow / Bronx, NY"
1217,3,0,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S,,,
1269,3,0,"Vande Velde, Mr. Johannes Joseph",male,33.0,0,0,345780,9.5,,S,,,
294,1,0,"Thayer, Mr. John Borland",male,49.0,1,1,17421,110.8833,C68,C,,,"Haverford, PA"


In [4]:
from sklearn import (
    metrics,
    model_selection,
)
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, OrdinalEncoder
)
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

# diferentiate features matrix (X) and label target (y)
df = df.drop(columns=['name', 'ticket', 'home.dest', 'boat', 'body'])
X, y = df.drop(columns=['survived']), df['survived']

def tweaking(df, norm=True, hot=True, model=DummyClassifier(), cross=True):
  
  # spliting samples on train and test samples
  X_train, X_test, y_train, y_test = model_selection.train_test_split(
      X,
      y,
      test_size=.3,
      random_state= 42
  )
  # separate numerical and categorical features
  numerical = selector(dtype_include=np.number)(X)
  categorical = selector(dtype_include=object)(X)

  # Preprocessing features

  # numerical features: imputing and scaling
  imputer = impute.IterativeImputer()
  scaler = StandardScaler()

  if norm:
    num_preprocessor = Pipeline([
        ('imputer', imputer),
        ('scaler', scaler)
    ])
  else:
    num_preprocessor = Pipeline([
        ('imputer', imputer),
    ])

  # categorical features
  one_hot_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse=False)
  ordinal_preprocessor = OrdinalEncoder(
      handle_unknown='use_encoded_value', 
      unknown_value=-1
  )

  if cross:
  
    if hot:
      preprocessor = ColumnTransformer([
          ('num', num_preprocessor, numerical),
          ('cat_onehot', one_hot_preprocessor, categorical)
      ])
    
    else:
      preprocessor = ColumnTransformer([
        ('num', num_preprocessor, numerical),
        ('cat_ordinal', ordinal_preprocessor, categorical)
    ])
    
    estimator = make_pipeline(preprocessor, model)
    return estimator
  
  else:
    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(X_test, drop_first=True)

    num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']

    X_train.loc[:, num_cols] = imputer.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = imputer.transform(X_test[num_cols])

    if norm:
      X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
      X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])

    return X_train, y_train, X_test, y_test

In [5]:
X_train, y_train, X_test, y_test = tweaking(df, cross=False)

bm = DummyClassifier()
bm.fit(X_train, y_train)
y_pred = bm.predict(X_test)
acc = bm.score(X_test, y_test)

In [6]:
acc.round(3)

0.57

In [7]:
metrics.precision_score(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [8]:
metrics.recall_score(y_test, y_pred)

0.0

# Various Family

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
    RandomForestClassifier,
    HistGradientBoostingClassifier,
)
import xgboost

models = [
    LinearRegression, 
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    RandomForestClassifier,
    HistGradientBoostingClassifier,
    xgboost.XGBClassifier,
]

names = []
results = []
for cls in models:
  estimator = tweaking(df, model=cls(), cross=True)

  for idx, itm in enumerate(estimator.get_params().keys()):
    if idx == 4:
      names.append(itm)

  kfold = model_selection.KFold(
      n_splits=10,
      shuffle=True,
      random_state=42,
  )

  cv_results = model_selection.cross_validate(
      estimator,
      X,
      y,
      cv=kfold,
      scoring='roc_auc',
      n_jobs=-1,
  )
  results.append((cv_results['test_score'].mean(), cv_results['test_score'].std()))
# cv_results = pd.DataFrame(cv_results)
for n, res in zip(names, results):
  print(f"{n:22} AUC: "
          f"{res[0].mean():.3f} STD: {res[1].std():.2f}"
      )


linearregression       AUC: 0.772 STD: 0.00
decisiontreeclassifier AUC: 0.768 STD: 0.00
kneighborsclassifier   AUC: 0.834 STD: 0.00
gaussiannb             AUC: 0.775 STD: 0.00
randomforestclassifier AUC: 0.854 STD: 0.00
histgradientboostingclassifier AUC: 0.859 STD: 0.00
xgbclassifier          AUC: 0.863 STD: 0.00
