Base de dados do seguinte desafio:

https://www.kaggle.com/competitions/spaceship-titanic/overview

In [17]:
import pandas as pd
import time

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [18]:
df = pd.read_csv("train.csv")

X = df.drop(['Transported', 'PassengerId'], axis=1)
y = df['Transported']

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X,y, train_size=0.8, test_size=0.2, random_state=0)

categorical_cols = [colname for colname in X_train.columns if X_train[colname].nunique() < 10 and X_train[colname].dtype == "object"]

numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_t = X_train[my_cols].copy()
X_v = X_val[my_cols].copy()

numerical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('scaling', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),
                                               ('cat', categorical_transformer, categorical_cols)])

In [21]:
classifiers_list = [
    KNeighborsClassifier(4),
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    xgb.XGBClassifier(eval_metric = 'logloss')
]

In [22]:
for classifier in classifiers_list:
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
    a = time.time()
    pipe.fit(X_t, y_train)
    y_v_pred = pipe.predict(X_v)
    b = time.time()
    print(classifier)
    cm = metrics.confusion_matrix(y_val, y_v_pred)
    print("Confusion Matrix: \n", cm)
    print("Accuracy:",metrics.accuracy_score(y_val, y_v_pred))
    print("Sensitivity:", cm[0][0]/(cm[0][0] + cm[0][1]))
    print("Specificity:", cm[1][1]/(cm[1][1] + cm[1][0]))
    print("Proc. Time: {} ".format(b - a))
    print("\n ###################---###################")


KNeighborsClassifier(n_neighbors=4)
Confusion Matrix: 
 [[698 165]
 [264 612]]
Accuracy: 0.753306497987349
Sensitivity: 0.8088064889918888
Specificity: 0.6986301369863014
Proc. Time: 0.26405858993530273 

 ###################---###################
SVC()
Confusion Matrix: 
 [[668 195]
 [197 679]]
Accuracy: 0.7745830937320299
Sensitivity: 0.7740440324449595
Specificity: 0.7751141552511416
Proc. Time: 1.9324383735656738 

 ###################---###################
LogisticRegression()
Confusion Matrix: 
 [[657 206]
 [182 694]]
Accuracy: 0.7768832662449684
Sensitivity: 0.761297798377752
Specificity: 0.7922374429223744
Proc. Time: 0.031007051467895508 

 ###################---###################
DecisionTreeClassifier()
Confusion Matrix: 
 [[557 306]
 [182 694]]
Accuracy: 0.7193789534215066
Sensitivity: 0.645422943221321
Specificity: 0.7922374429223744
Proc. Time: 0.04001784324645996 

 ###################---###################
RandomForestClassifier()
Confusion Matrix: 
 [[652 211]
 [179 6