In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [21]:
# Location of the training CSV, relative to the notebook's working directory.
data = "data/train.csv"

# Read the file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
# Target: the survival column.
y = df["Survived"]

# Features: every column except the target and the "Name" / "PassengerId" columns.
X = df.drop(columns=["Name", "Survived", "PassengerId"])

In [39]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the shape of each of the four partitions.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 9), (179, 9), (712,), (179,))

In [40]:
# Summarize the training features: per-column non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       572 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Ticket    712 non-null    object 
 6   Fare      712 non-null    float64
 7   Cabin     159 non-null    object 
 8   Embarked  710 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 55.6+ KB


In [41]:
# Identify the numerical and categorical feature columns.
#
# select_dtypes is used instead of comparing each dtype against the exact
# names "object" / "int64" / "float64": with exact-name matching, a column of
# any other dtype (int32, float32, nullable Int64, category, a dedicated
# string dtype, ...) ends up in neither list and is then silently dropped by
# the ColumnTransformer built from these lists.
#
# Every non-numeric column is treated as categorical. Column order follows
# X_train.columns, so for int64/float64/object data the result is unchanged.
numerical_cols = X_train.select_dtypes(include="number").columns.tolist()
categorical_cols = X_train.select_dtypes(exclude="number").columns.tolist()

categorical_cols, numerical_cols

(['Sex', 'Ticket', 'Cabin', 'Embarked'],
 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])

In [42]:
# Preprocessing for the numerical columns, as a two-step pipeline:
#   1. impute missing values with the column median
#   2. standardize with StandardScaler
numerical_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(numerical_steps)
numerical_transformer

In [43]:
# Preprocessing for the categorical columns, as a two-step pipeline:
#   1. impute missing values with the most frequent value of the column
#   2. one-hot encode into a dense array, ignoring categories not seen at fit time
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transform = Pipeline(categorical_steps)
categorical_transform

In [44]:
# Route each group of columns to its own preprocessing pipeline.
column_routes = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transform, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)
preprocessor

In [45]:
# Candidate 1: preprocessing followed by a logistic regression classifier.
logreg_model = LogisticRegression(max_iter=300, random_state=42)
pipeline_logreg = Pipeline(
    [
        ("preprocessor", preprocessor),  # preprocessing
        ("model", logreg_model),  # predictive model
    ]
)
pipeline_logreg

In [46]:
# Candidate 2: the same preprocessing followed by a histogram-based gradient
# boosting classifier.
histgrad_model = HistGradientBoostingClassifier(random_state=42)
pipeline_histgrad = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", histgrad_model),
    ]
)
pipeline_histgrad

In [47]:
# Cross-validation: 5-fold accuracy of each candidate, computed on the
# training split only.
cv_options = {"cv": 5, "scoring": "accuracy"}
scores_logreg = cross_val_score(pipeline_logreg, X_train, y_train, **cv_options)
scores_histgrad = cross_val_score(pipeline_histgrad, X_train, y_train, **cv_options)

# Mean accuracy across folds: (logistic regression, hist gradient boosting).
scores_logreg.mean(), scores_histgrad.mean()

(0.8061459667093469, 0.8160543681670441)

In [48]:
# Keep the candidate with the higher mean cross-validation accuracy; a tie
# goes to the gradient boosting pipeline.
if scores_logreg.mean() > scores_histgrad.mean():
    best_pipeline = pipeline_logreg
else:
    best_pipeline = pipeline_histgrad
best_pipeline

In [49]:
# Fit the selected pipeline (preprocessing + model) on the full training split.
best_pipeline.fit(X=X_train, y=y_train)

In [50]:
# Predict labels for the held-out test rows and display them.
predictions = best_pipeline.predict(X=X_test)
predictions

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1])

In [51]:
# Accuracy of the selected pipeline on the held-out test split.
accuracy_score(y_true=y_test, y_pred=predictions)

0.8100558659217877

In [52]:
import pickle
from pathlib import Path

# Persist the fitted winning pipeline (preprocessing + model) for later reuse.
#
# The artifact is named after the pipeline that was actually selected, so the
# file name cannot say "histgrad" when the logistic regression pipeline won.
# When the gradient boosting pipeline is selected, the path is the same as
# before: models/histgrad-titanic.pickle.
model_tag = "logreg" if best_pipeline is pipeline_logreg else "histgrad"
model_path = Path("models") / f"{model_tag}-titanic.pickle"

# Opening a file for writing does not create missing parent directories, so
# make sure the output directory exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as model_file:
    pickle.dump(best_pipeline, model_file)