<a href="https://colab.research.google.com/github/gugasth/Titanic/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Gustavo Paulo
[My GitHub](https://github.com/gugasth)
Contact: gustavo.p07@aluno.ifsc.edu.br

In [None]:
#@title Importing libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
#@title Reading, opening and viewing the files
# Fetch the train and test CSV files from the project's GitHub repository.
train_data, test_data = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/gugasth/Titanic/main/Dados/train.csv",
        "https://raw.githubusercontent.com/gugasth/Titanic/main/Dados/test.csv",
    )
)

# Show the first rows of each frame, training data first.
display(train_data.head(), test_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
#@title Checking the amount of null values in each column
# Per-column count of missing entries in the training frame
# (`isna` is the primary name; `isnull` is its alias).
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
#@title Checking the type of features
# Display the dtype of every column in the training frame.
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
#@title Defining X and y (features and target)
# Feature names grouped by kind; the two groups are reused by the preprocessing step.
numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categoric_columns = ['Sex', 'Embarked']
my_columns = [*categoric_columns, *numeric_columns]

# Target column of the training data.
y = train_data['Survived']

# Independent copies of the selected feature columns for the labelled data
# and for the unlabelled test data.
X = train_data.loc[:, my_columns].copy()
final_test = test_data.loc[:, my_columns].copy()

In [None]:
#@title Preprocessor
#@markdown Here we are going to split the data into 80% for training and 20% for testing so that we can use AutoML to find the best possible model for our dataset. So when we find that model, we'll train it on 100% data.
%pip install tpot
from tpot import TPOTClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Defina o pré-processador
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_columns),
    ('cat', categorical_transformer, categoric_columns)
])

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
    config_dict='TPOT sparse',
    warm_start=True,
    scoring='accuracy'
)


X_train_preprocessed = preprocessor.fit_transform(X_train)


tpot.fit(X_train_preprocessed, y_train)


X_test_preprocessed = preprocessor.transform(X_test)
accuracy = tpot.score(X_test_preprocessed, y_test)


print("Precisão do modelo otimizado:", accuracy)


Collecting tpot
  Downloading TPOT-0.12.1-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11939 sha256=070108d4694d555c6c6348a63f04c3a73d650855b8a5c09fee465460d8a38526
  Stored in directory: /ro

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8230276765488032

Generation 2 - Current best internal CV score: 0.8230276765488032

Generation 3 - Current best internal CV score: 0.8230276765488032

Generation 4 - Current best internal CV score: 0.8328671328671329

Generation 5 - Current best internal CV score: 0.8328671328671329

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=8, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.7500000000000001, verbosity=0)
Precisão do modelo otimizado: 0.8547486033519553


Here we read the TPOT output above and recreate the best pipeline it reports as our final model.

In [None]:
#@title Define model
# Hyperparameters copied from the "Best pipeline" line printed by the TPOT
# search in this notebook, so the final model is the one the search selected.
# If the search is re-run and reports a different pipeline, update these values.
titanic_model = XGBClassifier(
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=4,
    n_estimators=100,
    n_jobs=1,
    subsample=0.7500000000000001,
    verbosity=0,
)

In [None]:
#@title Model fit
# Chain the preprocessing step and the classifier into a single estimator so
# that both are fitted and applied together.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', titanic_model),
])


In [None]:
#@title Training the model
# Fit the full pipeline (preprocessor + classifier) on all labelled rows.
model.fit(X, y)

In [None]:
#@title Making the final predict
# Predict with the fitted pipeline on the selected feature columns of the test data.
predictions = model.predict(final_test)

In [None]:
#@title Saving the test predictions to file
# Pair each test passenger id with its predicted label, then write the
# two-column table to disk without the index.
output = test_data[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)