**Atualizando a biblioteca para plotagem de gráficos**


In [None]:
%pip -q install plotly --upgrade

**Importando bibliotecas**

In [234]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

**Abrindo um arquivo CSV do drive**

In [235]:
# Load the raw dataset; a comma is already read_csv's default separator.
base = pd.read_csv('train.csv')

**Você também pode carregar seu arquivo já selecionando apenas as colunas desejadas: investigue o parâmetro `usecols` da função `pd.read_csv`.**

In [236]:
# Render the loaded DataFrame; the notebook display elides the middle rows.
base

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


**Contando quantidade de instâncias**


In [None]:
# Name of the target column: the second column of the raw file.
Classificação = base.columns[1]

# Distinct target values together with how many rows carry each one.
valores_alvo = base[Classificação]
np.unique(valores_alvo, return_counts=True)

In [None]:
# Bar chart of the row count per target value.
sns.countplot(data=base, x=Classificação);

Separando os atributos relevantes (removendo as colunas que não serão usadas)


In [None]:
# Columns left out of the working frame.
colunas_descartadas = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked']

# Work on a reduced copy; the raw `base` frame stays untouched.
base_encoded = base.drop(columns=colunas_descartadas)
base_encoded.head()

**Tratamento de dados categóricos**

> *LabelEncoder - Vamos tratar os dados categóricos substituindo cada categoria por um número inteiro (0, 1, 2, etc.)*



In [240]:
from sklearn.preprocessing import LabelEncoder

In [241]:
# To label-encode every attribute in a single pass instead, one could use:
#   base_encoded = base.apply(LabelEncoder().fit_transform)
encoder = LabelEncoder()

# Replace the 'Sex' strings with the integer codes produced by the encoder.
base_encoded = base_encoded.assign(Sex=encoder.fit_transform(base_encoded['Sex']))



Definindo faixas de valores para idade

In [None]:
# Impute missing ages with the median age.
# The filled Series is assigned back to the column. Calling
# fillna(inplace=True) on the `base_encoded['Age']` selection is chained
# assignment: it raises a FutureWarning in pandas 2.x and, under Copy-on-Write,
# updates a temporary object and leaves the NaNs in the frame.
# NOTE(review): the median is computed over the whole dataset, before the
# train/test split further down, so test rows influence the imputed value.
base_encoded['Age'] = base_encoded['Age'].fillna(base_encoded['Age'].median())

# Age bands are left-closed / right-open because of right=False:
#   [0, 12) -> 0, [12, 18) -> 1, [18, 64) -> 2, [64, inf) -> 3
faixas_etarias = [0, 12, 18, 64, float('inf')]
rotulos = [0, 1, 2, 3]

# Create the banded column, then discard the raw age it was derived from.
base_encoded['FaixaEtaria'] = pd.cut(base_encoded['Age'], bins=faixas_etarias, labels=rotulos, right=False)
base_encoded = base_encoded.drop(columns=['Age'])

# Show the first rows with the new column.
base_encoded.head()

In [None]:
# Inspect the working frame at this point of the preprocessing.
base_encoded

Somando as colunas SibSp e Parch (mais 1, referente ao próprio passageiro) para obter o tamanho da família e, em seguida, agrupando esse valor em faixas

In [None]:
# Family-size bands are left-closed / right-open because of right=False:
#   [0, 4) -> 0, [4, 8) -> 1, [8, 12) -> 2, [12, inf) -> 3
qtd_familiares = [0, 4, 8, 12, float('inf')]
rotulos = [0, 1, 2, 3]

# Family size = siblings/spouses + parents/children + 1.
tamanho_familia = base_encoded['SibSp'] + base_encoded['Parch'] + 1

# Swap the two source columns for a single banded FamilySize column.
base_encoded = base_encoded.drop(columns=['SibSp', 'Parch']).assign(
    FamilySize=pd.cut(tamanho_familia, bins=qtd_familiares, labels=rotulos, right=False)
)
base_encoded

**Separar o dataset em variáveis independentes (X_prev) e dependentes (y_classe)**

In [258]:
# Target (dependent variable): selected by name rather than by position, so it
# stays correct if the column order changes and matches the by-name drop below.
y_classe = base_encoded['Survived']

# Features (independent variables): every remaining column.
X_prev = base_encoded.drop(columns=['Survived'])

In [259]:
from sklearn.model_selection import train_test_split

In [260]:
# Feature matrix: the columns of base_encoded other than 'Survived'.
X_prev

Unnamed: 0,Pclass,Sex,FaixaEtaria,FamilySize
0,3,1,2,0
1,1,0,2,0
2,3,0,2,0
3,1,0,2,0
4,3,1,2,0
...,...,...,...,...
886,2,1,2,0
887,1,0,2,0
888,3,0,2,1
889,1,1,2,0


In [261]:
# Target vector (the 'Survived' column).
y_classe

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [262]:
# Length of the target vector.
y_classe.shape

(891,)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
particoes = train_test_split(
    X_prev,
    y_classe,
    test_size=0.20,
    random_state=42,
)
X_treino, X_teste, y_treino, y_teste = particoes

In [264]:
# (rows, columns) of the training features.
X_treino.shape

(712, 4)

In [265]:
# (rows, columns) of the test features.
X_teste.shape

(179, 4)

In [266]:
# Test features; rows keep their original index labels after the shuffled split.
X_teste

Unnamed: 0,Pclass,Sex,FaixaEtaria,FamilySize
709,3,1,2,0
439,2,1,2,0
840,3,1,2,0
720,2,0,0,0
39,3,0,1,0
...,...,...,...,...
433,3,1,1,0
773,3,1,2,0
25,3,0,2,1
84,2,0,1,0


In [267]:
# Training features.
X_treino

Unnamed: 0,Pclass,Sex,FaixaEtaria,FamilySize
331,1,1,2,0
733,2,1,2,0
382,3,1,2,0
704,3,1,2,0
813,3,0,0,1
...,...,...,...,...
106,3,0,2,0
270,1,1,2,0
860,3,1,2,0
435,1,0,1,1


In [268]:
# Test labels; their index labels match those of X_teste.
y_teste

Unnamed: 0,Survived
709,1
439,0
840,0
720,1
39,1
...,...
433,0
773,0
25,1
84,1


Transformando os dados tratados em arquivo pkl

In [269]:
import pickle

In [270]:
# Persist the four partitions as one list, in this order:
# X_treino, X_teste, y_treino, y_teste.
conjuntos = [X_treino, X_teste, y_treino, y_teste]
with open('/content/sample_data/Titanic_t5.pkl', 'wb') as arquivo_pkl:
    pickle.dump(conjuntos, arquivo_pkl)