# Esse notebook é uma continuação do Competição Kaggle - Titanic - Data Understanding

## Dentro desse segundo caderno, vamos começar a desenvolver o modelo de machine learning e criar o dataframe para submissão do desafio

In [1]:
#Criação e manipulação dos dataframes
import pandas as pd 

#Operações matemáticas
import numpy as np 

#Técnica de dimensionalidade 
from sklearn.decomposition import PCA

#Biblioteca de normalização
from sklearn.preprocessing import StandardScaler, RobustScaler

#Biblioteca de clusterização
from sklearn.cluster import KMeans, MeanShift

#Separar os dataframes em treino e teste
from sklearn.model_selection import train_test_split

## Importando os conjuntos de dados

In [2]:
df_train = pd.read_csv('Data/train.csv')
df_test = pd.read_csv('Data/test.csv')

#Criando uma lista com os dataframes para simplificar fazer as mesmas ações em ambos 
datasets = [df_train, df_test]

In [11]:
display(df_train)
display(df_test)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,27.0,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,27.0,0,0,359309,8.0500,,S


## Processando os dados

In [4]:
for df in datasets:
    #Criando a nova coluna somando os irmãos/conjugues e pais/filhos
    df['Tamanho_Familia'] = df['SibSp'] + df['Parch']
    
    #Criando uma nova coluna para verificar quem estava sozinho
    df['Sozinho'] = df['Tamanho_Familia'].apply(lambda x: 1 if x == 0 else 0)
    
    #Completando os valores de embarque com a moda da coluna
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    
    #Completando os valores de idade e preço com a mediana da coluna
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    
    #Transformando as colunas categóricas em numéricas
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
    
    #Removendo as colunas que não serão necessárias
    #A coluna Cabine será removida por conter muitos valores nulos
    df.drop(columns=['PassengerId', 'Cabin', 'Fare', 'Name', 'Ticket'], inplace=True)

In [10]:
df_train = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

KeyError: "None of [Index(['Sex', 'Embarked'], dtype='object')] are in the [columns]"

In [9]:
df_train.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
292,293,0,2,"Levy, Mr. Rene Jacques",male,36.0,0,0,SC/Paris 2163,12.875,D,C
312,313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S
525,526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q
589,590,0,3,"Murdlin, Mr. Joseph",male,28.0,0,0,A./5. 3235,8.05,,S
411,412,0,3,"Hart, Mr. Henry",male,28.0,0,0,394140,6.8583,,Q
691,692,1,3,"Karun, Miss. Manca",female,4.0,0,1,349256,13.4167,,C
586,587,0,2,"Jarvis, Mr. John Denzil",male,47.0,0,0,237565,15.0,,S
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0,,S
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S


## Preparando os dados para a criação do modelo de Machine Learning

In [None]:
#Cópia do dataframe para realizar o PCA 
df_auxiliar = df_train.copy()
df_auxiliar2 = df_train.copy()

model = PCA(n_components=2)
X = model.fit_transform(df_auxiliar)
X_df = pd.DataFrame(X)
X_df.columns = ['X', 'Y']

sns.scatterplot(data=X_df, x='X', y='Y')

In [None]:
#Método de clusterização KMeans utilizando o Elbow Method 
scaler = StandardScaler() 
scaled_df = scaler.fit_transform(df_auxiliar)

alvo = df_auxiliar 

max = 10
km_scores = []
km_silhouette = []
vmeasure_score = []
db_score = []
gm_bic = []
gm_score = []

for i in range(2,max):
    km = KMeans(n_clusters=i, random_state=0).fit(alvo)
    preds = km.predict(alvo)
    
    km_scores.append(-km.score(alvo))

In [None]:
plt.figure(figsize=(8,6))
plt,title('Elbow Method', fontsize=16)
plt.scatter(x=[i for i in range(2,max)], y=km_scores, edgecolor='k')
plt.grid(True)=
plt.xlabel('Número de clusteres', fontsize=14)
plt.ylabel('K-Means score', fontsize=14)
plt.xticks([i for i in range(2,max)], fontsize=14)
plt.yticks(fontsize=16)
plt.show()

In [None]:
#Teste de modelagem com o número de K = 5
clt_kmean = KMeans(n_clusters=5)
clt_kmean.fit(df_auxiliar)
labels = clt_kmean.predict(df_auxiliar)
df_auxiliar['Cluster'] = labels
display(df_auxiliar)

In [None]:
#Plotagem do gráfico para ver a diferença visual entre o número de clusters 
plt.figure(figsize=(15,15))

#Aplicar o KMeans no dataframe 
for i in range(2,6):
    kmean = KMeans(n_clusters=i)
    kmean_list = kmean.fit_predict(df_auxiliar)
    X_df['KMean_' + str(i)] = kmean_list
    plt.subplot(3,3,i-1)
    sns.scatterplot(data=X_df, x='X', y='Y', hue='KMean_'+str(i))

In [None]:
#Método de clusterização MeanShift
clt_ms = MeanShift()
cls_ms.fit(df_auxiliar2)
labels_ms = clt_ms.predict(df_auxiliar2)
df_auxiliar2['Cluster'] = labels_ms

In [None]:
#Plotagem do gráfico do MeanShift com 5 clusters
analyzer = MeanShift(n_jobs=-1)
analyzer.fit(df_auxiliar2)
labels_ms = analyzer.labels_
print(np.unique(labels_ms))

X_df['MShift'] = labels_ms
display(X_df.head())

sns.scatterplot(data=X_df, x='X', y='Y', hue='MShift')