<a href="https://colab.research.google.com/github/crisouzajr/Data-Exploatory-Analysis-HEART-DISEASE/blob/main/PROJETO_DATA_MINING_HORSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Developing Machine Learning Models for DM Project.

## A base de dados consiste na relação de algumas características fisiológicas apresentadas por cavalos, que irão indicar a propensão de que os animais venham a óbito ou não. 

### As imagens abaixo explicam as siglas de cada um dos atributos da base de dados para melhor compreensão do estudo como um todo.

# <font color = 'red'> PRE PROCESSAMENTO DA BASE DE DADOS

### Carregando a base de dados para realizar as primeiras análises exploratórias.

In [1]:
"""Importando as bibliotecas iniciais para execução de uma análise exploratória dos dados"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
"""Visualizando o dataset"""

# Load the horse dataset from CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='horse.csv')
df.head(n=5)

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,,decreased,distend_large,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,,normal,normal,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,,,,,,,,74.0,7.4,,,died,no,4300,0,0,no


In [None]:
# Dimensions of the raw dataset as (rows, columns).
df.shape

In [None]:
# Names of all attributes in the raw dataset.
df.columns

In [None]:
# Summary statistics of the numeric attributes.
df.describe()

<img src = 'cavalo_dourado.jpg'> 

### Pré processamento e tratamento dos dados.

In [None]:
"""Código utilizando para verificar com maior precisão de detalhes, quais são os dados nulos no dataset"""

import missingno as msno

# Matrix plot showing where values are missing in the raw dataset.
msno.matrix(df, figsize=(25,5));

In [None]:
"""Após verificar a imagem acima, é possível determinar que os atributos: 
1) nasogastric_reflux_ph; 
2) abdomo_appearance;
3) abdomo_protein

Podem ser removidos do dataset, pois possuem mais de 50% de dados faltantes. E para essa tarefa o código abaixo
irá excluir esses atributos."""

"""Após codificação, outras técnicas de tratamento de missing serão realizadas a fim adequar a base de dados para construção
dos modelos preditivos."""

# Work on a copy of the raw frame without the three attributes listed above.
data = df.drop(columns=['nasogastric_reflux_ph', 'abdomo_appearance', 'abdomo_protein'])

In [None]:
# Attributes that remain after dropping the sparse columns.
data.columns

In [None]:
# Missing-value matrix of the reduced dataset.
msno.matrix(data, figsize=(25,5));

In [None]:
"""Sequência de códigos usados para verificar quais são as colunas com dados categóricos e quais possuem dados numéricos."""

# Column dtypes and non-null counts, to tell object (categorical) columns from numeric ones.
data.info()

In [None]:
"""Código usado para criar uma cópia do dataframe original apenas com os dados categóricos do dataframe original."""

# Independent copy holding only the object-dtype (categorical) columns of `data`.
cat_data = data.select_dtypes(include='object').copy()
cat_data.head(n=5)

## Codificando a coluna (surgery)

In [None]:
# Frequency of each 'surgery' value (NaN is excluded by default).
print(data['surgery'].value_counts())

In [None]:
# Assign each 'surgery' category an integer code 1..n, in category order.
labels = data['surgery'].astype('category').cat.categories.tolist()
replace_map_comp = {'surgery': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)

data.head(n=5)

## Codificando a coluna (age)

In [None]:
# Frequency of each 'age' value (NaN is excluded by default).
print(data['age'].value_counts())

In [None]:
# Assign each 'age' category an integer code 1..n, in category order.
labels = data['age'].astype('category').cat.categories.tolist()
replace_map_comp = {'age': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)

data.head(n=5)

## Codificando a coluna (temp_of_extremities)

In [None]:
# Frequency of each 'temp_of_extremities' value (NaN is excluded by default).
print(data['temp_of_extremities'].value_counts())

In [None]:
# Assign each 'temp_of_extremities' category an integer code 1..n, in category order.
labels = data['temp_of_extremities'].astype('category').cat.categories.tolist()
replace_map_comp = {'temp_of_extremities': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)
data.head(n=5)

## Codificando a coluna (peripheral_pulse)

In [None]:
# Frequency of each 'peripheral_pulse' value (NaN is excluded by default).
print(data['peripheral_pulse'].value_counts())

In [None]:
# Assign each 'peripheral_pulse' category an integer code 1..n, in category order.
labels = data['peripheral_pulse'].astype('category').cat.categories.tolist()
replace_map_comp = {'peripheral_pulse': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)
data.head(n=5)

## Codificando a coluna (mucous_membrane)

In [None]:
# Frequency of each 'mucous_membrane' value (NaN is excluded by default).
print(data['mucous_membrane'].value_counts())

In [None]:
# Assign each 'mucous_membrane' category an integer code 1..n, in category order.
labels = data['mucous_membrane'].astype('category').cat.categories.tolist()
replace_map_comp = {'mucous_membrane': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)
data.head(n=5)

## Codificando a coluna (capillary_refill_time)

In [None]:
# Frequency of each 'capillary_refill_time' value (NaN is excluded by default).
print(data['capillary_refill_time'].value_counts())

In [None]:
# Encode 'capillary_refill_time' as integer codes 1..n (category order), in place on `data`.
labels = data['capillary_refill_time'].astype('category').cat.categories.tolist()
replace_map_comp = {'capillary_refill_time': {label: code for code, label in enumerate(labels, start=1)}}

data.replace(to_replace=replace_map_comp, inplace=True)

## Codificando a coluna (pain)

In [None]:
# Frequency of each 'pain' value (NaN is excluded by default).
print(data['pain'].value_counts())

In [None]:
# Assign each 'pain' category an integer code 1..n, in category order.
labels = data['pain'].astype('category').cat.categories.tolist()
replace_map_comp = {'pain': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)
data.head(n=5)

## Codificando a coluna (peristalsis)

In [None]:
"""Como maneira de complementar o conhecimento, o código abaixo é usado para determinar a frequência de repetição
de um determinado valor em um atributo."""

# Frequency of each 'peristalsis' value (NaN is excluded by default).
print(data['peristalsis'].value_counts())

In [None]:
"""Representação da frequência através de gráficos"""

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

peristalsis_count = data['peristalsis'].value_counts()
sns.set(style="darkgrid")
sns.barplot(peristalsis_count.index, peristalsis_count.values, alpha=0.9)
plt.title('Frequency Distribution of Peristalsis')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('peristalsis', fontsize=12)
plt.show()

In [None]:
""" A partir do momento em que sabemos quais sao os dados categóricos, é possível codificar os mesmos e verificar
como os mesmos ficarão em um novo dataframe."""

# Assign each 'peristalsis' category an integer code 1..n, in category order.
labels = data['peristalsis'].astype('category').cat.categories.tolist()
replace_map_comp = {'peristalsis': {label: code for code, label in enumerate(labels, start=1)}}

print(replace_map_comp)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=replace_map_comp, inplace=True)
data.head(n=5)

## Codificando a coluna (abdominal_distention)

In [None]:
# Frequency of each 'abdominal_distention' value (NaN is excluded by default).
print(data['abdominal_distention'].value_counts())

In [None]:
# Assign each 'abdominal_distention' category an integer code 1..n, in category order.
label = data['abdominal_distention'].astype('category').cat.categories.tolist()
rep = {'abdominal_distention': {name: code for code, name in enumerate(label, start=1)}}

print(rep)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=rep, inplace=True)
data.head(n=5)

## Codificando a coluna (nasogastric_tube)

In [None]:
# Frequency of each 'nasogastric_tube' value (NaN is excluded by default).
data['nasogastric_tube'].value_counts()

In [None]:
# Assign each 'nasogastric_tube' category an integer code 1..n, in category order.
label = data['nasogastric_tube'].astype('category').cat.categories.tolist()
rep = {'nasogastric_tube': {name: code for code, name in enumerate(label, start=1)}}

print(rep)

In [None]:
# Apply the current code mapping to `data` in place, then preview the frame.
data.replace(to_replace=rep, inplace=True)
data.head(n=5)

## Codificando a coluna (nasogastric_reflux)

In [None]:
# Frequency of each 'nasogastric_reflux' value (NaN is excluded by default).
data['nasogastric_reflux'].value_counts()

In [None]:
# Encode 'nasogastric_reflux' as integer codes 1..n (category order), in place on `data`.
label = data['nasogastric_reflux'].astype('category').cat.categories.tolist()
rep = {'nasogastric_reflux': {name: code for code, name in enumerate(label, start=1)}}

data.replace(to_replace=rep, inplace=True)

## Codificando a coluna (rectal_exam_feces)

In [None]:
# Frequency of each 'rectal_exam_feces' value (NaN is excluded by default).
data['rectal_exam_feces'].value_counts()

In [None]:
# Encode 'rectal_exam_feces' as integer codes 1..n (category order), in place on `data`.
label = data['rectal_exam_feces'].astype('category').cat.categories.tolist()
rep = {'rectal_exam_feces': {name: code for code, name in enumerate(label, start=1)}}

data.replace(to_replace=rep, inplace=True)

## Codificando a coluna (abdomen)

In [None]:
# Frequency of each 'abdomen' value (NaN is excluded by default).
data['abdomen'].value_counts()

In [None]:
# Encode 'abdomen' as integer codes 1..n (category order), in place on `data`.
label = data['abdomen'].astype('category').cat.categories.tolist()
rep = {'abdomen': {name: code for code, name in enumerate(label, start=1)}}

data.replace(to_replace=rep, inplace=True)

## Codificando a coluna (outcome)

In [None]:
# Frequency of each 'outcome' value (NaN is excluded by default).
data['outcome'].value_counts()

In [None]:
# Encode 'outcome' as integer codes 1..n (category order), in place on `data`.
label = data['outcome'].astype('category').cat.categories.tolist()
rep = {'outcome': {name: code for code, name in enumerate(label, start=1)}}

data.replace(to_replace=rep, inplace=True)

## Codificando a coluna (surgical_lesion)

In [None]:
# Frequency of each 'surgical_lesion' value (NaN is excluded by default).
data['surgical_lesion'].value_counts()

In [None]:
# Encode 'surgical_lesion' as integer codes 1..n (category order), in place on `data`.
label = data['surgical_lesion'].astype('category').cat.categories.tolist()
rep = {'surgical_lesion': {name: code for code, name in enumerate(label, start=1)}}

data.replace(to_replace=rep, inplace=True)

## Codificando a coluna (cp_data)

In [None]:
# Frequency of each 'cp_data' value (NaN is excluded by default).
data['cp_data'].value_counts()

In [None]:
# Encode 'cp_data' as integer codes 1..n (category order), in place on `data`.
label = data['cp_data'].astype('category').cat.categories.tolist()
rep = {'cp_data': {name: code for code, name in enumerate(label, start=1)}}

data.replace(to_replace=rep, inplace=True)

## Verificando todos os atributos após codificação

In [None]:
"""Verificando como ficou a coluna após a codificação"""
# Preview the first rows of `data` after the encoding cells above.
data.head()

In [None]:
# Column dtypes and non-null counts after encoding.
data.info()

## Verificando dados faltantes após codificação e realizando os devidos tratamentos.

# <font color = 'red'> TRATAMENTO DE MISSING VALUES

In [None]:
# Visualise missing values in the encoded dataset.
# `data` is plotted here because X_train is only created further down, by the
# train/test split cell, and does not exist yet on a fresh top-to-bottom run.

import missingno as msno
msno.matrix(data, figsize=(25,5));

In [None]:
# Percentage of missing attributes in each row, most incomplete rows first.
# - Computed on `data`: X_train does not exist yet at this point of the notebook.
# - mean(axis=1) divides by the number of columns, so the value is a true per-row percentage.
# - The result gets its own name so the raw dataset held in `df` is not overwritten.
percent_missing_rows = (data.isnull().mean(axis=1) * 100).sort_values(ascending=False)
missing_by_row = pd.DataFrame({'row_index': percent_missing_rows.index,
                               'percent_missing': percent_missing_rows.values})
missing_by_row.head()

In [None]:
# Remove zero-variance (constant) columns.
# The filter is fitted on `data` because X_train does not exist yet at this point of
# the notebook.
# NOTE(review): the result keeps the name X_train only because the next cells read it;
# the train/test split cell rebinds X_train later — confirm this section belongs here.

from sklearn.feature_selection import VarianceThreshold
zero_var_filter = VarianceThreshold()
X_train = zero_var_filter.fit_transform(data)

In [None]:
# Number of columns whose variance is exactly zero.

(zero_var_filter.variances_ == 0).sum()

In [None]:
# Shape after the variance filter.

pd.DataFrame(X_train).shape

# <font color = 'red'> ANÁLISE EXPLORATÓRIA APÓS TRATAMENTO DE MISSING VALUES

# <font color = 'red'> RELATÓRIO ESTATÍSTICO INICIAL

## Gerando Relatório Geral com os dados codificados e pre processados

In [None]:
"""Gerando relatório geral ainda sem realizar nenhum tipo de tratamento direto na base de dados."""

from pandas_profiling import ProfileReport
# Build the profiling report for `df`; the bare `profile` expression renders it as the cell output.
profile = ProfileReport(df, title='Relatório Cavalo')
profile

In [None]:
# Save the report as HTML. The explicit ".html" suffix names the same file that
# pandas-profiling wrote for the extension-less name, without its unsupported-extension warning.
# NOTE(review): the file name says "Cardiopático" while the report title is
# 'Relatório Cavalo' — confirm the intended file name.
profile.to_file('Relatório Cardiopático.html')

## General percentage exploratory analysis of patient gender and health status.

In [None]:
# Share of each gender, shown as a pie chart.
# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `sex` column, so this cell presumably comes from a different (patient) dataset — confirm.
df_s = pd.DataFrame(df.sex.value_counts())
df_s

# Display names, assigned positionally to the value_counts rows.
gender = ['MAN', 'WOMAN']

df_s.insert(loc=1,column='gender', value=gender)
df_s.head()


labels = df_s.gender
colors = ['lightskyblue', 'red']
plt.pie(df_s['sex'], labels= labels, colors=colors, startangle=90, autopct='%.1f%%')
plt.show()

In [None]:
# Pie chart with the percentage of sick and healthy patients.
# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `target` column, so this cell presumably comes from a different dataset — confirm.

df_t = pd.DataFrame(df.target.value_counts())
df_t

# Display names, assigned positionally to the value_counts rows.
status = ['Sick', 'Healthy']


df_t.insert(loc=1,column='status', value=status)

# Preview the frame built above (this line used to reference an undefined name, `dfc`).
df_t.head()

labels = df_t.status
colors = ['lightskyblue', 'red']
plt.pie(df_t['target'], labels= labels, colors=colors, startangle=90, autopct='%.1f%%')
plt.show()

## Men's Patient Exploratory Analysis

In [None]:
# Count plot of the ages of all rows with sex == 1.
# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `sex` column — presumably this cell belongs to a different dataset; confirm.

M = df[df['sex']==1]['age'].values
M

plt.subplots(figsize= (10,9))

sns.countplot(y=M)
plt.title("Amount of Man pacient by age)")
plt.show()

In [None]:
# Count plot of the ages of all rows with sex == 1 and target == 1.
# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `sex` or `target` column — presumably this cell belongs to a different dataset; confirm.

# One combined mask: indexing the already-filtered frame with a full-length boolean
# Series makes pandas warn that the key will be reindexed.
MS = df.loc[(df['sex']==1) & (df['target']==1), 'age'].values
MS

plt.subplots(figsize= (10,9))

sns.countplot(y=MS)
# The title now says that only sick patients are counted.
plt.title("Amount of sick Man patients by age")
plt.show()

In [None]:
# Pie chart with the percentage of sick and healthy men.
# NOTE(review): `man_df` is not defined in any cell visible in this notebook — confirm
# where it is supposed to come from before running this cell.

df34 = pd.DataFrame(man_df.Health_Status.value_counts())
df34

# Display names, assigned positionally to the value_counts rows.
Result = ['Sick', 'Healthy']

df34.insert(loc=1,column='MAN_Result', value=Result)
df34.head()


labels = df34.MAN_Result 
colors = ['lightskyblue', 'red']
plt.pie(df34['Health_Status'], labels= labels, colors=colors, startangle=90, autopct='%.1f%%')
plt.show()

## Women's Patient Exploratory Analysis

In [None]:
# Count plot of the ages of all rows with sex == 0.
# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `sex` column — presumably this cell belongs to a different dataset; confirm.

W = df[df['sex']==0]['age'].values
W

plt.subplots(figsize= (10,9))

sns.countplot(y=W)
plt.title("Amount of Woman pacient by age)")
plt.show()

In [None]:
# Count plot of the ages of all rows with sex == 0 and target == 1.
# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `sex` or `target` column — presumably this cell belongs to a different dataset; confirm.

# One combined mask: indexing the already-filtered frame with a full-length boolean
# Series makes pandas warn that the key will be reindexed.
W = df.loc[(df['sex']==0) & (df['target']==1), 'age'].values
W

plt.subplots(figsize= (10,9))

sns.countplot(y=W)
# The title now says that only sick patients are counted.
plt.title("Amount of sick Woman patients by age")
plt.show()

In [None]:
# Pie chart with the percentage of sick and healthy women.
# NOTE(review): `woman_df` is not defined in any cell visible in this notebook — confirm
# where it is supposed to come from before running this cell.

df34 = pd.DataFrame(woman_df.Health_Status.value_counts())
df34

# Display names, assigned positionally to the value_counts rows.
Result = ['Sick', 'Healthy']

df34.insert(loc=1,column='WOMAN_Result', value=Result)
df34.head()

labels = df34.WOMAN_Result 
colors = ['lightskyblue', 'red']
plt.pie(df34['Health_Status'], labels= labels, colors=colors, startangle=90, autopct='%.1f%%')
plt.show()

# <font color = 'red'> DESENVOLVIMENTO DOS MODELOS DE MACHINE LEARNING.

# <font color = red> ÁRVORE DE DECISÃO

In [None]:
# Split the dataset into training and test sets.

import numpy as np
seed = 10000

from sklearn.model_selection import train_test_split

# NOTE(review): the column listing printed by df.head() near the top of this notebook has
# no `target` column, and the encoding cells work on `data` — confirm which frame and
# label column this split is meant to use.
X = df.loc[:, df.columns != 'target']    # INPUT (features)
Y = df['target']                         # OUTPUT (label)

# Hold out 20% of the rows for testing. `test_size` belongs to train_test_split; it had
# been appended to the `Y = ...` line, which broke that statement.
# TODO: try a stratified split (stratify=Y) later.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# The Random Forest, KNN and SVM cells refer to the label splits in lower case.
y_train, y_test = Y_train, Y_test

print('shape input de treino', X_train.shape)
print('shape input de teste', X_test.shape)
print('shape output de treino', Y_train.shape)
print('shape output de teste', Y_test.shape)

In [None]:
# Class distribution of the training labels.
ax = sns.countplot(x=Y_train)
ax.set_title("Shape output de treino")
plt.show()

In [None]:
# Class distribution of the test labels.
ax = sns.countplot(x=Y_test)
ax.set_title("Shape output de teste")
plt.show()

### First Try (Training Model)

In [None]:
# Fit a decision tree with default hyper-parameters on the training split.

from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so `model` is the fitted tree.
model = DecisionTreeClassifier(random_state=seed).fit(X_train, Y_train)

In [None]:
# Visualise the fitted decision tree.

from sklearn import tree

fig, ax = plt.subplots(figsize=(20,10))  # size of the generated figure
# Draw on the axes created above (the same axes plot_tree would pick by default).
tree.plot_tree(model, class_names=['Doente', 'Saudável'], filled=True, rounded=True, ax=ax);


In [None]:
# Accuracy of the model on the training set.

model.score(X_train, Y_train)

In [None]:
# Accuracy of the model on the test set.

model.score(X_test, Y_test)

### Second Try (Regularization: min_samples_leaf=10)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same tree as before, but every leaf must hold at least 10 training samples.
model = DecisionTreeClassifier(min_samples_leaf=10, random_state=seed).fit(X_train, Y_train)

In [None]:
# Accuracy of the model on the training set.
model.score(X_train, Y_train)

In [None]:
# Accuracy of the model on the test set.
model.score(X_test, Y_test)

### Decision Tree Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

# Rows are predicted classes and columns are real classes, i.e. the transpose of
# scikit-learn's (true, predicted) layout.
confMatrix = pd.DataFrame(
    confusion_matrix(Y_test, y_pred).T,
    index=['Foreseen healthy', 'Foreseen sick'],
    columns=['Real Healthy', 'Real Sick'],
)
confMatrix

### Saving model READY TO USE

In [None]:
import pickle

# Persist the trained model. The context manager closes the file even if dump() fails;
# the former bare open() call never closed its handle.
with open('model_DT.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

### <font color = red> GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Hyper-parameter grid explored by cross-validation.

tuned_parameters = [{'criterion': ['gini', 'entropy'], 'max_depth': [2,4,6,8,10,12], 'min_samples_leaf': [1,2,3,4,5,8,10]}]

print("# Tuning hyper-parameters for F1 score")
print()

# `model` is rebound to the grid-search object. The base estimator is seeded like the
# other trees in this notebook so the search is reproducible.
model = GridSearchCV(DecisionTreeClassifier(random_state=seed), tuned_parameters, scoring='f1')
# The split cell names the label splits Y_train / Y_test (upper case).
model.fit(X_train, Y_train)

y_true, y_pred = Y_test, model.predict(X_test)

print(classification_report(y_true, y_pred))
print()

In [None]:
# To use this cell, first create the function that builds the confusion matrix
# (taken from the "credito AD" notebook).
# NOTE(review): in this notebook predict_and_evaluate is only defined further down, in
# the Random Forest section, so this call fails on a top-to-bottom run.

predict_and_evaluate(X_test, y_test)

# <font color = red> RANDOM FOREST

In [None]:
# Train the model.

from sklearn.ensemble import RandomForestClassifier

def train(X_train, y_train, seed):
  """Fit and return a random forest on the given training data.

  Args:
    X_train: training features.
    y_train: training labels.
    seed: value passed as `random_state`.

  Returns:
    The fitted RandomForestClassifier.
  """
  # min_samples_leaf is the parameter to adjust when trying to avoid overfitting.
  forest = RandomForestClassifier(min_samples_leaf=5, random_state=seed)
  return forest.fit(X_train, y_train)

model = train(X_train, y_train, seed)

### Avaliar modelo treinado na base de teste

In [None]:
def predict_and_evaluate(X_test, y_test):
  """Score the notebook-level `model` on a test set and plot its confusion matrix.

  Prints accuracy, Cohen's kappa and F1, then shows a heatmap with predicted classes
  on the rows and real classes on the columns.

  Args:
    X_test: features passed to `model.predict`.
    y_test: true labels for the rows of `X_test`.
  """
  from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, f1_score

  y_pred = model.predict(X_test)  # inference on the test set
  class_labels = list(model.classes_)

  # Accuracy
  accuracy = accuracy_score(y_test, y_pred)
  print('Acurácia: ', accuracy)

  # Kappa
  kappa = cohen_kappa_score(y_test, y_pred)
  print('Kappa: ', kappa)

  # F1: binary averaging for two classes (the previous behaviour); weighted averaging
  # otherwise, because binary averaging raises on multiclass targets.
  f1_average = 'binary' if len(class_labels) == 2 else 'weighted'
  f1 = f1_score(y_test, y_pred, average=f1_average)
  print('F1: ', f1)

  # Confusion matrix: predictions are passed first, so rows = predicted, columns = real.
  confMatrix = confusion_matrix(y_pred, y_test, labels=class_labels)

  ax = plt.subplot()
  sns.heatmap(confMatrix, annot=True, fmt=".0f")
  plt.xlabel('Real')
  plt.ylabel('Previsto')
  plt.title('Matriz de Confusão')

  # Tick labels are the model's own class values; the former hard-coded names
  # ('Não Pagou' / 'Pagou') described a different dataset.
  ax.xaxis.set_ticklabels(class_labels)
  ax.yaxis.set_ticklabels(class_labels)
  plt.show()

predict_and_evaluate(X_test, y_test)

### Normalizar

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then standardize both splits with it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train and evaluate again, now on the standardized features.

model = train(X_train, y_train, seed)
predict_and_evaluate(X_test, y_test)

## Saving model READY TO USE

In [None]:
import pickle

# Persist the trained model. The context manager closes the file even if dump() fails;
# the former bare open() call never closed its handle.
with open('model_RF.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

# <font color = red> KNN

In [None]:
# Train the model.

from sklearn.neighbors import KNeighborsClassifier

def train(X_train, y_train, n_neighbors=5):
  """Fit a k-nearest-neighbours classifier and predict the test set.

  Predictions are made on `X_test` read from the notebook's global scope; it is
  not a parameter of this function.

  Args:
    X_train: training features.
    y_train: training labels.
    n_neighbors: number of neighbours used by the classifier.

  Returns:
    A (model, y_pred) tuple: the fitted classifier and its predictions for X_test.
  """
  knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
  return knn, knn.predict(X_test)  # inference on the test set

model, y_pred = train(X_train, y_train)

In [None]:
def predict_and_evaluate(y_test, y_pred):
  """Print accuracy, Cohen's kappa and weighted F1, and plot the confusion matrix.

  The heatmap shows predicted classes on the rows and real classes on the columns.

  Args:
    y_test: true labels.
    y_pred: predicted labels, in the same order as `y_test`.
  """
  from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, f1_score

  # Every class value seen in the true or the predicted labels, in sorted order.
  class_labels = sorted(set(y_test) | set(y_pred))

  # Accuracy
  accuracy = accuracy_score(y_test, y_pred)
  print('Acurácia: ', accuracy)

  # Kappa
  kappa = cohen_kappa_score(y_test, y_pred)
  print('Kappa: ', kappa)

  # F1
  f1 = f1_score(y_test, y_pred, average='weighted')
  print('F1: ', f1)

  # Confusion matrix: predictions are passed first, so rows = predicted, columns = real.
  confMatrix = confusion_matrix(y_pred, y_test, labels=class_labels)

  ax = plt.subplot()
  sns.heatmap(confMatrix, annot=True, fmt=".0f")
  plt.xlabel('Real')
  plt.ylabel('Previsto')
  plt.title('Matriz de Confusão')

  # Tick labels are the class values found in the data; the former hard-coded names
  # ('Benigno' / 'Maligno') described a different dataset.
  ax.xaxis.set_ticklabels(class_labels)
  ax.yaxis.set_ticklabels(class_labels)
  plt.show()

predict_and_evaluate(y_test, y_pred)

### Normalizar

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then standardize both splits with it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train and evaluate again, now on the standardized features.

model, y_pred = train(X_train, y_train)
predict_and_evaluate(y_test, y_pred)

## Saving model READY TO USE

In [None]:
import pickle

# Persist the trained model. The context manager closes the file even if dump() fails;
# the former bare open() call never closed its handle.
with open('model_KNN.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

# <font color = red> SVM

## Pré Processamento

In [None]:
# Visualise missing values in the training features.
# X_train has been reassigned from scaler.transform(...) in the cells above, which
# returns a NumPy array by default, so it is wrapped in a DataFrame for missingno.

import missingno as msno
msno.matrix(pd.DataFrame(X_train), figsize=(25,5));

In [None]:
# Percentage of missing attributes in each row, most incomplete rows first.
# - X_train has been reassigned from scaler.transform(...) in the cells above (a NumPy
#   array by default), so it is wrapped in a DataFrame before using pandas methods.
# - mean(axis=1) divides by the number of columns, so the value is a true per-row percentage.
# - The result gets its own name instead of overwriting `df`.
percent_missing_rows = (pd.DataFrame(X_train).isnull().mean(axis=1) * 100).sort_values(ascending=False)
missing_by_row = pd.DataFrame({'row_index': percent_missing_rows.index,
                               'percent_missing': percent_missing_rows.values})
missing_by_row.head()

In [None]:
# Remove zero-variance (constant) columns.

from sklearn.feature_selection import VarianceThreshold

# Learn the per-column variances on the training features, then keep only the
# columns that pass the (default) threshold.
zero_var_filter = VarianceThreshold()
zero_var_filter.fit(X_train)
X_train = zero_var_filter.transform(X_train)

In [None]:
# Number of columns whose variance is exactly zero.

(zero_var_filter.variances_ == 0).sum()

In [None]:
# Shape after the variance filter.

pd.DataFrame(X_train).shape

### Treinando o modelo SVM

In [None]:
# Train the model.

from sklearn.svm import SVC

def trainSVM(X_train, y_train, seed):
  """Fit and return a support vector classifier with default hyper-parameters.

  Args:
    X_train: training features.
    y_train: training labels.
    seed: value passed as `random_state`.

  Returns:
    The fitted SVC.
  """
  return SVC(random_state=seed).fit(X_train, y_train)

model = trainSVM(X_train, y_train, seed)

In [None]:
# Drop from the test set the same columns the zero-variance filter removed from training.
# The author notes that no such columns existed in this case, so this makes no difference
# here; the code is kept for datasets where it does.

X_test = zero_var_filter.transform(X_test)
X_test.shape

In [None]:
def predict_and_evaluate(X_test, y_test):
  """Score the notebook-level `model` on a test set and plot its confusion matrix.

  Prints accuracy, Cohen's kappa and F1, then shows a heatmap with predicted classes
  on the rows and real classes on the columns.

  Args:
    X_test: features passed to `model.predict`.
    y_test: true labels for the rows of `X_test`.
  """
  from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, f1_score

  y_pred = model.predict(X_test)  # inference on the test set
  class_labels = list(model.classes_)

  # Accuracy
  accuracy = accuracy_score(y_test, y_pred)
  print('Acurácia: ', accuracy)

  # Kappa
  kappa = cohen_kappa_score(y_test, y_pred)
  print('Kappa: ', kappa)

  # F1: binary averaging for two classes (the previous behaviour); weighted averaging
  # otherwise, because binary averaging raises on multiclass targets.
  f1_average = 'binary' if len(class_labels) == 2 else 'weighted'
  f1 = f1_score(y_test, y_pred, average=f1_average)
  print('F1: ', f1)

  # Confusion matrix: predictions are passed first, so rows = predicted, columns = real.
  confMatrix = confusion_matrix(y_pred, y_test, labels=class_labels)

  ax = plt.subplot()
  sns.heatmap(confMatrix, annot=True, fmt=".0f")
  plt.xlabel('Real')
  plt.ylabel('Previsto')
  plt.title('Matriz de Confusão')

  # Tick labels are the model's own class values; the former hard-coded names
  # ('Não Pagou' / 'Pagou') described a different dataset.
  ax.xaxis.set_ticklabels(class_labels)
  ax.yaxis.set_ticklabels(class_labels)
  plt.show()

predict_and_evaluate(X_test, y_test)

### Normalizar

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then standardize both splits with it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train and evaluate again, now on the standardized features.

model = trainSVM(X_train, y_train, seed)
predict_and_evaluate(X_test, y_test)

## Saving model READY TO USE

In [None]:
import pickle

# Persist the trained model. The context manager closes the file even if dump() fails;
# the former bare open() call never closed its handle.
with open('model_SVM.sav', 'wb') as model_file:
    pickle.dump(model, model_file)