<a href="https://colab.research.google.com/github/fernandovieira1/estat_ML/blob/main/algoritmos_classificacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

**Carregar dados**

In [2]:
df = pd.read_csv('/content/credit_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   clientid  2000 non-null   int64  
 1   income    2000 non-null   float64
 2   age       1997 non-null   float64
 3   loan      2000 non-null   float64
 4   default   2000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 78.2 KB


**AED / DW**

In [4]:
df.dropna(inplace=True)

In [5]:
df

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [6]:
x = df.iloc[:, 1:4].values

In [7]:
y = df.iloc[:, 4]

**Modelos de DS**

In [8]:
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from scipy import stats

In [9]:
# 30 testes diferentes para cada um dos três algoritmos

resultados_naive_bayes = []
resultados_logistica = []
resultados_random_forest = []

for i in range(30):
  x_treinamento, x_teste, y_treinamento, y_teste = train_test_split(x, y, 
                                                                    test_size=0.2,
                                                                    stratify=y,
                                                                    random_state=i)
  naive_bayes = GaussianNB()
  naive_bayes.fit(x_treinamento, y_treinamento)
  resultados_naive_bayes.append(accuracy_score(y_teste, naive_bayes.predict(x_teste)))

  logistica = LogisticRegression()
  logistica.fit(x_treinamento, y_treinamento)
  resultados_logistica.append(accuracy_score(y_teste, logistica.predict(x_teste)))

  random_forest = RandomForestClassifier()
  random_forest.fit(x_treinamento, y_treinamento)
  resultados_random_forest.append(accuracy_score(y_teste, random_forest.predict(x_teste)))


In [10]:
print(resultados_naive_bayes)

[0.925, 0.925, 0.9325, 0.925, 0.92, 0.905, 0.9175, 0.9175, 0.9125, 0.9325, 0.9225, 0.9125, 0.935, 0.9175, 0.925, 0.9175, 0.9275, 0.92, 0.9325, 0.92, 0.93, 0.905, 0.9175, 0.9325, 0.9425, 0.9375, 0.94, 0.92, 0.935, 0.925]


In [11]:
print(resultados_logistica)

[0.9325, 0.91, 0.9125, 0.9225, 0.9075, 0.89, 0.91, 0.9075, 0.8775, 0.915, 0.9175, 0.9, 0.925, 0.9175, 0.9025, 0.9125, 0.9525, 0.91, 0.9225, 0.9075, 0.925, 0.905, 0.9075, 0.945, 0.9225, 0.9275, 0.9225, 0.9175, 0.91, 0.9]


In [12]:
print(resultados_random_forest)

[0.975, 0.99, 0.9825, 0.9925, 0.975, 0.98, 0.9875, 0.9875, 0.9775, 0.985, 0.9825, 0.9775, 0.975, 0.985, 0.98, 0.9775, 0.9875, 0.98, 0.9925, 0.9875, 0.9775, 0.9825, 0.98, 0.985, 0.9875, 0.995, 0.9925, 0.985, 0.99, 0.99]


In [13]:
resultados_naive_bayes = np.array(resultados_naive_bayes)
resultados_logistica = np.array(resultados_logistica)
resultados_random_forest = np.array(resultados_random_forest)

In [14]:
# Média
resultados_naive_bayes.mean(), resultados_logistica.mean(), resultados_random_forest.mean()

(0.92425, 0.9145, 0.9840833333333333)

In [15]:
# Moda
stats.mode(resultados_naive_bayes), stats.mode(resultados_logistica), stats.mode(resultados_logistica)

(ModeResult(mode=array([0.9175]), count=array([5])),
 ModeResult(mode=array([0.9075]), count=array([4])),
 ModeResult(mode=array([0.9075]), count=array([4])))

In [16]:
# Mediana
np.median(resultados_naive_bayes), np.median(resultados_logistica), np.median(resultados_random_forest)

(0.925, 0.9125, 0.985)

In [17]:
# Variância
np.var(resultados_naive_bayes), np.var(resultados_logistica), np.var(resultados_random_forest)

(8.756250000000001e-05, 0.00020933333333333337, 3.3118055555555657e-05)

In [18]:
np.min([8.756250000000001e-05, 0.00020933333333333337, 3.3118055555555657e-05]) 
# random forest é o que possui menor variância, logo, o mais consistente

3.3118055555555657e-05

In [19]:
np.max([8.756250000000001e-05, 0.00020933333333333337, 3.3118055555555657e-05])

0.00020933333333333337

In [None]:
# Buscar algoritmos que dêem acurácia máxima e variância mínima.

In [20]:
 # desvio Padrão
 np.std(resultados_naive_bayes), np.std(resultados_logistica), np.std(resultados_random_forest)

(0.00935748363610645, 0.014468356276140472, 0.005754828890206524)

In [21]:
# Coeficiente de variação
stats.variation(resultados_naive_bayes)*100, stats.variation(resultados_logistica)*100, stats.variation(resultados_random_forest)*100, 

(1.0124407504578252, 1.5821056616884057, 0.5847908094036607)