<a href="https://colab.research.google.com/github/eutiagovski/projetos-cursos/blob/main/datascience-mentorama/14_Deteccao_Anomalias_exercicio_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


#Detecção de Anomalias



<br>

##Exercício 1: Utilizando a classe DetectorAnomalias criada ao longo do módulo, __vamos avaliar um detector de anomalias.__

O dataset utilizado pode ser importado através da função getData. 

Nesse conjunto de dados, possuímos 6 variáveis explicativas, $X_1, .., X_6$ e uma variável com a marcação se a instância é uma anomalia ou não.

Utilizando a __metodolodia__ discutida ao longo do módulo, __teste diferentes modelos (variando o limiar $\epsilon$)__ a fim de encontrar o que __melhor fita os dados.__

Justifique as escolhas do $\epsilon$, bem como quais as métricas de performance abordadas. 

<br>



In [8]:
import pandas as pd 
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator

In [9]:
class DetectorAnomalias(BaseEstimator):
    
    def __init__(self, epsilon):
        self.epsilon = epsilon
        
    def fit(self, X, y=None):
        medias = X.mean(axis = 0)
        desvios = X.std(axis = 0)
        gaussianas = [st.norm(loc = m, scale = d) for m, d in zip(medias, desvios)]  
        self.gaussianas = gaussianas
        self.X = X
        
    def prob(self, x):
        p = 1
        for i in range(self.X.shape[1]):
            gaussiana_i = self.gaussianas[i]
            x_i = x[i]
            p *= gaussiana_i.pdf(x_i)
        return p
    
    def isAnomaly(self, x):
        return int(np.where(self.prob(x) < self.epsilon, 1, 0))

In [10]:
def getData():
    return pd.read_csv("dataframe_anomalias_exercicio.csv")

In [11]:
df = getData()
df

Unnamed: 0,x1,x2,x3,x4,x5,x6,anomalia
0,7.731153,23.299155,-0.367453,4.715372,9.306179,16.780965,0.0
1,11.466833,16.943695,-0.245131,7.060311,10.462826,19.821289,0.0
2,11.501272,20.196011,1.206049,-4.957189,7.771262,19.100079,0.0
3,10.893921,16.072385,2.738045,-3.684228,7.373334,23.225524,0.0
4,10.091706,19.253894,0.996895,-9.504052,8.883988,17.903298,0.0
...,...,...,...,...,...,...,...
10095,11.192286,18.451987,-0.953650,-14.362996,10.875826,17.056541,0.0
10096,12.014177,19.461815,1.985099,-7.119190,11.079922,17.582755,0.0
10097,10.745460,18.175951,0.206037,-1.897015,9.888329,17.963324,0.0
10098,9.893969,22.333270,-1.465981,4.137382,7.690620,21.570097,0.0


In [12]:
df.anomalia.value_counts()

0.0    10046
1.0       54
Name: anomalia, dtype: int64

In [14]:
# separando os dados em treino, teste e validação
from sklearn.model_selection import train_test_split

df_anomalia = df.loc[df.anomalia == 1]
df_nao_anomalia = df.loc[df.anomalia == 0]

train_set, test_set = train_test_split(df_nao_anomalia, train_size=0.7)
test_set, val_set = train_test_split(test_set, test_size=0.5)

anomalia_size = int(df_anomalia.shape[0] / 2)

test_set = pd.concat([test_set, df_anomalia[anomalia_size:]])
val_set = pd.concat([val_set, df_anomalia[:anomalia_size]])

In [15]:
# Separando a variável target
Xtrain = train_set.drop('anomalia', axis=1).values
ytrain = train_set.anomalia.copy().values

Xtest = test_set.drop('anomalia', axis=1).values
ytest = test_set.anomalia.copy().values

Xval = val_set.drop('anomalia', axis=1).values
yval = val_set.anomalia.copy().values

In [45]:
# Testanto o algoritimo com um epsilon qualquer

detector = DetectorAnomalias(epsilon=1e-7)

In [46]:
detector.fit(Xtrain)
detector.isAnomaly(Xtest[0,:]), ytest[0]

(0, 0.0)

In [47]:
# Realizando o teste para todas as linhas
from sklearn.metrics import accuracy_score, r2_score

anomaly_test = []

for i in range(Xtest.shape[0]):
  anomaly_test.append(detector.isAnomaly(Xtest[i]))

accuracy_score(ytest, anomaly_test).round(2)

0.96

In [44]:
# Realizando a validacao para todas as linhas

anomaly_val = []

for i in range(Xval.shape[0]):
  anomaly_val.append(detector.isAnomaly(Xval[i]))

accuracy_score(yval, anomaly_val).round(2)

0.98

In [26]:
# Testando outros valores de epsilon no ds teste

ep_list = [1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
acc_list = []

for i in range(len(ep_list)):
  epsilon = ep_list[i]

  detector = DetectorAnomalias(epsilon=epsilon)
  detector.fit(Xtrain)
  anomaly_test = []

  for i in range(Xtest.shape[0]):
    anomaly_test.append(detector.isAnomaly(Xtest[i]))

  acc_list.append(f"Epsilon: {epsilon}, Accuracy Score: {accuracy_score(ytest, anomaly_test).round(2)}, R2 Score: {r2_score(ytest, anomaly_test).round(2)}")

acc_list

['Epsilon: 1e-05, Accuracy Score: 0.33, R2 Score: -37.53',
 'Epsilon: 1e-06, Accuracy Score: 0.81, R2 Score: -9.9',
 'Epsilon: 1e-07, Accuracy Score: 0.96, R2 Score: -1.22',
 'Epsilon: 1e-08, Accuracy Score: 1.0, R2 Score: 0.96',
 'Epsilon: 1e-09, Accuracy Score: 0.99, R2 Score: 0.17']

In [27]:
# Validando outros valores de epsilon no ds validacao

ep_list = [1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
acc_list = []

for i in range(len(ep_list)):
  epsilon = ep_list[i]

  detector = DetectorAnomalias(epsilon=epsilon)
  detector.fit(Xtrain)
  anomaly_test = []

  

  for i in range(Xval.shape[0]):
    anomaly_test.append(detector.isAnomaly(Xval[i]))

  acc_list.append(f"Epsilon: {epsilon}, Accuracy Score: {accuracy_score(yval, anomaly_test).round(2)}, R2 Score: {r2_score(yval, anomaly_test).round(2)}")

acc_list

['Epsilon: 1e-05, Accuracy Score: 0.33, R2 Score: -37.83',
 'Epsilon: 1e-06, Accuracy Score: 0.82, R2 Score: -9.29',
 'Epsilon: 1e-07, Accuracy Score: 0.98, R2 Score: -0.32',
 'Epsilon: 1e-08, Accuracy Score: 1.0, R2 Score: 0.96',
 'Epsilon: 1e-09, Accuracy Score: 0.99, R2 Score: 0.21']

In [48]:
# Definindo o melhor valor de epsilon segundo a validação com outros valores

detector = DetectorAnomalias(epsilon=1e-7)

detector.fit(Xtrain)
detector.isAnomaly(Xtest[0,:]), ytest[0]

anomaly_test = []

for i in range(Xtest.shape[0]):
  anomaly_test.append(detector.isAnomaly(Xtest[i]))

accuracy_score(ytest, anomaly_test).round(3)

0.962

In [49]:
# Realizando uma validação cruzada com valores aleatórios 

list_true = []
list_pred = []

for i in range(0, 200):
  n_alea = np.random.randint(0, Xval.shape[0])

  ytrue = ytest[n_alea]
  ypred = detector.isAnomaly(Xtest[n_alea, :])

  print(n_alea, ytrue, ypred)

  list_true.append(ytrue)
  list_pred.append(ypred)

print()
print(f'{accuracy_score(list_true, list_pred).round(3)}')

572 0.0 0
804 0.0 0
828 0.0 0
1448 0.0 1
24 0.0 0
918 0.0 0
1344 0.0 0
427 0.0 0
1161 0.0 0
793 0.0 0
1340 0.0 0
822 0.0 0
239 0.0 0
763 0.0 0
53 0.0 0
1085 0.0 0
955 0.0 0
1044 0.0 0
1152 0.0 0
3 0.0 0
1211 0.0 1
1264 0.0 0
1053 0.0 0
1487 0.0 0
109 0.0 0
205 0.0 0
1202 0.0 0
193 0.0 0
617 0.0 0
1287 0.0 1
999 0.0 0
405 0.0 0
891 0.0 0
1224 0.0 0
57 0.0 0
373 0.0 0
1367 0.0 0
1173 0.0 0
877 0.0 1
1324 0.0 0
1419 0.0 0
735 0.0 0
300 0.0 0
749 0.0 0
1387 0.0 0
1429 0.0 0
1403 0.0 0
1122 0.0 0
174 0.0 0
420 0.0 0
781 0.0 0
1220 0.0 0
1471 0.0 0
480 0.0 0
247 0.0 0
645 0.0 0
883 0.0 0
406 0.0 0
165 0.0 0
1447 0.0 0
181 0.0 0
931 0.0 0
40 0.0 0
520 0.0 0
1215 0.0 0
1357 0.0 0
731 0.0 0
560 0.0 0
817 0.0 0
865 0.0 0
58 0.0 0
1229 0.0 0
475 0.0 0
332 0.0 0
1360 0.0 0
959 0.0 0
388 0.0 0
519 0.0 0
456 0.0 0
920 0.0 0
912 0.0 0
1333 0.0 0
511 0.0 0
1516 1.0 1
173 0.0 0
708 0.0 0
498 0.0 0
349 0.0 0
373 0.0 0
816 0.0 0
758 0.0 0
377 0.0 0
583 0.0 0
367 0.0 0
914 0.0 0
1233 0.0 0
931 0.0 0
68 0.

###Conclusão 1: 

Embasado nas análiases e testes acima, podemos considerar que o melhor valor de epsion para o problema citado é de 1e-7.

Essa conclusão foi baseada nos resultados do score de acurácia, ou seja, precisão do algoritimo em classificar corretamente os dados.

A precisão do algoritimo durante os testes foi de 96,2%, e de 96,5% durante a validação.

##Exercício 2: Aborde o problema num contexto de aprendizado supervisionado, ou seja, treine modelos de classificação binária com o objetivo de detectar anomalias.

Compare os resultados entre as metodologias.

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

#instanciando os modelos
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

estimators = [('lr', logreg), ('knn', knn), ('dt', dt)]

voting = VotingClassifier(estimators = estimators)
voting.fit(Xtest, ytest)



VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('knn', KNeighborsClassifier()),
                             ('dt', DecisionTreeClassifier())])

In [36]:
accuracy_score(y_true = ytest, y_pred = voting.predict(Xtest))

0.9830508474576272

In [37]:
list_true = []
list_pred = []

for i in range(0, 200):
  n_alea = np.random.randint(0, Xtest.shape[0])

  ytrue = ytest[n_alea]
  ypred = voting.predict(Xtest[n_alea:n_alea+1, :])

  print(n_alea, ytrue, ypred)
  
  list_true.append(ytrue)
  list_pred.append(ypred)

accuracy_score(list_true, list_pred)

648 0.0 [0.]
448 0.0 [0.]
815 0.0 [0.]
1148 0.0 [0.]
1217 0.0 [0.]
74 0.0 [0.]
555 0.0 [0.]
857 0.0 [0.]
276 0.0 [0.]
172 0.0 [0.]
371 0.0 [0.]
748 0.0 [0.]
230 0.0 [0.]
1065 0.0 [0.]
246 0.0 [0.]
30 0.0 [0.]
366 0.0 [0.]
1316 0.0 [0.]
634 0.0 [0.]
1456 0.0 [0.]
379 0.0 [0.]
39 0.0 [0.]
706 0.0 [0.]
663 0.0 [0.]
968 0.0 [0.]
432 0.0 [0.]
656 0.0 [0.]
194 0.0 [0.]
344 0.0 [0.]
343 0.0 [0.]
1088 0.0 [0.]
459 0.0 [0.]
935 0.0 [0.]
391 0.0 [0.]
1032 0.0 [0.]
1368 0.0 [0.]
1257 0.0 [0.]
601 0.0 [0.]
848 0.0 [0.]
859 0.0 [0.]
385 0.0 [0.]
149 0.0 [0.]
1522 1.0 [0.]
147 0.0 [0.]
819 0.0 [0.]
515 0.0 [0.]
426 0.0 [0.]
379 0.0 [0.]
1029 0.0 [0.]
565 0.0 [0.]
338 0.0 [0.]
260 0.0 [0.]
1047 0.0 [0.]
1115 0.0 [0.]
816 0.0 [0.]
581 0.0 [0.]
1384 0.0 [0.]
914 0.0 [0.]
730 0.0 [0.]
1430 0.0 [0.]
568 0.0 [0.]
78 0.0 [0.]
744 0.0 [0.]
165 0.0 [0.]
174 0.0 [0.]
377 0.0 [0.]
10 0.0 [0.]
693 0.0 [0.]
447 0.0 [0.]
875 0.0 [0.]
19 0.0 [0.]
279 0.0 [0.]
505 0.0 [0.]
87 0.0 [0.]
681 0.0 [0.]
1397 0.0 [0.]
664

0.995

In [38]:
voting.predict(Xtest[1:2, :])[0], ytest[1]

(0.0, 0.0)

In [42]:
list_true = []
list_pred = []

for i in range(0, 20):
  n_alea = np.random.randint(0, Xval.shape[0])

  ytrue = yval[n_alea]
  ypred = voting.predict(Xval[n_alea:n_alea+1, :])

  print(n_alea, ytrue, ypred)
  
  list_true.append(ytrue)
  list_pred.append(ypred)

accuracy_score(list_true, list_pred)

1150 0.0 [0.]
650 0.0 [0.]
130 0.0 [0.]
1413 0.0 [0.]
187 0.0 [0.]
441 0.0 [0.]
95 0.0 [0.]
425 0.0 [0.]
1145 0.0 [0.]
261 0.0 [0.]
22 0.0 [0.]
273 0.0 [0.]
752 0.0 [0.]
538 0.0 [0.]
247 0.0 [0.]
166 0.0 [0.]
141 0.0 [0.]
529 0.0 [0.]
264 0.0 [0.]
1002 0.0 [0.]


1.0

In [43]:
from sklearn.model_selection import cross_val_score

print(f"Cross val Accuracy em teste: {cross_val_score(voting, Xtest, ytest, scoring='accuracy', cv=5).mean().round(3)}%")
print(f"Cross val Accuracy em validacao: {cross_val_score(voting, Xval, yval, scoring='accuracy', cv=5).mean().round(3)}%")

Cross val Accuracy em teste: 0.982%
Cross val Accuracy em validacao: 0.982%


###Conclusão 2:

O algoritmo de aprendizagem supervisonada apresentou maior performance em relação ao algoritmo de detecção de anomalias.

Utilizando a mesma métrica de performance, no contexto supervisinado conseguimos uma acurária de 98,2% em testes, e de 99,5% como resultado da validação cruzada, tornando assim o melhor algoritmo para a previsão de anomalias neste contexto.