In [None]:
#
# Atividade de Aprofundamento 
# SVM Anomaly detection on breast-cancer-wisconsin
#
# Tarefa. Aprenda a utilizar o SVM para a detec��o de anomalias
#
# Anomaly Detection (One Class SVM) in R with MicrosoftML
# https://tsmatz.wordpress.com/2017/04/03/r-anomaly-detection-one-class-support-vector-machine-with-microsoftml-rxoneclasssvm/
#
# Nesta atividade voc� far� algo semelhante ao que pode encontrar no artigo 
# acima que emprega o MicrosoftML ( https://mran.microsoft.com/packages ) 
# core de Machine Learning em R da Microsoft.
#
# Aqui vamos empregar o SVM de um modo diferente do utilizado no eBook e,  
# portanto, aprofundar nossos conhecimentos. Mas n�o se assuste! O 
# exerc�cio �, apesar de tudo, bastante pr�tico.
#
# Vamos utilizar para a detec��o de anomalias. Isso significa identificar
# algum tipo de desvio dos dados.
#
# Isso funciona resumidamente em 2 passos:
#
# 1. O SVM classifica os dados considerados normais em 2 grupos 
# criando a maior margem poss�vel entre eles (trabalha aqui de modo n�o
# supervisionado, mas n�o confunda - a SVM � de aprendizado supervisionado!)
#
# No nosso caso, os casos normais ser�o os B(enignos) = 2
#
# 2. Tendo aprendido o conjunto de dados normais 
# ( imagine uma bola de dados em que a margem divide em 2 conjuntos o melhor
# poss�vel) a SVM pode ent�o predizer os valores que fogem desse 
# padr�o (se distanciando da margem mais que o normal).

In [None]:
install.packages("e1071")
library(e1071)
library(ggplot2) 

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘proxy’




In [None]:
RNGversion("3.5.2")
set.seed(1987)

#
# read breast-cancer-wisconsin
#
wdbc = read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
  col.names=c(
    "patientid",
    "outcome",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concavepoints_mean",
    "symmetry_mean",
    "fractaldimension_mean",
    "radius_error",
    "texture_error",
    "perimeter_error",
    "area_error",
    "smoothness_error",
    "compactness_error",
    "concavity_error",
    "concavepoints_error",
    "symmetry_error",
    "fractaldimension_error",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concavepoints_worst",
    "symmetry_worst",
    "fractaldimension_worst"))

# Atualizado em 2020-07-09
wdbc = as.data.frame(unclass(wdbc),stringsAsFactors=T)

“non-uniform 'Rounding' sampler used”


In [None]:
head(wdbc)
summary(wdbc)
any(is.na(wdbc))

Unnamed: 0_level_0,patientid,outcome,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concavepoints_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concavepoints_worst,symmetry_worst,fractaldimension_worst
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,⋯,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368


   patientid         outcome  radius_mean      texture_mean   perimeter_mean  
 Min.   :     8670   B:357   Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
 1st Qu.:   869222   M:211   1st Qu.:11.697   1st Qu.:16.18   1st Qu.: 75.14  
 Median :   906157           Median :13.355   Median :18.86   Median : 86.21  
 Mean   : 30423820           Mean   :14.120   Mean   :19.31   Mean   : 91.91  
 3rd Qu.:  8825022           3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:103.88  
 Max.   :911320502           Max.   :28.110   Max.   :39.28   Max.   :188.50  
   area_mean      smoothness_mean   compactness_mean  concavity_mean   
 Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
 1st Qu.: 420.2   1st Qu.:0.08629   1st Qu.:0.06481   1st Qu.:0.02954  
 Median : 548.8   Median :0.09587   Median :0.09252   Median :0.06140  
 Mean   : 654.3   Mean   :0.09632   Mean   :0.10404   Mean   :0.08843  
 3rd Qu.: 782.6   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.12965  
 Max.   :2501.0

In [None]:
#
# Q2 Que atributo deve ser eliminado do aprendizado?
#
# Crie o conjunto de dados eliminando esse atributo 
#
mybreast = wdbc[,!(names(wdbc) %in% c("patientid"))]
head(mybreast)

Unnamed: 0_level_0,outcome,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concavepoints_mean,symmetry_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concavepoints_worst,symmetry_worst,fractaldimension_worst
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,⋯,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368


In [None]:
#
# Converta os valores de outcome para factor num�ricos # B = 0, M = 1

mybreast$outcome = as.factor(as.numeric(mybreast$outcome) - 1)


In [None]:
# Separe os dados normais, Benignos dos Malignos
#
mybreast_B = mybreast[mybreast$outcome == 0,]
mybreast_M = mybreast[mybreast$outcome == 1,]

# Check que o nr de linhas bate com os valores B / M 
# 
nrow(mybreast_B) 
nrow(mybreast_M)

### Q4 -> Treinar SVM

In [None]:
help(svm)

In [None]:
#
# Passo 1. Treine a SVM informando os dados normais (benignos)
#
# a. Informe a formula e o data frame dos dados normais (benignos)
# b. O type="one-classification" � para fazer o aprendizado dos dados normais (um s� tipo) 
#
# c. Consulte o help(svm), experimente diferentes tipos de kernel e degree, e responda as 
# questoes de 4 a 6 
# 

svm = svm(outcome ~ ., data = mybreast_B, 
           scale = TRUE, kernel ="radial",
           type="one-classification")


summary(svm)

“‘-’ not meaningful for factors”



Call:
svm(formula = outcome ~ ., data = mybreast_B, kernel = "radial", 
    type = "one-classification", scale = TRUE)


Parameters:
   SVM-Type:  one-classification 
 SVM-Kernel:  radial 
      gamma:  0.03333333 
         nu:  0.5 

Number of Support Vectors:  182




Number of Classes: 1





In [None]:
# Passo 2. Avalie novos dados para a detec��o de anomalias
#
# a. Informe os casos anormais (malignos) para a predi��o
#

#
# Q5. Do total de casos anomalos (malignos) quantos s�o identificados 
# corretamente como anomalia e quantos n�o foram identificados
# para um Kernel Radial?
#
# Q6. Que Kernels apresentaram melhores resultados na detec��o de 
# das anomalias?
#

predict_test = predict(svm, mybreast_M) 
table(predict_test)

cat('Anomaly Detected (FALSE):', table(predict_test)[1]/sum(table(predict_test))*100, ' %')


predict_test
FALSE  TRUE 
  209     2 

Anomaly Detected (FALSE): 99.05213  %

In [None]:
#
# Aplique agora um um modelo de aprendizado supervisionado normal para os valores de outcome
# (para classificar tanto os casos benignos como malignos) com fun��o de base radial.
#
# Aqui, por simplicidadem, n�o vamos separar dados de treinamento e teste e voc�
# pode empregar todo o conjunto de exemplos. 
#
# DICA: empregue a SVM do mesmo modo que voc� empregaria uma �rvore de decis�o ou um modelo
# naive bayes

# 
# Q7. Quantos vetores de suporte foram criados

svm2 = svm(outcome ~ ., data = mybreast, 
          scale = TRUE, 
		  kernel = "radial")

summary(svm2)



Call:
svm(formula = outcome ~ ., data = mybreast, kernel = "radial", scale = TRUE)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  119

 ( 60 59 )


Number of Classes:  2 

Levels: 
 0 1




In [50]:
# Q8. Fa�a a predi��o somente dos casos malignos e verifique a acuracidade obtida

predict_test2 = predict(svm2,mybreast_M)


predict_test = predict(svm, my_breast_test) 

c_matrix=table(my_breast_M$outcome, predict_test2)
print(c_matrix)

acc = sum(diag(c_matrix))/sum(c_matrix)*100
cat('Accuracy: ', acc, ' %', "\n")

ERROR: ignored