### 1. Import libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Ignore warning messages
import warnings
warnings.filterwarnings("ignore")

### 2. Import dataset    
Link: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(diagnostic)

In [None]:
# NOTE(review): absolute path on the author's machine — point it at your local copy of datasetDiagnostic.csv.
path = '/Users/mangueira/Library/Mobile Documents/com~apple~CloudDocs/devNilton/projetoFinal_IA4/projectNilton/datasetDiagnostic.csv'
# Read the comma-separated file into a DataFrame and preview its first rows.
dataExams_v0 = pd.read_csv (path, sep=',', encoding='utf-8')
dataExams_v0.head()

### 3. Data processing

In [None]:
# Dataset
dataExams_v0.shape

In [None]:
# Count missing values per column.
# Author's note: column "Exam_33" shows 419 null instances (more than 90%).
dataExams_v0.isnull().sum()

In [None]:
# Build the feature table by removing three columns:
## "Id" (non-numeric data), "Diagnostic" (the expected output, kept separately as the label)
## and "Exam_33" (author's note: more than 90% NaN).
dataExams_v1 = dataExams_v0.drop(columns=['Id', 'Diagnostic', 'Exam_33'])

dataExams_v1.head()

In [None]:
# Expected output (label):
diagnostic = dataExams_v0.Diagnostic

### 4. Creating the classification model

#### 4.1 Using train_test_split:    
    Split arrays or matrices into random train and test subsets.        
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# train_test_split
from sklearn.model_selection import train_test_split
from numpy import random

# Seed numpy's global RNG. train_test_split receives no random_state below,
# so its shuffle draws from this global state.
SEED = 123143
random.seed(SEED)

# Hold out 30% of the rows (features and labels together) for testing.
training_x, test_x, training_y, test_y = train_test_split(dataExams_v1, 
                                                          diagnostic,
                                                          test_size = 0.3)

In [None]:
# Printing Training (70%) and Test (30%) data:

print("Training data (70%):", training_x.shape, training_y.shape, "\n"
          "Test data (30%):", test_x.shape, test_y.shape)

In [None]:
# Dataset (examples):
training_x.head()

In [None]:
# Know outputs (labels):
training_y.head()

#### 4.2 Using RandomForestClassifier:    
    A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples
     of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.    
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from numpy import random

# Model 1: random forest with 100 trees.
# random_state pins the forest's own randomness, so re-running this cell
# reproduces the same baseline regardless of the global RNG state.
classifier = RandomForestClassifier(n_estimators = 100, random_state = SEED)

classifier.fit(training_x, training_y)

# Reference baseline_1: test-set accuracy of the random forest before reducing data dimensionality.
print("RandomForestClassifier (reference baseline_1) = %.2f%%" %(classifier.score(test_x,test_y)*100))

#### 4.3 Simple dummy classification (DummyClassifier):    
    This classifier serves as a simple baseline to compare against other more complex classifiers.    
    Link: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

In [None]:
from sklearn.dummy import DummyClassifier

# Re-seed numpy's global RNG with the same seed used for the split.
SEED = 123143
random.seed(SEED)

# Baseline that always predicts the most frequent class found in training_y.
classifier_simple1 = DummyClassifier(strategy= "most_frequent")
classifier_simple1.fit(training_x, training_y)

# Simple classification result using Dummy (baseline_2): accuracy on the test set.
print("DummyClassifier (reference baseline_2) = %.2f%%" %(classifier_simple1.score(test_x, test_y)*100))

### Compare Methods (1):

In [None]:
# Baseline_1: RandomForestClassifier accuracy on the test set, before reducing data dimensionality.
print("1 - RandomForestClassifier (reference baseline_1) = %.2f%%""\n" %(classifier.score(test_x,test_y)*100))
# Baseline_2: DummyClassifier accuracy on the same test set.
print("2 - DummyClassifier (reference baseline_2) = %.2f%%" %(classifier_simple1.score(test_x, test_y)*100))

### 5. Analyze the data for model fits      
Best for graphics visualization

####  5.1 Concatenate dataset     
(Diagnostic + dataExams_v1)

In [None]:
# dataset (Diagnostic + dataExams_v1)
data_plot = pd.concat([diagnostic, dataExams_v1],axis = 1)
data_plot.head()

#### 5.2 Using pandas.melt   
Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.   
Link: https://pandas.pydata.org/docs/reference/api/pandas.melt.html

In [None]:

# Reshape to long format: one row per (Diagnostic, exam name, exam value).
data_plot = pd.melt(data_plot, 
                     id_vars="Diagnostic",
                     var_name="Exams",
                     value_name='values')
# Show up to the first 569 rows of the long table.
data_plot.head(569)

#### 5.3 seaborn.violinplot   
Draw a combination of boxplot and kernel density estimate.    
https://seaborn.pydata.org/generated/seaborn.violinplot.html

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Label column side by side with the first 10 feature columns (unscaled).
data_plot = pd.concat([diagnostic, dataExams_v1.iloc[:,0:10]],axis = 1)

# Long format so seaborn can draw one violin per exam.
data_plot = pd.melt(data_plot,
                    id_vars="Diagnostic",
                    var_name="Exams",
                    value_name='values')

plt.figure(figsize=(10, 10))
# One violin per exam, colored by diagnostic.
sns.violinplot(x = "Exams", y = "values", hue = "Diagnostic",
               data = data_plot)
# Rotate the exam names so they do not overlap.
plt.xticks(rotation = 90)

#### 5.3.1 Using function StandardScaler (standardize)   
Standardize features by removing the mean and scaling to unit variance.   
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (remove the mean, scale to unit variance).
standard = StandardScaler()

standard.fit(dataExams_v1)

# The transformed result is wrapped back into a DataFrame in the next cell.
dataExams_v3 = standard.transform(dataExams_v1)

dataExams_v3

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Repeat the standardization, this time keeping the column names.
standard = StandardScaler()

standard.fit(dataExams_v1)

dataExams_v3 = standard.transform(dataExams_v1)

# Wrap the scaled values in a DataFrame that reuses the original column labels.
dataExams_v3 = pd.DataFrame(data = dataExams_v3,
                              columns = dataExams_v1.keys())

# Label column plus the first 10 standardized features.
data_plot = pd.concat([diagnostic, dataExams_v3.iloc[:,0:10]],axis = 1)

data_plot = pd.melt(data_plot,
                    id_vars="Diagnostic",
                    var_name="Exams",
                    value_name='values')

plt.figure(figsize=(10, 10))

# split=True draws the two diagnostic classes as the two halves of each violin.
sns.violinplot(x = "Exams", y = "values", hue = "Diagnostic",
               data = data_plot, split= True)

plt.xticks(rotation = 90)

In [None]:
def violinPlot(values, inicio, fim):
    """Draw split violin plots for a positional slice of feature columns.

    values: DataFrame of features, row-aligned with the notebook-level
        `diagnostic` labels.
    inicio: first column position to plot (inclusive).
    fim: last column position to plot (exclusive).
    """
    selected = values.iloc[:, inicio:fim]
    wide = pd.concat([diagnostic, selected], axis=1)

    # Long format: one row per (Diagnostic, exam name, exam value).
    long_form = wide.melt(id_vars="Diagnostic",
                          var_name="Exams",
                          value_name='values')

    plt.figure(figsize=(20, 20))
    sns.violinplot(data=long_form, x="Exams", y="values",
                   hue="Diagnostic", split=True)
    # Rotate the exam names so they stay readable.
    plt.xticks(rotation=90)

violinPlot(dataExams_v3, 0, 32)

In [None]:
# Remove columns Exam_4 and Exam_29
## Author's note: these columns hold constant values in this data (line)

dataExams_v4 = dataExams_v3.drop(columns=["Exam_29", "Exam_4"])

dataExams_v4.head()

In [None]:
# ViolinPlot adjusted

def violinPlot(values, inicio, fim):
    """Plot columns inicio..fim (exclusive) of `values` as split violins.

    The notebook-level `diagnostic` Series supplies the hue, so `values`
    must be row-aligned with it.
    """
    columns_to_plot = values.iloc[:, inicio:fim]
    labelled = pd.concat([diagnostic, columns_to_plot], axis=1)

    # Unpivot so each row carries a single exam measurement.
    melted = labelled.melt(id_vars="Diagnostic",
                           var_name="Exams",
                           value_name='values')

    plt.figure(figsize=(20, 20))
    sns.violinplot(data=melted, x="Exams", y="values",
                   hue="Diagnostic", split=True)
    plt.xticks(rotation=90)

violinPlot(dataExams_v4, 0, 32)

### 6. Classification 

#### 6.1 Repeat RandomForestClassifier (after data adjustments) 

In [None]:
# Classification

def classify (values, seed = 1234, label = "3 - RandomForestClassifier (reference baseline_3)"):
    """Train a random forest on a 70/30 split of `values` and print its test accuracy.

    values: feature table, row-aligned with the notebook-level `diagnostic` labels.
    seed: seed for numpy's global RNG; the default keeps the original behavior.
    label: text printed before the score; the default keeps the original message,
        so pass a different label when scoring another feature set.
    """
    # Neither call below receives random_state, so both the split and the
    # forest draw from numpy's global RNG seeded here.
    random.seed(seed)

    training_x, test_x, training_y, test_y = train_test_split(values,
                                                        diagnostic,
                                                        test_size = 0.3)

    classifier = RandomForestClassifier(n_estimators = 100)

    classifier.fit(training_x, training_y)

    # Accuracy on the held-out 30%, shown as a percentage.
    print("%s = %.2f%%" %(label, classifier.score(test_x,test_y)*100))

classify(dataExams_v4)

### Compare Methods (2):

In [None]:
# RandomForestClassifier (reference baseline_1): accuracy before reducing data dimensionality.
print("1 - RandomForestClassifier (reference baseline_1) = %.2f%%" "\n" %(classifier.score(test_x,test_y)*100))

# Simple classification result using Dummy (baseline_2):
print("2 - DummyClassifier (reference baseline_2) = %.2f%%" "\n" %(classifier_simple1.score(test_x, test_y)*100))

# RandomForestClassifier (reference baseline_3) after adjusted data; classify prints the score itself:
classify(dataExams_v4)

### 7. Matrix correlation

#### 7.1 Heat map    
Plot rectangular data as a color-encoded matrix.   
Link: https://seaborn.pydata.org/generated/seaborn.heatmap.html

In [None]:
# Pairwise correlation between the remaining feature columns.
matrixCorrelation = dataExams_v4.corr()
plt.figure(figsize = (17, 15))
# Annotated heat map, one decimal place per cell.
sns.heatmap(matrixCorrelation, annot = True, fmt = ".1f")

In [None]:
# Keep only correlations above 0.99; every other cell becomes NaN.
matrixCorrelation_v1 = matrixCorrelation[matrixCorrelation>0.99]
matrixCorrelation_v1

In [None]:
# Column-wise sum of the surviving correlations (NaN cells are skipped).
# Each column's self-correlation contributes 1 to its own sum.
matrixCorrelation_v2 = matrixCorrelation_v1.sum()

In [None]:
matrixCorrelation_v2

In [None]:
# A sum above 1 means the column has a correlation above 0.99 with at least one other column.
variaveis_correlacionadas = matrixCorrelation_v2[matrixCorrelation_v2>1]
variaveis_correlacionadas

In [None]:
# Drop every flagged column, i.e. all members of each highly correlated group.
dataExams_v5 = dataExams_v4.drop(columns=variaveis_correlacionadas.keys())

In [None]:
dataExams_v5

In [None]:
classify(dataExams_v5)

In [None]:
# Alternative to dropping all flagged columns: remove only Exam_3 and Exam_24, then re-score.
dataExams_v6 = dataExams_v4.drop(columns=["Exam_3", "Exam_24"])
classify(dataExams_v6)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Selector that keeps the 5 features with the highest chi-squared score against the label.
selecionar_kmelhores = SelectKBest(chi2, k = 5)

In [None]:
# Split the standardized features; no seed is set in this cell, so the split
# depends on the current state of numpy's global RNG.
training_x, test_x, training_y, test_y = train_test_split(dataExams_v6, 
                                                        diagnostic,
                                                        test_size = 0.3)

# NOTE(review): chi2 only accepts non-negative features, and dataExams_v6 derives from
# StandardScaler output (mean-centered columns) — this fit is expected to raise ValueError; confirm.
selecionar_kmelhores.fit(training_x,training_y)

In [None]:
# Build the chi2 input from the unscaled feature table: no `dataExams_v2` is defined in this
# notebook, and the standardized tables contain negative values that chi2 rejects.
# Drop the same four columns that were removed for dataExams_v4 and dataExams_v6.
dataExams_v7 = dataExams_v1.drop(columns=(["Exam_4", "Exam_29", "Exam_3", "Exam_24"]))

In [None]:
# Seed numpy's global RNG so the unseeded split below is repeatable.
SEED= 1234
random.seed(SEED)

training_x, test_x, training_y, test_y = train_test_split(dataExams_v7, 
                                                        diagnostic,
                                                        test_size = 0.3)


# Score the features on the training rows only, then reduce both splits to the 5 selected columns.
selecionar_kmelhores.fit(training_x,training_y)
training_kbest = selecionar_kmelhores.transform(training_x)
test_kbest = selecionar_kmelhores.transform(test_x)

In [None]:
test_kbest.shape

In [None]:
# Random forest trained on the 5 SelectKBest features; random_state fixes the forest's randomness.
classifier = RandomForestClassifier(n_estimators=100, random_state=1234)
classifier.fit(training_kbest, training_y)
# Test-set accuracy as a percentage.
print("Resultado da classificação %.2f%%" %(classifier.score(test_kbest,test_y)*100))

In [None]:
5/33

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels versus predictions on the SelectKBest features.
matriz_confusao = confusion_matrix(test_y,classifier.predict(test_kbest))

In [None]:
plt.figure(figsize = (10, 8))
# Enlarge fonts; sns.set changes seaborn's global settings for later plots too.
sns.set(font_scale= 2)
# Integer-annotated heat map; x axis = predicted class, y axis = true class.
sns.heatmap(matriz_confusao, annot = True, fmt = "d").set(xlabel = "Predição", ylabel= "Real")

In [None]:
from sklearn.feature_selection import RFE

# Seed numpy's global RNG so the unseeded split below is repeatable.
SEED= 1234
random.seed(SEED)

training_x, test_x, training_y, test_y = train_test_split(dataExams_v7, 
                                                        diagnostic,
                                                        test_size = 0.3)

classifier = RandomForestClassifier(n_estimators=100, random_state=1234)

# Recursive feature elimination down to 5 features, removing one per step.
# RFE fits its own clone of the estimator, so `classifier` needs no fit beforehand.
selecionador_rfe = RFE(estimator = classifier, n_features_to_select = 5, step = 1)
selecionador_rfe.fit(training_x, training_y)
training_rfe = selecionador_rfe.transform(training_x)
test_rfe = selecionador_rfe.transform(test_x)

# Train the final model on the selected features only.
classifier.fit(training_rfe, training_y)

# Confusion matrix of true test labels versus predictions; x axis = predicted, y axis = true.
matriz_confusao = confusion_matrix(test_y,classifier.predict(test_rfe))
plt.figure(figsize = (10, 8))
sns.set(font_scale= 2)
sns.heatmap(matriz_confusao, annot = True, fmt = "d").set(xlabel = "Predição", ylabel= "Real")

# Test-set accuracy as a percentage.
print("Resultado da classificação %.2f%%" %(classifier.score(test_rfe,test_y)*100))
