# Exemplo com seleção de características
- Base: Breast Cancer
- Link: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data


# Data Analysis

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import matplotlib.pyplot as plt
import time
from subprocess import check_output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
data = pd.read_csv('cancer.csv')

Before doing any feature selection, feature extraction, or classification, we start with some basic data analysis.
Let's look at the features of the data.

In [None]:
data.head()  # head method shows only first 5 rows

In [None]:
# feature names as an Index (panda object including a list of column names and dtype)
col = data.columns       # .columns gives columns names in data 
print(col)

## Remoção de atributos irrelevantes

In [None]:
data.columns

In [None]:
# Pull the target labels out first, then strip the non-feature columns.
y = data['diagnosis']                   # M or B
# Drop the label itself plus the two columns this notebook treats as
# irrelevant; the drop happens in place, so `data` now holds features only.
data.drop(columns=['Unnamed: 32','id','diagnosis'], inplace=True)

In [None]:
x = data

In [None]:
type(x)

## Distribuição das amostras entre as classes

In [None]:
# Bar chart of how many samples belong to each diagnosis class.
sns.set(style="darkgrid")
ax = sns.countplot(x = y)       # M = 212, B = 357
# value_counts() orders its result by frequency (most common first), not
# alphabetically, so positional unpacking only matches the labels by
# coincidence. Look each count up by its label instead.
class_counts = y.value_counts()
B = class_counts.get('B', 0)
M = class_counts.get('M', 0)
print('Number of Benign: ',B)
print('Number of Malignant : ',M)

### Verifica se há dados faltantes

In [None]:
#check if there is a NaN value in our data frame x, a False indicates there are no missing values
x.isnull().values.any()

Let's now use the describe function in order to look at our features :

In [None]:
x.describe()

## Normalização dos dados

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with a MinMaxScaler and rebuild a DataFrame
# so the original column names are kept.
# NOTE(review): the scaler is fitted on the complete table, i.e. before any
# train/test split is made further down.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data_normal = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
data_normal

# Visualização
-- https://mode.com/blog/violin-plot-examples/


In [None]:
def violin_plot(beginning,end):
    """Draw split violin plots for a slice of the normalised features.

    Columns ``beginning:end`` (positional, end-exclusive) of the module-level
    ``data_normal`` are joined with the module-level labels ``y``, reshaped
    to long form, and plotted with one violin per feature, split by
    diagnosis.
    """
    selected = data_normal.iloc[:, beginning:end]
    # Long form: one row per (sample, feature) pair, as seaborn expects.
    long_form = pd.melt(
        pd.concat([y, selected], axis=1),
        id_vars="diagnosis",
        var_name="features",
        value_name='value',
    )
    plt.figure(figsize=(10,10))
    sns.violinplot(
        x="features",
        y="value",
        hue="diagnosis",
        data=long_form,
        split=True,
        inner="box",
    )
    # Feature names are long; tilt them so they do not overlap.
    plt.xticks(rotation=45)

In [None]:
# first ten features
violin_plot(0,10)

In [None]:
# Second ten features
violin_plot(10,20)

In [None]:
# Third ten features
violin_plot(20,31)

In [None]:
data_normal.columns

In [None]:
def joint_plot(feature1, feature2):
    """Plot the joint distribution of two columns of ``x`` with a regression fit.

    Parameters
    ----------
    feature1 : str
        Column of the module-level DataFrame ``x`` shown on the horizontal axis.
    feature2 : str
        Column of ``x`` shown on the vertical axis.
    """
    # The data vectors are passed by keyword: passing them positionally is
    # what triggered the FutureWarning. "reg" is the regression kind; the
    # previous spelling "regg" is not a valid kind.
    sns.jointplot(x=x.loc[:,feature1], y=x.loc[:,feature2], kind="reg", color="#ce1414")

In [None]:
joint_plot("radius_worst","perimeter_worst")
#joint_plot("radius_worst","smoothness_worst")

In [None]:
joint_plot("concavity_worst","concave points_worst")

## Mapa de calor com a correlação entre os atributos

In [None]:
#correlation map
f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(x.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

# Seleção de características e Aprendizado de Máquina


In this section, we will select features using different methods: correlation-based selection, recursive feature elimination (RFE), recursive feature elimination with cross-validation (RFECV), and a filter approach based on mutual information. We will use a Random Forest classifier to train our model.

## 1) Feature Selection with correlation and Random Forest classification

**Compactness_mean, concavity_mean and concave points_mean** are correlated -> I choose **concavity_mean**.

**radius_se, perimeter_se and area_se** are correlated -> I choose  **area_se**.

**radius_worst, perimeter_worst and area_worst** are correlated -> I choose  **area_worst**.

**Compactness_worst, concavity_worst and concave points_worst** are correlated -> I choose  **concavity_worst**.

**Compactness_se, concavity_se and concave points_se** are correlated -> I choose  **concavity_se**.

**texture_mean and texture_worst** are correlated -> I choose  **texture_mean**.

**area_worst and area_mean** are correlated -> I choose  **area_mean**.




In [None]:
# One representative is kept from each group of correlated features;
# the remaining members of every group are listed here for removal.
drop_list = [
    'perimeter_mean',
    'radius_mean',
    'compactness_mean',
    'concave points_mean',
    'radius_se',
    'perimeter_se',
    'radius_worst',
    'perimeter_worst',
    'compactness_worst',
    'concave points_worst',
    'compactness_se',
    'concave points_se',
    'texture_worst',
    'area_worst',
]
# drop() returns a new frame here, so x keeps every column for later sections.
x_1 = x.drop(columns=drop_list)
x_1.head()

In [None]:
#correlation map
f,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(x_1.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

### Transformação dos dados categóricos para numérico

-- https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import LabelEncoder

#Binary format, with B=0 and M=1
#y = y.replace("B", 0)
#y = y.replace("M", 1)

# To obtain the classes as integers we use scikit-learn's LabelEncoder.
# After this line y is the encoder's integer output, no longer the original
# label column.
le = LabelEncoder()
y = le.fit_transform(y)

# split data train 70 % and test 30 %
# NOTE(review): this trains on every normalised feature (data_normal), not on
# the correlation-reduced table x_1 built above — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(data_normal, y, test_size=0.3, random_state=42)

# Random forest with the library's default hyperparameters (the default
# number of trees depends on the installed scikit-learn version).
clf_rf = RandomForestClassifier(random_state=43)
clr_rf = clf_rf.fit(x_train,y_train)

# Predict once and reuse the result for every metric instead of re-running
# the forest for each score.
y_pred = clf_rf.predict(x_test)

recall = recall_score(y_test,y_pred)
print('Recall is: ', recall)
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy is: ', accuracy)
f1 = f1_score(y_test,y_pred)
print('F1 score is: ', f1)
cm = confusion_matrix(y_test,y_pred)
sns.heatmap(cm,annot=True,fmt="d")

## 2) Recursive feature elimination (RFE) with Random Forest

RFE uses a classification method (random forest in our example) to assign a weight to each feature. The features whose absolute weights are the smallest are pruned from the current set of features. This procedure is repeated recursively on the pruned set until the desired number of features is reached.

In [None]:
from sklearn.feature_selection import RFE

# 70 % train / 30 % test, this time drawn from x (all features) rather than
# the reduced table x_1.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

# Build the RFE wrapper around a random forest and fit it: one feature is
# removed per step until five remain.
clf_rf_2 = RandomForestClassifier(random_state=50)
rfe = RFE(estimator=clf_rf_2, n_features_to_select=5, step=1).fit(x_train, y_train)

In [None]:
print('Atributos abordagem rfe:',x_train.columns[rfe.support_])

In [None]:
type(x_train.columns[rfe.support_])

The 5 best features chosen by RFE are different from the ones obtained with the previous naive method. Therefore we need to calculate the recall again.

In [None]:
# Evaluate the RFE-wrapped model on the held-out split. Predictions are
# computed once and shared by all metrics instead of calling predict() for
# each one.
y_pred_rfe = rfe.predict(x_test)

recall = recall_score(y_test,y_pred_rfe)
print('Recall is: ', recall)
accuracy = accuracy_score(y_test,y_pred_rfe)
print('Accuracy is: ', accuracy)
f1 = f1_score(y_test,y_pred_rfe)
print('F1 score is: ', f1)
cm = confusion_matrix(y_test,y_pred_rfe)
sns.heatmap(cm,annot=True,fmt="d")

## 3) Recursive feature elimination with cross validation and Random Forest classification


In [None]:
from sklearn.feature_selection import RFECV

# Let cross-validation decide how many features to keep: 5 folds, one
# feature eliminated per step, candidates compared by recall.
clf_rf_3 = RandomForestClassifier(random_state=43)
rfecv = RFECV(
    estimator=clf_rf_3,
    step=1,
    cv=5,
    scoring='recall',
).fit(x_train, y_train)

# Report the size of the winning subset and the names of its columns.
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])

In [None]:
type(x_train.columns[rfecv.support_])

In [None]:
# Plot number of features VS. cross-validation scores
import matplotlib.pyplot as plt

# Newer scikit-learn releases expose the per-subset mean CV score through
# cv_results_ and no longer provide grid_scores_; fall back to the old
# attribute only when cv_results_ is not available.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
# Entry i corresponds to i + 1 selected features (step=1, starting from 1).
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

## 4) Utilizando abordagem Filtro

In [None]:
# Import the necessary libraries first
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
#Feature selection with Mutual Information
from functools import partial

import numpy
import pandas
from pandas.plotting import scatter_matrix
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif

# split data train 70 % and test 30 %
#x_train, x_test, y_train, y_test = train_test_split(x_1, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(data_normal, y, test_size=0.3, random_state=40)

# Alternative scoring functions are kept below for experimentation.
# mutual_info_classif is a randomised estimator, so its seed is pinned here;
# otherwise the five selected features could change from one run to the next
# while every other step in this notebook is seeded.
#selector = SelectKBest(score_func=chi2, k=3)
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=0), k=5)
#selector = SelectKBest(score_func=f_classif, k=42)

# Fit on the training split only so the test split does not influence the
# choice of features.
#selector.fit(data_normal, y)
selector.fit(x_train, y_train)

# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
print(cols)

# print the scores
#for i, score in enumerate(selector.scores_):
#    print('Feature %d: %f' % (i, score))

# plot the scores
#plt.bar(range(len(selector.scores_)), selector.scores_)
#plt.show()

In [None]:
# Keep only the columns chosen by `selector` in both splits, discarding the
# rest of the features.
# NOTE(review): the transform output is presumably a plain array rather than a
# DataFrame (scikit-learn's default), so `.columns` would no longer be
# available on x_train / x_test after this cell — confirm.

x_train = selector.transform(x_train)
x_test = selector.transform(x_test)

In [None]:
# Random forest with the library's default hyperparameters (the default
# number of trees depends on the installed scikit-learn version), trained on
# the filter-selected features only.
clf_rf = RandomForestClassifier(random_state=50)
clr_rf = clf_rf.fit(x_train,y_train)

# Predict once and reuse the result for every metric instead of re-running
# the forest for each score.
y_pred_filter = clf_rf.predict(x_test)

recall = recall_score(y_test,y_pred_filter)
print('Recall is: ', recall)
accuracy = accuracy_score(y_test,y_pred_filter)
print('Accuracy is: ', accuracy)
f1 = f1_score(y_test,y_pred_filter)
print('F1 score is: ', f1)
cm = confusion_matrix(y_test,y_pred_filter)
sns.heatmap(cm,annot=True,fmt="d")

### Nome dos atributos

In [None]:
colname_Filter = data_normal.columns[cols]
print (colname_Filter)


In [None]:
type(colname_Filter)

### Avaliação entre as features (Filter x Wrapper)

In [None]:
# Names of the features kept by RFECV (the wrapper approach).
# x_train was overwritten with the output of selector.transform() in the
# filter section, so it can no longer be relied on for column names; x is the
# full feature table whose split RFECV was fitted on, so its columns line up
# with rfecv.support_.
wrapperFeatures = x.columns[rfecv.support_]

In [None]:
set(wrapperFeatures.difference(colname_Filter))

In [None]:
wrapperFeatures.equals(colname_Filter)

In [None]:
set(colname_Filter.difference(wrapperFeatures))

In [None]:
len(colname_Filter)

In [None]:
len(wrapperFeatures)

In [None]:
set(wrapperFeatures) - set(colname_Filter)


In [None]:
set(wrapperFeatures).intersection(colname_Filter)


In [None]:
set(colname_Filter).intersection(wrapperFeatures)


In [None]:
set(colname_Filter) ^ set(wrapperFeatures)


# Atividade Prática

- Base: https://www.kaggle.com/houcembenmansour/predict-diabetes-based-on-diagnostic-measures

- Avaliar os atributos a partir do gráfico violin plot
- Heatmap

- Desenvolver um modelo preditivo baseado em Random Forest com todos os atributos

- Avaliação das features
- RFE (Wrapper)
- Mutual information

- Apresentar o conjunto mínimo com perda entre 1-2% das métricas com relação a todos os atributos
- Mostrar os atributos diferentes (iguais) entre mutual information e RFE

