## Importing the libraries

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# import pathlib
# pathlib.Path().resolve()

In [2]:
# Load the table and map +/-inf to NaN in one chained expression (no in-place
# mutation), so infinities are handled as missing values from here on.
dataset = pd.read_csv('./Datasets/Breastcancer.csv').replace([np.inf, -np.inf], np.nan)

# Every column except the last is a feature; the last column is the label.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Sanity check: (rows, columns) of the loaded frame; the column count includes the label column.
dataset.shape

(158, 13583)

## Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its column.
# X already excludes the label column, so the imputer must see *every* column:
# slicing X[:, :-1] here would leave the last feature un-imputed, and any NaN
# remaining in it would break the scaling / chi-square steps further down.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Written back in place so X keeps its shape and dtype; a shape mismatch here
# (e.g. the imputer dropping an all-NaN column) fails loudly instead of
# silently changing the number of features.
X[:, :] = imputer.fit_transform(X)

In [6]:
# Inspect the feature matrix after the imputation step.
print(X)

[[ 0.425 -0.316  0.465 ...  0.323 -0.4    0.064]
 [ 0.694 -0.497  0.2   ...  0.505 -0.275  0.3  ]
 [ 0.304 -0.205  0.739 ...  0.047 -0.98   0.233]
 ...
 [ 0.041  0.038  0.774 ...  0.298 -0.679  0.099]
 [ 0.139 -0.353  0.643 ... -0.144 -0.467  0.344]
 [-0.04   0.01   1.086 ...  1.654 -1.363  0.28 ]]


### Encoding the Dependent Variable

In [7]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct class label to an integer code; `le` keeps the mapping.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [8]:
# Show the integer-encoded class labels produced by the LabelEncoder.
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4
 4 4 4 4 4 4 4 4 4 4]


### Min-Max scaling

In [9]:
from sklearn import preprocessing

# Rescale each feature with MinMaxScaler (default range [0, 1]); chi2 requires
# non-negative inputs.
# NOTE(review): despite the "_train_" in the name, the scaler is fit on every
# row of X -- no train/test split has happened at this point.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X_train_minmax = min_max_scaler.transform(X)

### Chi-square feature selection

In [10]:
from sklearn.feature_selection import chi2

In [11]:
# chi2 returns a pair: (chi-square statistic per feature, p-value per feature).
chi_scores = chi2(X=X_train_minmax, y=y)
print(chi_scores)

(array([0.56288441, 0.36218066, 0.54701439, ..., 0.54502882, 0.82407793,
       0.06101989]), array([0.96709743, 0.98545451, 0.96876566, ..., 0.96897195, 0.93519124,
       0.99954393]))


In [12]:
# NOTE: X and y are rebound here. X becomes the feature DataFrame (needed for
# its column names) and y becomes the raw string 'Class' column, replacing the
# integer-encoded array created earlier.
X = dataset.drop(columns='Class')
y = dataset['Class']

# chi_scores[1] holds the p-values; label them by feature name and order them
# from largest to smallest.
p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)

In [13]:
# y is now the raw 'Class' column (string labels), not the integer-encoded array.
print(y)

0       Basal
1       Basal
2       Basal
3       Basal
4       Basal
        ...  
153    Normal
154    Normal
155    Normal
156    Normal
157    Normal
Name: Class, Length: 158, dtype: object


In [14]:
# Drawing one bar per feature (roughly 13.5k bars) is very slow and produces an
# unreadable chart, so plot only the features with the smallest p-values.
N_PLOTTED = 50  # number of most-significant features to show

fig, ax = plt.subplots(figsize=(14, 5))
p_values.nsmallest(N_PLOTTED).plot.bar(ax=ax)
ax.set(
    title="Chi-square p-values of the {} most significant features".format(N_PLOTTED),
    xlabel="Feature",
    ylabel="p-value",
)
fig.tight_layout()
plt.show()

In [15]:
from sklearn.feature_selection import SelectKBest

In [44]:
# Keep the 500 features with the highest chi-square score.
# NOTE(review): the selector is fit on all rows (and their labels) before any
# train/test split, so rows that later end up in the test set influence which
# features are kept.
chi2_selector = SelectKBest(score_func=chi2, k=500)
chi2_selector.fit(X_train_minmax, y)
X_kbest = chi2_selector.transform(X_train_minmax)
print(X_kbest)

[[0.15244293 0.70728291 0.15461672 ... 0.21867928 0.16934932 0.02485236]
 [0.09867719 0.49789916 0.13937282 ... 0.22155935 0.09434932 0.22834646]
 [0.08118199 0.92950514 0.2158101  ... 0.29623534 0.14537671 0.30573327]
 ...
 [0.59409004 0.86741363 0.42857143 ... 0.29993828 0.48202055 0.83304626]
 [0.37177299 0.36671335 0.50261324 ... 0.25097717 0.4744863  0.94291339]
 [0.41263068 0.67553688 0.41093206 ... 0.34211068 0.50325342 0.95607776]]


In [46]:
# The labels say "number of features", so report the column count (shape[1])
# rather than the full (rows, columns) tuple.
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: (158, 13582)
Reduced number of features: (158, 500)


### SVM with RBF kernel

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so class proportions in the small
# test set are not guaranteed to match the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X_kbest,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# List the distinct class labels that ended up in the held-out test split.
print(y_test.unique())

['LumB' 'Her2' 'Normal' 'LumA' 'Basal']


In [49]:
from sklearn.svm import SVC

# RBF-kernel SVM with a one-vs-rest decision function, trained on the selected
# features of the training split.
classifier = SVC(
    kernel='rbf',
    decision_function_shape='ovr',
    random_state=42,
)
classifier.fit(X_train, y_train)

SVC(random_state=42)

In [1]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out rows.
y_pred = classifier.predict(X_test)

# Pin the row/column order to the classifier's own class list. Without
# `labels`, a class that appears in neither y_test nor y_pred is dropped from
# the matrix, which would shrink it and misalign the per-class metrics
# computed from `cm` below.
cm = confusion_matrix(y_test, y_pred, labels=classifier.classes_)
print(cm)

# Overall fraction of correctly classified test rows (the per-class figures
# derived from `cm` later are one-vs-rest values, not this number).
print("Test accuracy: {:.2f} %".format(accuracy_score(y_test, y_pred) * 100))

NameError: name 'classifier' is not defined

### 10-fold cross-validation

In [51]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training split only.
# NOTE(review): the features in X_train were selected using all rows, so these
# scores are likely optimistic.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
cv_mean_pct = accuracies.mean() * 100
cv_std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(cv_mean_pct))
print("Standard Deviation: {:.2f} %".format(cv_std_pct))

Accuracy: 88.72 %
Standard Deviation: 7.68 %




### Metrics calculation

In [52]:
# Per-class one-vs-rest counts read off the confusion matrix
# (rows = true class, columns = predicted class).
TP = np.diag(cm)                 # correctly predicted as the class
FP = cm.sum(axis=0) - TP         # predicted as the class but actually another
FN = cm.sum(axis=1) - TP         # actually the class but predicted as another
TN = cm.sum() - (FP + FN + TP)   # everything else

for counts in (FP, FN, TP, TN):
    print(counts)



[0 2 2 0 1]
[0 0 1 2 2]
[9 3 9 4 2]
[23 27 20 26 27]


In [53]:
# Per-class one-vs-rest rates built from the TP / FP / FN / TN count vectors.
# NOTE(review): a class that is never predicted (TP + FP == 0) makes PPV and
# FDR come out as nan for that class, which then propagates into any average.
actual_pos = TP + FN       # rows whose true label is the class
actual_neg = TN + FP       # rows whose true label is another class
predicted_pos = TP + FP    # rows predicted as the class
predicted_neg = TN + FN    # rows predicted as another class

TPR = TP / actual_pos      # sensitivity, hit rate, recall, true positive rate
TNR = TN / actual_neg      # specificity, true negative rate
PPV = TP / predicted_pos   # precision, positive predictive value
NPV = TN / predicted_neg   # negative predictive value
FPR = FP / actual_neg      # fall-out, false positive rate
FNR = FN / actual_pos      # false negative rate
FDR = FP / predicted_pos   # false discovery rate

# Per-class (one-vs-rest) accuracy: one value per class, not a single overall
# accuracy for the classifier.
ACC = (TP + TN) / (actual_pos + actual_neg)


In [54]:
# Per-class metric vectors, one entry per class in confusion-matrix order.
per_class_report = "PPV:{}\nNPV:{}\nSensitivity:{}\nSpecificity:{}".format(PPV, NPV, TPR, TNR)
print(per_class_report)
print('\n')
print("Accuracy:", ACC)

PPV:[1.         0.6        0.81818182 1.         0.66666667]
NPV:[1.         1.         0.95238095 0.92857143 0.93103448]
Sensitivity:[1.         1.         0.9        0.66666667 0.5       ]
Specificity:[1.         0.93103448 0.90909091 1.         0.96428571]


Accuracy: [1.      0.9375  0.90625 0.9375  0.90625]


In [55]:
# Unweighted (macro) average of each per-class metric: every class counts
# equally, regardless of how many test rows it has.
avg_PPV, avg_NPV, avg_TPR, avg_TNR, avg_ACC = (
    np.average(metric) for metric in (PPV, NPV, TPR, TNR, ACC)
)
print("PPV:{:.2f}\nNPV:{:.2f}\nSensitivity:{:.2f}\nSpecificity:{:.2f}\nAccuracy:{:.2f}".format(avg_PPV, avg_NPV, avg_TPR, avg_TNR, avg_ACC))

PPV:0.82
NPV:0.96
Sensitivity:0.81
Specificity:0.96
Accuracy:0.94
