In [34]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron

In [2]:
input_file = "data/breastcancer.csv"
df = pd.read_csv(input_file, header=0)
df.head()

Unnamed: 0,STK35,DPYSL4,GJC2,FMNL1,LIG3,CA6,BRPF1,BRMS1,CXCL13,SIAE,...,ADRB1,DRD4,GABRR1,KRT10,PIP,MS4A3,SECISBP2L,EMP3,HOXB1,Class
0,0.425,-0.316,0.465,-0.142,-0.033,0.357,-0.197,-0.28,3.164,-0.086,...,-0.188,-0.015,-0.035,0.722,0.131,-0.545,0.323,-0.4,0.064,Basal
1,0.694,-0.497,0.2,-0.215,0.852,0.3,-0.864,-0.222,6.179,-0.649,...,0.2,-0.416,0.293,-0.795,1.785,-1.224,0.505,-0.275,0.3,Basal
2,0.304,-0.205,0.739,0.176,0.726,0.715,0.298,0.147,3.612,-0.23,...,0.211,0.17,0.156,-0.733,2.414,-0.435,0.047,-0.98,0.233,Basal
3,0.309,-0.561,0.669,-0.514,0.805,0.409,-0.974,0.054,1.28,-0.15,...,-0.532,0.884,0.633,-0.586,0.163,-0.987,0.223,-1.09,0.603,Basal
4,0.732,-0.287,0.311,-0.326,-0.468,1.887,-0.347,-0.193,2.41,0.479,...,0.354,0.596,0.27,-0.282,0.601,0.164,0.04,-0.705,0.054,Basal


In [3]:
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [4]:
df.head()

Unnamed: 0,STK35,DPYSL4,GJC2,FMNL1,LIG3,CA6,BRPF1,BRMS1,CXCL13,SIAE,...,ADRB1,DRD4,GABRR1,KRT10,PIP,MS4A3,SECISBP2L,EMP3,HOXB1,Class
0,128,46,73,81,32,120,107,33,103,54,...,45,8,26,144,5,51,95,135,58,0
1,134,20,27,63,135,109,8,42,156,6,...,104,2,111,82,46,4,117,140,111,0
2,113,70,114,118,124,138,142,90,119,32,...,106,12,69,92,54,69,38,61,98,0
3,114,14,104,15,130,124,2,76,39,40,...,12,71,134,110,6,11,75,45,137,0
4,135,49,43,43,5,145,83,45,81,121,...,125,43,102,128,27,141,33,100,56,0


In [5]:
X = df.drop(columns=['Class'], axis=1)
y = df['Class']

In [6]:
def confusion_matrix_scorer(clf, X, y):
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    return {'TN': cm[0, 0], 'FP': cm[0, 1], 'FN': cm[1, 0], 'TP': cm[1, 1]
        , 'PPV': np.divide(cm[1, 1], cm[1, 1] + cm[0, 1], where=cm[1, 1] + cm[0, 1] != 0)
        , 'NPV': np.divide(cm[0, 0], cm[0, 0] + cm[1, 0], where=cm[0, 0] + cm[1, 0] != 0)
        , 'Sensitivity': np.divide(cm[1, 1], cm[1, 1] + cm[1, 0], where=cm[1, 1] + cm[1, 0] != 0)
        , 'Specificity': np.divide(cm[0, 0], cm[0, 0] + cm[0, 1], where=cm[0, 0] + cm[0, 1] != 0)
        , 'Accuracy': np.divide(cm[1, 1] + cm[0, 0], cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1],
                                where=cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1] != 0)}

In [7]:
def cross_val_show_result(clf, X, y, k):
    result = cross_validate(clf, X, y, cv=k, scoring=confusion_matrix_scorer)
    print(np.mean(result['test_PPV']))
    print(np.mean(result['test_NPV']))
    print(np.mean(result['test_Sensitivity']))
    print(np.mean(result['test_Specificity']))
    print(np.mean(result['test_Accuracy']))

In [142]:
mlp = MLPClassifier(random_state=1, max_iter=300)

In [143]:
cross_val_show_result(mlp, X, y, 10)

0.9066666666666666
1.1
1.0
0.8666666666666666
0.9199999999999999


In [37]:
randomF = RandomForestClassifier()

In [138]:
cross_val_show_result(randomF, X, y, 10)

1.0
0.9800000000000001
0.9666666666666666
1.0
0.9857142857142858


In [39]:
svm = svm.SVC(kernel="rbf")

In [139]:
cross_val_show_result(svm, X, y, 10)

0.8
0.9800000000000001
1.5666666666666667
1.0
0.9857142857142858


# Feature Selection

# Information Gain

In [118]:
sel_five_cols = SelectKBest(mutual_info_classif, k=11)
sel_five_cols.fit(X, y)
X.columns[sel_five_cols.get_support()]

Index(['AGR3', 'FOXA1', 'THSD4', 'TBC1D9', 'PPP1R14C', 'CENPF', 'GATA3',
       'AGR2', 'ATL2', 'AR', 'TFF3'],
      dtype='object')

In [119]:
removal_info_gain = X.columns[sel_five_cols.get_support()].values.tolist()

In [120]:
X_info_gain = df.filter(removal_info_gain)

In [144]:
cross_val_show_result(mlp, X_info_gain, y, 10)



1.0
1.0
1.0
1.0
1.0




In [141]:
cross_val_show_result(randomF, X_info_gain, y, 10)

1.0
1.0
1.0
1.0
1.0


In [123]:
cross_val_show_result(svm, X_info_gain, y, 10)

1.0
0.9761904761904762
0.9545454545454546
1.0
0.9838709677419355


# Chi

In [106]:
selector_chi = SelectKBest(chi2, k=11)
selector_chi.fit(X, y)
X.columns[selector_chi.get_support()]

Index(['AGR3', 'ASPM', 'FOXA1', 'CA12', 'FAM174A', 'THSD4', 'TBC1D9', 'CENPF',
       'GATA3', 'IL6ST', 'SLC7A8'],
      dtype='object')

In [107]:
removal_chi = X.columns[selector_chi.get_support()].values.tolist()

In [108]:
X_chi = df.filter(removal_chi)

In [109]:
X_chi

Unnamed: 0,AGR3,ASPM,FOXA1,CA12,FAM174A,THSD4,TBC1D9,CENPF,GATA3,IL6ST,SLC7A8
0,38,128,29,16,30,5,44,122,5,27,10
1,16,143,5,28,87,10,52,134,34,49,61
2,11,132,39,32,20,21,24,136,22,11,6
3,42,131,16,18,42,8,34,131,20,35,54
4,49,121,15,8,61,42,38,114,14,21,33
...,...,...,...,...,...,...,...,...,...,...,...
153,65,27,54,47,12,113,57,47,58,118,41
154,69,17,50,88,59,104,74,6,48,110,65
155,80,11,55,97,67,122,92,10,78,121,83
156,63,69,58,69,98,68,67,40,92,33,89


In [145]:
cross_val_show_result(mlp, X_chi, y, 2)



0.9090909090909092
0.9472222222222222
0.8944444444444444
0.9473684210526316
0.9304187192118227




In [125]:
cross_val_show_result(randomF, X_chi, y, 2)

1.0
1.0
1.0
1.0
1.0


In [126]:
cross_val_show_result(svm, X_chi, y, 2)

1.0
0.9273809523809524
0.8545454545454545
1.0
0.9488505747126437
