In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [6]:
df = pd.read_csv('data/breast-cancer-wisconsin.data', header=None)

In [7]:
import pyperclip as ppc

In [8]:
pd.Series(ppc.paste().split('\n')[2:]).apply(lambda x: x.split('.')[1].strip().split('  ')[0]).to_clipboard(header=False, index=False)

In [9]:
df.columns = ppc.paste().split('\r\n')[:-1]

In [10]:
df.drop(df.columns[0], axis=1, inplace=True)

In [11]:
df.dtypes

Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class:                          int64
dtype: object

In [12]:
df.drop(df[(df == '?').sum(axis=1).astype(bool)].index).astype(int)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class:
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [13]:
df[(df == '?').sum(axis=1).astype(bool)].index

Int64Index([23, 40, 139, 145, 158, 164, 235, 249, 275, 292, 294, 297, 315, 321,
            411, 617],
           dtype='int64')

In [16]:
df = df.drop(df[(df == '?').sum(axis=1).astype(bool)].index).astype(int)

In [76]:
px.imshow(df.corr())

In [17]:
df.dtypes

Clump Thickness                int32
Uniformity of Cell Size        int32
Uniformity of Cell Shape       int32
Marginal Adhesion              int32
Single Epithelial Cell Size    int32
Bare Nuclei                    int32
Bland Chromatin                int32
Normal Nucleoli                int32
Mitoses                        int32
Class:                         int32
dtype: object

In [18]:
X = df.drop('Class:', axis=1)
y = df['Class:']

In [32]:
from sklearn.model_selection import train_test_split, KFold

# Varying train-test ratio

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [23]:
nb = CategoricalNB()
kn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

print('Tets Size 25%')
nb.fit(X_train, y_train)
print('Naive-Bayes Classifier Accuracy: {:>52.3f}%'.format(nb.score(X_test, y_test)*100))

kn.fit(X_train, y_train)
print('K-Nearest Neighbors Classifier Accuracy: {:>44.3f}%'.format(kn.score(X_test, y_test)*100))

dt.fit(X_train, y_train)
print('Decision Tree Neighbors Classifier Accuracy: {:>40.3f}%'.format(dt.score(X_test, y_test)*100))

rf.fit(X_train, y_train)
print('Random Forest Neighbors Classifier Accuracy: {:>40.3f}%'.format(rf.score(X_test, y_test)*100))

Tets Size 25%
Naive-Bayes Classifier Accuracy:                                               97.076%
K-Nearest Neighbors Classifier Accuracy:                                       97.076%
Decision Tree Neighbors Classifier Accuracy:                                   93.567%
Random Forest Neighbors Classifier Accuracy:                                   96.491%


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

In [25]:
nb = CategoricalNB()
kn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

nb.fit(X_train, y_train)
print('Naive-Bayes Classifier Accuracy: {:>52.3f}%'.format(nb.score(X_test, y_test)*100))

kn.fit(X_train, y_train)
print('K-Nearest Neighbors Classifier Accuracy: {:>44.3f}%'.format(kn.score(X_test, y_test)*100))

dt.fit(X_train, y_train)
print('Decision Tree Neighbors Classifier Accuracy: {:>40.3f}%'.format(dt.score(X_test, y_test)*100))

rf.fit(X_train, y_train)
print('Random Forest Neighbors Classifier Accuracy: {:>40.3f}%'.format(rf.score(X_test, y_test)*100))

Naive-Bayes Classifier Accuracy:                                               96.053%
K-Nearest Neighbors Classifier Accuracy:                                       95.614%
Decision Tree Neighbors Classifier Accuracy:                                   93.860%
Random Forest Neighbors Classifier Accuracy:                                   96.053%


# Varying splitting method

In [56]:
nb = CategoricalNB()
kn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

k=20
nba = []
for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    nb.fit(X_train, y_train)
    nba.append(nb.score(X_test, y_test))
print('Naive-Bayes Classifier Accuracy: {:>52.3f}%'.format(np.mean(nba)*100))

kna = []
for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    kn.fit(X_train, y_train)
    kna.append(kn.score(X_test, y_test))
print('K-Nearest Neighbors Classifier Accuracy: {:>44.3f}%'.format(np.mean(kna)*100))

dta = []
for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    dt.fit(X_train, y_train)
    dta.append(dt.score(X_test, y_test))
print('Decision Tree Neighbors Classifier Accuracy: {:>40.3f}%'.format(np.mean(dta)*100))

rfa = []
for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    rf.fit(X_train, y_train)
    rfa.append(rf.score(X_test, y_test))
print('Random Forest Neighbors Classifier Accuracy: {:>40.3f}%'.format(np.mean(rfa)*100))

Naive-Bayes Classifier Accuracy:                                               97.000%
K-Nearest Neighbors Classifier Accuracy:                                       96.780%
Decision Tree Neighbors Classifier Accuracy:                                   93.366%
Random Forest Neighbors Classifier Accuracy:                                   97.171%


In [57]:
train, test = list(kf.split(X, y))[0]

In [58]:
X.iloc[train]

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
71,6,10,2,8,10,2,7,8,10
72,1,3,3,2,2,1,7,2,1
73,9,4,5,10,6,10,4,8,1
74,10,6,4,1,3,4,3,2,3
75,1,1,2,1,2,2,4,2,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [59]:
X.iloc[test]

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
66,4,1,1,1,2,1,3,1,1
67,5,3,4,1,8,10,4,9,1
68,8,3,8,3,4,9,8,9,8
69,1,1,1,1,2,1,3,2,1


In [60]:
k = 10
kf = KFold(k)

nba = []
for train, test in kf.split(X, y):
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    nb.fit(X_train, y_train)
    nba.append(nb.score(X_test, y_test))
print('Naive-Bayes Classifier Accuracy: {:>52.3f}%'.format(np.mean(nba)*100))

kna = []
for train, test in kf.split(X, y):
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    kn.fit(X_train, y_train)
    kna.append(kn.score(X_test, y_test))
print('K-Nearest Neighbors Classifier Accuracy: {:>44.3f}%'.format(np.mean(kna)*100))

dta = []
for train, test in kf.split(X, y):
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    dt.fit(X_train, y_train)
    dta.append(dt.score(X_test, y_test))
print('Decision Tree Neighbors Classifier Accuracy: {:>40.3f}%'.format(np.mean(dta)*100))

rfa = []
for train, test in kf.split(X, y):
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    rf.fit(X_train, y_train)
    rfa.append(rf.score(X_test, y_test))
print('Random Forest Neighbors Classifier Accuracy: {:>40.3f}%'.format(np.mean(rfa)*100))

Naive-Bayes Classifier Accuracy:                                               97.361%
K-Nearest Neighbors Classifier Accuracy:                                       96.931%
Decision Tree Neighbors Classifier Accuracy:                                   93.274%
Random Forest Neighbors Classifier Accuracy:                                   96.782%


# Standardization of Data

In [61]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [62]:
sc = StandardScaler()
mms = MinMaxScaler()

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [64]:
sX_train, sX_test = sc.fit_transform(X_train), sc.transform(X_test)

In [65]:
mX_train, mX_test = mms.fit_transform(X_train), mms.transform(X_test)

In [67]:
nb = CategoricalNB()
kn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

print('Standard Scaling')
# nb.fit(sX_train, y_train)
# print('Naive-Bayes Classifier Accuracy: {:>52.3f}%'.format(nb.score(X_test, y_test)*100))

kn.fit(sX_train, y_train)
print('K-Nearest Neighbors Classifier Accuracy: {:>44.3f}%'.format(kn.score(sX_test, y_test)*100))

dt.fit(sX_train, y_train)
print('Decision Tree Neighbors Classifier Accuracy: {:>40.3f}%'.format(dt.score(sX_test, y_test)*100))

rf.fit(sX_train, y_train)
print('Random Forest Neighbors Classifier Accuracy: {:>40.3f}%'.format(rf.score(sX_test, y_test)*100))

Standard Scaling
K-Nearest Neighbors Classifier Accuracy:                                       32.683%
Decision Tree Neighbors Classifier Accuracy:                                   32.683%
Random Forest Neighbors Classifier Accuracy:                                   32.683%


In [74]:
nb = CategoricalNB()
kn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

print('Min Max Scaling')
# nb.fit(mX_train, y_train)
# print('Naive-Bayes Classifier Accuracy: {:>52.3f}%'.format(nb.score(X_test, y_test)*100))

kn.fit(mX_train, y_train)
print('K-Nearest Neighbors Classifier Accuracy: {:>44.3f}%'.format(kn.score(X_test, y_test)*100))

dt.fit(mX_train, y_train)
print('Decision Tree Neighbors Classifier Accuracy: {:>40.3f}%'.format(dt.score(X_test, y_test)*100))

rf.fit(mX_train, y_train)
print('Random Forest Neighbors Classifier Accuracy: {:>40.3f}%'.format(rf.score(X_test, y_test)*100))

Min Max Scaling
K-Nearest Neighbors Classifier Accuracy:                                       32.683%
Decision Tree Neighbors Classifier Accuracy:                                   32.683%
Random Forest Neighbors Classifier Accuracy:                                   32.683%
