## Comparing accuracy scores and k fold for all models

### Data preprocessing

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
dataset = pd.read_csv('breast_cancer.csv')
dataset = dataset.drop("Sample code number", axis='columns')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
dataset

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
678,3,1,1,1,3,2,1,1,1,2
679,2,1,1,1,2,1,1,1,1,2
680,5,10,10,3,7,3,8,10,2,4
681,4,8,6,4,3,4,10,6,1,4


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

### Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression()
classifier_lr.fit(X_train, y_train)

In [9]:
y_pred_lr = classifier_lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
cm_lr = confusion_matrix(y_test, y_pred_lr)

In [10]:
accuracies_lr = cross_val_score(estimator = classifier_lr, X = X_train, y = y_train, cv = 10)
kf_acc_lr = accuracies_lr.mean().round(4)*100
std_lr = accuracies_lr.std().round(4)*100

### KNN 

In [11]:
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

In [12]:
y_pred_knn = classifier_knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
cm_knn = confusion_matrix(y_test, y_pred_knn)

In [13]:
accuracies_knn = cross_val_score(estimator = classifier_knn, X = X_train, y = y_train, cv = 10)
kf_acc_knn = accuracies_knn.mean().round(4)*100
std_knn = accuracies_knn.std().round(4)*100

### Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB
classifier_nb = GaussianNB()
classifier_nb.fit(X_train, y_train)

In [15]:
y_pred_nb = classifier_nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)
cm_nb = confusion_matrix(y_test, y_pred_nb)

In [None]:
accuracies_nb = cross_val_score(estimator = classifier_nb, X = X_train, y = y_train, cv = 10)
kf_acc_nb = accuracies_nb.mean().round(4)*100
std_nb = accuracies_nb.std().round(4)*100

### SVC

In [17]:
from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'linear', random_state = 0)
classifier_svc.fit(X_train, y_train)

In [18]:
y_pred_svc = classifier_svc.predict(X_test)
acc_svc = accuracy_score(y_test, y_pred_svc)
cm_svc = confusion_matrix(y_test, y_pred_svc)

In [19]:
accuracies_svc = cross_val_score(estimator = classifier_svc, X = X_train, y = y_train, cv = 10)
kf_acc_svc = accuracies_svc.mean().round(4)*100
std_svc = accuracies_svc.std().round(4)*100

### Kernel SVM

In [20]:
from sklearn.svm import SVC
classifier_ksvm = SVC(kernel = 'rbf', random_state = 0)
classifier_ksvm.fit(X_train, y_train)

In [21]:
y_pred_ksvm = classifier_ksvm.predict(X_test)
acc_ksvm = accuracy_score(y_test, y_pred_ksvm)
cm_ksvm = confusion_matrix(y_test, y_pred_ksvm)

In [22]:
accuracies_ksvm = cross_val_score(estimator = classifier_ksvm, X = X_train, y = y_train, cv = 10)
kf_acc_ksvm = accuracies_ksvm.mean().round(4)*100
std_ksvm = accuracies_ksvm.std().round(4)*100

### Descision tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(X_train, y_train)

In [24]:
y_pred_dt = classifier_dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
cm_dt = confusion_matrix(y_test, y_pred_dt)

In [25]:
accuracies_dt = cross_val_score(estimator = classifier_dt, X = X_train, y = y_train, cv = 10)
kf_acc_dt = accuracies_dt.mean().round(4)*100
std_dt = accuracies_dt.std().round(4)*100

### Random forest classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
classifier_rf.fit(X_train, y_train)

In [27]:
y_pred_rf = classifier_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)

In [28]:
accuracies_rf = cross_val_score(estimator = classifier_rf, X = X_train, y = y_train, cv = 10)
kf_acc_rf = accuracies_rf.mean().round(4)*100
std_rf = accuracies_rf.std().round(4)*100

### Comparing accuracies

In [30]:
from tabulate import tabulate
table = [["Model", "Accuracy", "Confusion matrix", "Kfold accuracy", "Kfold std"],
        ["Logistic regression", acc_lr*100, cm_lr, kf_acc_lr, std_lr],
        ["K Nearest Neighbours", acc_knn*100, cm_knn, kf_acc_knn, std_knn],
        ["Naive Bayes", acc_nb*100, cm_nb, kf_acc_nb, std_nb],
        ["SVC", acc_svc*100, cm_svc, kf_acc_svc, std_svc],
        ["Kernel SVM", acc_ksvm*100, cm_ksvm, kf_acc_ksvm, std_ksvm],
        ["Descision tree", acc_dt*100, cm_dt, kf_acc_dt, std_dt],
        ["Random forest", acc_rf*100, cm_rf, kf_acc_rf, std_rf]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒══════════════════════╤════════════╤════════════════════╤══════════════════╤═════════════╕
│ Model                │   Accuracy │ Confusion matrix   │   Kfold accuracy │   Kfold std │
╞══════════════════════╪════════════╪════════════════════╪══════════════════╪═════════════╡
│ Logistic regression  │    94.7368 │ [[103   4]         │            96.68 │        1.53 │
│                      │            │  [  5  59]]        │                  │             │
├──────────────────────┼────────────┼────────────────────┼──────────────────┼─────────────┤
│ K Nearest Neighbours │    94.7368 │ [[103   4]         │            97.27 │        2.17 │
│                      │            │  [  5  59]]        │                  │             │
├──────────────────────┼────────────┼────────────────────┼──────────────────┼─────────────┤
│ Naive Bayes          │    94.152  │ [[99  8]           │            96.88 │        2.17 │
│                      │            │  [ 2 62]]          │                  │   