In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [44]:
cols = ["ID","cThick","UCSize", "UCShape", "Adhesion", "CECSize", "Bare", "Bland", "Normal", "Mitoses","class"]
df = pd.read_csv("breast-cancer-wisconsin.data", names=cols)
df.drop("Bare", inplace=True, axis=1)
df.head()

Unnamed: 0,ID,cThick,UCSize,UCShape,Adhesion,CECSize,Bland,Normal,Mitoses,class
0,1000025,5,1,1,1,2,3,1,1,2
1,1002945,5,4,4,5,7,3,2,1,2
2,1015425,3,1,1,1,2,3,1,1,2
3,1016277,6,8,8,1,3,3,7,1,2
4,1017023,4,1,1,3,2,3,1,1,2


In [47]:
train, valid, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

  return bound(*args, **kwds)


In [48]:
def scale_dataset(dataframe, oversample=False):
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    if oversample:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)
        
    x = scaler.fit_transform(x)
    data = np.hstack((x,np.reshape(y, (-1,1))))
    return data, x, y

In [49]:
train, X_train, Y_train = scale_dataset(train, oversample=True)
valid, X_valid, Y_valid = scale_dataset(valid, oversample=False)
test, X_test, Y_test = scale_dataset(test, oversample=False)

# Importing ML algorithms and Classification Report

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# kNN

In [34]:
knn_model = KNeighborsClassifier(n_neighbors=7)
knn_model.fit(X_train, Y_train)
Y_pred = knn_model.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           2       1.00      0.92      0.96        98
           4       0.84      1.00      0.91        42

    accuracy                           0.94       140
   macro avg       0.92      0.96      0.94       140
weighted avg       0.95      0.94      0.94       140



# Naive Bayes

In [38]:
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)
Y_pred = nb_model.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           2       1.00      0.89      0.94        98
           4       0.79      1.00      0.88        42

    accuracy                           0.92       140
   macro avg       0.90      0.94      0.91       140
weighted avg       0.94      0.92      0.92       140



# Logistic Regression

In [39]:
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)
Y_pred = lr_model.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           2       1.00      0.91      0.95        98
           4       0.82      1.00      0.90        42

    accuracy                           0.94       140
   macro avg       0.91      0.95      0.93       140
weighted avg       0.95      0.94      0.94       140



# SVC

In [41]:
svc_model = SVC()
svc_model.fit(X_train, Y_train)
Y_pred = nb_model.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           2       1.00      0.89      0.94        98
           4       0.79      1.00      0.88        42

    accuracy                           0.92       140
   macro avg       0.90      0.94      0.91       140
weighted avg       0.94      0.92      0.92       140

