## 1.1 Importing the libraries

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

## 1.2 Reading the preprocessed data

In [2]:
X=pd.read_csv('BreastCancerPreprocessedDataX.csv')
y=pd.read_csv('BreastCancerPreprocessedDatay.csv',names=['diagnosis'])
X.drop(columns=['Unnamed: 0'], inplace=True)
X.head()

Unnamed: 0,radius_mean,texture_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se
0,0.521037,0.022658,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,0.356147,0.120469,0.159296,0.351398,0.135682,0.300625,0.311645,0.183042
1,0.643144,0.272574,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,0.156437,0.082589,0.119387,0.081323,0.04697,0.253836,0.084539,0.09111
2,0.601496,0.39026,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,0.229622,0.094303,0.150831,0.283955,0.096768,0.389847,0.20569,0.127006
3,0.21009,0.360839,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,0.139091,0.175875,0.251453,0.543215,0.142955,0.353665,0.728148,0.287205
4,0.629893,0.156578,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,0.233822,0.093065,0.332359,0.167918,0.143636,0.357075,0.136179,0.1458


Replacing the labels with 0 and 1 ('M' with 0 and 'B' with 1)

In [3]:
y.loc[y.diagnosis=='M']=0
y.loc[y.diagnosis=='B']=1
y.head()

Unnamed: 0,diagnosis
0,0
1,0
2,0
3,0
4,0


## 1.3 Splitting the data (70% training and 30% test data)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

## 2.1 Using Logistic Regression for prediction

In [5]:
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## 2.2 Results

In [6]:
y_pred=model.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

cfm = confusion_matrix(y_test, y_pred)

true_negative = cfm[0][0]
false_positive = cfm[0][1]
false_negative = cfm[1][0]
true_positive = cfm[1][1]

print('Confusion Matrix: \n', cfm, '\n')

print('True Negative:', true_negative)
print('False Positive:', false_positive)
print('False Negative:', false_negative)
print('True Positive:', true_positive)
print('Correct Predictions', 
      round((true_negative + true_positive) / len(y_pred) * 100, 1), '%')

              precision    recall  f1-score   support

           0      0.962     0.911     0.936        56
           1      0.958     0.983     0.970       115

   micro avg      0.959     0.959     0.959       171
   macro avg      0.960     0.947     0.953       171
weighted avg      0.959     0.959     0.959       171

Confusion Matrix: 
 [[ 51   5]
 [  2 113]] 

True Negative: 51
False Positive: 5
False Negative: 2
True Positive: 113
Correct Predictions 95.9 %


## 3.1 Using K Nearest Neighbors (KNN) Algorithm for prediction

Finding the best value of k

In [7]:
for i in range(1,20):
    knn=KNeighborsClassifier(n_neighbors=i)
    model=knn.fit(X_train, y_train.values.ravel())
    y_pred=model.predict(X_test)
    w=accuracy_score(y_test.values.ravel(), y_pred.ravel())
    print(i)
    print(w)

1
0.9649122807017544
2
0.9415204678362573
3
0.9590643274853801
4
0.9415204678362573
5
0.9590643274853801
6
0.9473684210526315
7
0.9473684210526315
8
0.9532163742690059
9
0.9532163742690059
10
0.9532163742690059
11
0.9707602339181286
12
0.9532163742690059
13
0.9649122807017544
14
0.9590643274853801
15
0.9649122807017544
16
0.9532163742690059
17
0.9590643274853801
18
0.9590643274853801
19
0.9649122807017544


We get the best accuracy for k=11.

In [8]:
knn=KNeighborsClassifier(n_neighbors=i)
model=knn.fit(X_train, y_train.values.ravel())

## 3.2 Results

In [9]:
y_pred=model.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

cfm = confusion_matrix(y_test, y_pred)

true_negative = cfm[0][0]
false_positive = cfm[0][1]
false_negative = cfm[1][0]
true_positive = cfm[1][1]

print('Confusion Matrix: \n', cfm, '\n')

print('True Negative:', true_negative)
print('False Positive:', false_positive)
print('False Negative:', false_negative)
print('True Positive:', true_positive)
print('Correct Predictions', 
      round((true_negative + true_positive) / len(y_pred) * 100, 1), '%')

              precision    recall  f1-score   support

           0      0.931     0.964     0.947        56
           1      0.982     0.965     0.974       115

   micro avg      0.965     0.965     0.965       171
   macro avg      0.957     0.965     0.961       171
weighted avg      0.966     0.965     0.965       171

Confusion Matrix: 
 [[ 54   2]
 [  4 111]] 

True Negative: 54
False Positive: 2
False Negative: 4
True Positive: 111
Correct Predictions 96.5 %


## 4.1 Using Support Vector Machines (SVM) Algorithm for prediction

In [10]:
model=svm.SVC(gamma='auto')
model.fit(X_train, y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## 4.2 Results

In [11]:
y_pred=model.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

cfm = confusion_matrix(y_test, y_pred)

true_negative = cfm[0][0]
false_positive = cfm[0][1]
false_negative = cfm[1][0]
true_positive = cfm[1][1]

print('Confusion Matrix: \n', cfm, '\n')

print('True Negative:', true_negative)
print('False Positive:', false_positive)
print('False Negative:', false_negative)
print('True Positive:', true_positive)
print('Correct Predictions', 
      round((true_negative + true_positive) / len(y_pred) * 100, 1), '%')

              precision    recall  f1-score   support

           0      0.979     0.839     0.904        56
           1      0.927     0.991     0.958       115

   micro avg      0.942     0.942     0.942       171
   macro avg      0.953     0.915     0.931       171
weighted avg      0.944     0.942     0.940       171

Confusion Matrix: 
 [[ 47   9]
 [  1 114]] 

True Negative: 47
False Positive: 9
False Negative: 1
True Positive: 114
Correct Predictions 94.2 %
