# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Loading and Analyzing dataset

In [2]:
breastcancer = datasets.load_breast_cancer()
breastcancer_data = breastcancer.data
breastcancer_data = pd.DataFrame(breastcancer_data, columns = breastcancer.feature_names)
breastcancer_data['class'] = breastcancer.target
breastcancer_data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
breastcancer_data.shape

(569, 31)

In [4]:
breastcancer_data.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
class                      0
dtype: int64

In [5]:
breastcancer_data.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


# Train-Test Split

In [6]:
X = breastcancer_data.values[:, 0:30]
Y = breastcancer_data.values[:,30]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 12)

# Modelling and Prediction

In [7]:
# accuracies of different models on training data
classifiers = [
    SVC(probability=True),
    LogisticRegression(),
    DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 8,
                            random_state = 0),
    RandomForestClassifier(),
    KNeighborsClassifier()]
model_dict={}
for clf in classifiers:
    clf_name=clf.__class__.__name__
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    acc_score=round(clf.score(X_train,y_train)*100,2)
    if clf_name in model_dict:
        model_dict[clf_name]+=acc_score
    else:
        model_dict[clf_name]=acc_score
        
for key,value in model_dict.items():
    print(key,value)
    



SVC 100.0
LogisticRegression 96.23
DecisionTreeClassifier 97.49
RandomForestClassifier 100.0
KNeighborsClassifier 96.23




In [8]:
# accuracies of different models on test data
model_dict={}
for clf in classifiers:
    clf_name=clf.__class__.__name__
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    acc_score=round(clf.score(X_test,y_test)*100,2)
    if clf_name in model_dict:
        model_dict[clf_name]+=acc_score
    else:
        model_dict[clf_name]=acc_score
        
for key,value in model_dict.items():
    print(key,value)
    



SVC 62.57
LogisticRegression 93.57
DecisionTreeClassifier 90.06
RandomForestClassifier 92.4
KNeighborsClassifier 88.89


In [9]:
# best model with least difference in training and test set accuracies
model_dict={}
for clf in classifiers:
    clf_name=clf.__class__.__name__
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    acc_score_train=round(clf.score(X_train,y_train)*100,2)
    acc_score_test=round(clf.score(X_test,y_test)*100,2)
    if clf_name in model_dict:
        model_dict[clf_name]+=(acc_score_train-acc_score_test)
    else:
        model_dict[clf_name]=(acc_score_train-acc_score_test)
score_list=[]       
for key,value in model_dict.items():
    print(key,value)
    score_list.append(value)
for key,value in model_dict.items():
    if value == min(score_list):
        print("Best model with similar training and test set accuracies and least overfitting is:",key)




SVC 37.43
LogisticRegression 2.660000000000011
DecisionTreeClassifier 7.429999999999993
RandomForestClassifier 11.450000000000003
KNeighborsClassifier 7.340000000000003
Best model with similar training and test set accuracies and least overfitting is: LogisticRegression


In [10]:
# best model fitting
clf = LogisticRegression().fit(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Logistic Regression classifier on training set: 0.96
Accuracy of Logistic Regression classifier on test set: 0.94




# Evaluation

In [11]:
print("Detailed classification report:")
print()
print('The model is trained on full development set')
print('The scores are evaluated on full evaluation set')
print()
y_true,y_pred=y_test,clf.predict(X_test)
print(classification_report(y_true,y_pred))
print()

Detailed classification report:

The model is trained on full development set
The scores are evaluated on full evaluation set

              precision    recall  f1-score   support

         0.0       0.96      0.86      0.91        64
         1.0       0.92      0.98      0.95       107

   micro avg       0.94      0.94      0.94       171
   macro avg       0.94      0.92      0.93       171
weighted avg       0.94      0.94      0.93       171




In [12]:
y_predict=clf.predict(X_test)
confusion=confusion_matrix(y_test,y_predict)
confusion

array([[ 55,   9],
       [  2, 105]], dtype=int64)