# **Table of Contents**
* [Exploration & Wrangling](#section_a)
    <br><br>
* [Model Building](#section_b)
    * [LOR](#section_21)
    * [KNN](#section_2)
    * [NB](#section_3)
    * [DT](#section_4)
    * [RF](#section_5)
    * [Bagging](#section_6)
    * [Boosting](#section_7)
    <br><br>
* [Model Selection](#section_c)
    * [Scores](#section_8)
    * [Rank](#section_9)
    * [Best Model](#section_10)

In [1]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import seaborn           as sns

import scipy.stats       as stats 

import warnings
warnings.filterwarnings("ignore")

In [2]:
from  sklearn.linear_model     import   LogisticRegression
from  sklearn.naive_bayes      import   GaussianNB
from  sklearn.neighbors        import   KNeighborsClassifier
from  sklearn.tree             import   DecisionTreeClassifier
from  sklearn.ensemble         import   RandomForestClassifier , BaggingClassifier ,  AdaBoostClassifier , GradientBoostingClassifier ,  VotingClassifier


from  sklearn.preprocessing     import   StandardScaler
from  sklearn.model_selection   import   train_test_split , KFold , cross_val_score , GridSearchCV 

from  sklearn  import  metrics

In [3]:
df = pd.read_csv ('datasets/Breast Cancer.csv')

df.sample(3)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
122,893061,B,11.6,24.49,74.23,417.2,0.07474,0.05688,0.01974,0.01313,...,12.44,31.62,81.39,476.5,0.09545,0.1361,0.07239,0.04815,0.3244,0.06745
274,877486,M,19.18,22.49,127.5,1148.0,0.08523,0.1428,0.1114,0.06772,...,23.36,32.06,166.4,1688.0,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221
263,91227,B,13.9,19.24,88.73,602.9,0.07991,0.05326,0.02995,0.0207,...,16.41,26.42,104.4,830.5,0.1064,0.1415,0.1673,0.0815,0.2356,0.07603


**Target Col - Diagnosis**

<a id='section_a'></a>
# **Part I - Exploration & Wrangling**

In [4]:
df.shape

(569, 32)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   radius_mean        569 non-null    float64
 3   texture_mean       569 non-null    float64
 4   perimeter_mean     569 non-null    float64
 5   area_mean          569 non-null    float64
 6   smoothness_mean    569 non-null    float64
 7   compactness_mean   569 non-null    float64
 8   concavity_mean     569 non-null    float64
 9   points_mean        569 non-null    float64
 10  symmetry_mean      569 non-null    float64
 11  dimension_mean     569 non-null    float64
 12  radius_se          569 non-null    float64
 13  texture_se         569 non-null    float64
 14  perimeter_se       569 non-null    float64
 15  area_se            569 non-null    float64
 16  smoothness_se      569 non

In [6]:
df.drop('id', axis=1 , inplace=True)

In [7]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [8]:
df['diagnosis']  =  df['diagnosis'] .replace ({'B':0 , 'M':1})

In [9]:
df.corr()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
diagnosis,1.0,0.730029,0.415185,0.742636,0.708984,0.35856,0.596534,0.69636,0.776614,0.330499,...,0.776454,0.456903,0.782914,0.733825,0.421465,0.590998,0.65961,0.793566,0.416294,0.323872
radius_mean,0.730029,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.415185,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.742636,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.708984,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,0.35856,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,0.596534,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.69636,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
points_mean,0.776614,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
symmetry_mean,0.330499,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413


>  Comments
    
        * Weakest Corr cols are
        * dimension_mean , texture_se , smoothness_se, symmetry_se , dimension_se 

##  Significance check of weak corr columns

In [10]:
from  scipy.stats  import   ttest_ind , mannwhitneyu

In [11]:
g1 = df [ df['diagnosis'] ==0 ]
g2 = df [ df['diagnosis'] ==1 ]

In [12]:
for i in [ 'dimension_mean' , 'symmetry_se','texture_se', 'dimension_se', 'smoothness_se']:
    
    print (   ttest_ind ( g1[i] , g2[i] ).pvalue    , "    ",    mannwhitneyu ( g1[i] , g2[i] ).pvalue   )

0.7599368037256238      0.26859280106781197
0.8766418183858812      0.013918320482061472
0.8433320287670163      0.32184635051298305
0.06307355082239346      7.860826754962789e-07
0.11029660865789295      0.10681581660023132


> Inference
    
        * Cols to drop ----- dimension_mean ,  texture_se , smoothness_se
        * Cols to LogT ----- symmetry_se , dimension_se

In [13]:
df.drop ( ['dimension_mean','texture_se','smoothness_se'] , axis=1 , inplace=True )

In [14]:
for i in ['symmetry_se', 'dimension_se']:
    
    df[i]  =  np.log ( df[i] ) 

## ii. **Target column**

In [15]:
df ['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

## iii. **Scaling**

In [16]:
x  =  df.drop('diagnosis' , axis=1)
y  =  df ['diagnosis']

In [17]:
from sklearn.preprocessing   import  StandardScaler
sc = StandardScaler()

x  = sc.fit_transform(x)

<a id='section_b'></a>
# **Part II - Model Building**

In [18]:
x_train, x_test, y_train, y_test  =  train_test_split (x, y, test_size =0.3, random_state =0)

kf  =  KFold (shuffle=True , n_splits=5 , random_state=0) 

<a id='section_21'></a>
## **1 - LOR**

In [19]:
LOR = LogisticRegression()           

<a id='section_2'></a>
## **2 - KNN**

### Finding best hyper-parameter

In [22]:
KNN = KNeighborsClassifier ()

param = { 'n_neighbors': np.arange(1,35)   ,   'weights':['uniform','distance'] } 

GS  =  GridSearchCV ( KNN , param , cv=kf , scoring='roc_auc' )

GS.fit (x_train , y_train)

GS.best_params_

{'n_neighbors': 23, 'weights': 'distance'}

In [23]:
KNN = KNeighborsClassifier ( n_neighbors =23  ,  weights ='distance') 

<a id='section_3'></a>
## **3 - Naive Bayes**

In [24]:
NB = GaussianNB()

<a id='section_4'></a>
## **4 - Decision Tree**

### **4.1 - Fully Grown**

In [25]:
FDT  =  DecisionTreeClassifier (random_state=0)             

### **4.2 - Regularised**

* Finding best hyper-parameter

In [26]:
param = { 'max_depth': np.arange(1,20)   ,   'criterion':['entropy','gini'] } 

GS  =  GridSearchCV ( FDT , param , cv=kf , scoring='roc_auc')

GS.fit (x_train , y_train)

GS.best_params_

{'criterion': 'entropy', 'max_depth': 3}

In [42]:
RDT = DecisionTreeClassifier (criterion='entropy' , max_depth=4 ,  random_state=0)

<a id='section_5'></a>
## **5 - Random Forest**

* Finding best n_estimator for least VE

In [27]:
VE = []


for i in np.arange(1,25):
    
    RF = RandomForestClassifier ( n_estimators =i, criterion ='entropy', random_state =0 )
    
    score = cross_val_score ( RF ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    VE.append ( np.std(score,ddof=1) )
    
    
np.argmin(VE)

17

In [28]:
RF = RandomForestClassifier ( n_estimators=18 , criterion='entropy', random_state =0 )  

<a id='section_6'></a>
## **6 - Bagging**

### **6.1 - LOR_Bag**

In [29]:
LOR2 = LogisticRegression()

VE = []

for i in np.arange(1,25):                                                     # 10-15 %
    
    LOR_bag = BaggingClassifier   ( base_estimator = LOR2 , n_estimators =i, random_state =0 )
    
    score   = cross_val_score     ( LOR_bag ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    VE.append ( np.std(score,ddof=1) )
    

np.argmin(VE)

6

In [30]:
LOR_bag = BaggingClassifier   ( base_estimator = LOR2 , n_estimators =7, random_state =0 )

### **6.2 - NB_Bag**

In [31]:
NB2 = GaussianNB()

VE = []

for i in np.arange(1,25):
    
    NB_bag = BaggingClassifier ( base_estimator = NB2 , n_estimators =i, random_state =0 )
    
    score   = cross_val_score ( NB_bag ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    VE.append ( np.std(score,ddof=1) )
    
    
np.argmin(VE) 

13

In [32]:
NB_bag = BaggingClassifier ( base_estimator = NB2 , n_estimators =14, random_state =0 )

### **6.3 - KNN_Bag**

In [33]:
KNN2 = KNeighborsClassifier ()

VE = []

for i in np.arange(1,25):
    
    KNN_bag = BaggingClassifier ( base_estimator = KNN2 , n_estimators =i, random_state =0 )
    
    score   = cross_val_score ( KNN_bag ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    VE.append ( np.std(score,ddof=1) )
    

np.argmin(VE) 

6

In [34]:
KNN_bag  =  BaggingClassifier ( base_estimator = KNN2 , n_estimators =7 , random_state =0 )

<a id='section_7'></a>
## **7 - Boosting**

## **7.1 - AdaBoost**

### 7.1.1 - **Regularised DT boost** (default)

In [36]:
BE = []

for i in np.arange(1,35):                                                                    
    
    RDT_boost   =  AdaBoostClassifier   ( n_estimators =i, random_state =0 )      # default base_estimator : Reg DT
    
    score       =  cross_val_score      ( RDT_boost ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    BE.append ( np.mean(1-score)  )
    
    
np.argmin(BE)

31

In [37]:
RDT_boost =  AdaBoostClassifier   ( n_estimators =31, random_state =0 )   

### 7.1.2 - **RF boost**

* Hyper-param tuned RF model is now boosted

In [38]:
BE = []

for i in np.arange(1,35):                                                                    
    
    RF_boost =  AdaBoostClassifier ( base_estimator = RF, n_estimators =i, random_state =0 )                   
    
    score    = cross_val_score     ( RF_boost ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    BE.append ( np.mean(1-score)    )
    
    
np.argmin(BE)

0

* No scope for boosting here

## **7.2 - XG Boost**

In [39]:
BE = []

for i in np.arange(1,35):                                                                    
    
    XG_boost =  GradientBoostingClassifier  ( n_estimators = i , random_state =0 )                                   
    
    score    =  cross_val_score             ( XG_boost ,  x_train ,  y_train ,  cv=kf,  scoring='roc_auc' )
    
    BE.append ( np.mean(1-score)    )
    
    
np.argmin(BE)

23

In [40]:
XG_boost =  GradientBoostingClassifier  ( n_estimators =24, random_state =0 ) 

<a id='section_c'></a>
# **Part III - Model Selection**

<a id='section_8'></a>
## i. **Scores**

In [43]:
models = []

models.append  (( 'LOR'       , LOR        ))
models.append  (( 'NB'        , NB         ))
models.append  (( 'KNN'       , KNN        ))



models.append  (( 'FDT'       , FDT        ))
models.append  (( 'RDT'       , RDT        ))
models.append  (( 'RF'        , RF         ))



models.append  (( 'LOR_bag'   , LOR_bag    ))
models.append  (( 'NB_bag'    , NB_bag     ))
models.append  (( 'KNN_bag'   , KNN_bag    ))


models.append    (( 'RDT_boost' , RDT_boost  ))
# models.append  (( 'RF_boost'  , RF_boost   ))
models.append    (( 'XG_boost'  , XG_boost   ))

In [44]:
result =[]

for  i,j  in  models:
    
    score = cross_val_score ( j , x_train , y_train , cv =kf, scoring ='roc_auc' )
    
    result.append ( { 'Name': i   ,   'BE': np.mean(1-score)   ,   'VE': np.std(score,ddof=1)  } )                


table = pd.DataFrame(result)

<a id='section_9'></a>
## ii - **Rank**

### By least Bias Error

In [45]:
table.sort_values (by='BE')    .head(8)      .style.hide_index()

Name,BE,VE
LOR,0.00569,0.005976
LOR_bag,0.005962,0.005718
KNN,0.010135,0.010265
RF,0.013062,0.00882
NB_bag,0.0134,0.008573
RDT_boost,0.015691,0.010842
NB,0.016311,0.01019
XG_boost,0.017923,0.013359


### By least Variance Error

In [46]:
table.sort_values (by='VE')    .head(8)      .style.hide_index()

Name,BE,VE
LOR_bag,0.005962,0.005718
LOR,0.00569,0.005976
NB_bag,0.0134,0.008573
RF,0.013062,0.00882
NB,0.016311,0.01019
KNN,0.010135,0.010265
RDT_boost,0.015691,0.010842
KNN_bag,0.018531,0.011067


<a id='section_10'></a>
## iii - **Best Model**

## *Logistic Regressor Bagged*

In [47]:
LOR_bag.fit      ( x_train , y_train )

y_pred   =   LOR_bag.predict ( x_test )

In [48]:
print ( metrics.f1_score                ( y_pred , y_test ) )

print ( metrics.classification_report   ( y_pred , y_test ) )

0.9655172413793104
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       113
           1       0.97      0.97      0.97        58

    accuracy                           0.98       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171



In [49]:
print  ( y_test .value_counts()        )              # ACTUAL

print  ( pd .value_counts ( y_pred )   )              # PREDICTED

0    113
1     58
Name: diagnosis, dtype: int64
0    113
1     58
dtype: int64
