In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.impute import KNNImputer

## 1.Problem Statement

##### Predicting Breast Cancer Survival Using Treatment and Patient Factors

## 2.Data Gathring

In [41]:
df=pd.read_csv('breast-cancer.csv')

In [42]:
df.columns = df.columns.str.replace(' ', '_')
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## 3.EDA

In [43]:
df.drop(['id'],axis=1,inplace=True) #as id is unique for all patient,it will not contribute in prediction so we droped id column

In [44]:
#df["diagnosis"].replace({'B': 0, 'M': 1},inplace=True) # converting datatype of traget colum from object to int#

In [45]:
df.info() # from this we conclude that there is no null values are present in dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave_points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [46]:
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


## 4.Feature Engineering

In [47]:
# detecting outliers
class Detection():
    def __init__(self,Dataframe):
        self.Dataframe=Dataframe
    def detect(self,col,n):
        self.col=col
        self.n=n
        q1=self.Dataframe[self.col].quantile(0.25)
        q2=self.Dataframe[self.col].quantile(0.5)
        q3=self.Dataframe[self.col].quantile(0.75)
        iqr=q3-q1
        self.lower_tail=q1-self.n*iqr
        self.upper_tail=q3+self.n*iqr
        print(f"lower_tail for {self.col} is {self.lower_tail}")
        print(f"upper_tail for {self.col} is {self.upper_tail}")
        outliers=self.Dataframe.loc[(self.Dataframe[self.col]>self.upper_tail)|(self.Dataframe[self.col]<self.lower_tail),self.col]
        print(f"Outliers for this {self.col} is {outliers}")
    def mean(self):
        self.MEAN=self.Dataframe.loc[(self.Dataframe[self.col]<self.upper_tail)|(self.Dataframe[self.col]>self.lower_tail),self.col].mean()
        print(self.MEAN)
    def median(self):
        self.MEDIAN=self.Dataframe.loc[(self.Dataframe[self.col]<self.upper_tail)|(self.Dataframe[self.col]>self.lower_tail),self.col].median()
        print(self.MEDIAN)
    def replace_with_lowertail(self):
        
        self.Dataframe.loc[(self.Dataframe[self.col]<self.lower_tail),self.col]=self.lower_tail
        
    def replace_with_uppertail(self):
        
        self.Dataframe.loc[(self.Dataframe[self.col]>self.upper_tail),self.col]=self.upper_tail
    
    def replace_with_mean(self):
        
        self.Dataframe.loc[(self.Dataframe[self.col]>self.upper_tail)|(self.Dataframe[self.col]<self.lower_tail),self.col]=self.MEAN
    
    def replace_with_median(self):
        
        self.Dataframe.loc[(self.Dataframe[self.col]>self.upper_tail)|(self.Dataframe[self.col]<self.lower_tail),self.col]=self.MEDIAN
    
    def replace_with_statisvalue(self,n):
        
        self.Dataframe.loc[(self.Dataframe[self.col]>self.upper_tail)|(self.Dataframe[self.col]<self.lower_tail),self.col]=n
        
out=Detection(df)

In [48]:
# Replaced all outliers with null value so that we can fill that value using knn imputer

In [49]:
out.detect('radius_mean',3)

lower_tail for radius_mean is -0.5400000000000009
upper_tail for radius_mean is 28.02
Outliers for this radius_mean is 212    28.11
Name: radius_mean, dtype: float64


In [50]:
out.replace_with_statisvalue(np.nan)

In [51]:
out.detect('texture_mean',3)

lower_tail for texture_mean is -0.7199999999999953
upper_tail for texture_mean is 38.69
Outliers for this texture_mean is 239    39.28
Name: texture_mean, dtype: float64


In [52]:
out.replace_with_statisvalue(np.nan)

In [53]:
out.detect('area_mean',3)

lower_tail for area_mean is -666.9000000000001
upper_tail for area_mean is 1869.9
Outliers for this area_mean is 82     1878.0
180    2250.0
212    2499.0
352    2010.0
461    2501.0
Name: area_mean, dtype: float64


In [54]:
out.replace_with_statisvalue(np.nan)

In [55]:
out.detect('smoothness_mean',3)

lower_tail for smoothness_mean is 0.029579999999999995
upper_tail for smoothness_mean is 0.16209
Outliers for this smoothness_mean is 504    0.1634
Name: smoothness_mean, dtype: float64


In [56]:
out.replace_with_statisvalue(np.nan)

In [57]:
out.detect('compactness_mean',3)

lower_tail for compactness_mean is -0.13151999999999994
upper_tail for compactness_mean is 0.3268399999999999
Outliers for this compactness_mean is 78    0.3454
Name: compactness_mean, dtype: float64


In [58]:
out.replace_with_statisvalue(np.nan)

In [59]:
out.detect('symmetry_mean',3)

lower_tail for symmetry_mean is 0.060499999999999915
upper_tail for symmetry_mean is 0.2971000000000001
Outliers for this symmetry_mean is 25    0.304
Name: symmetry_mean, dtype: float64


In [60]:
out.replace_with_statisvalue(np.nan)

In [61]:
out.detect('fractal_dimension_mean',3)

lower_tail for fractal_dimension_mean is 0.03244000000000001
upper_tail for fractal_dimension_mean is 0.09137999999999999
Outliers for this fractal_dimension_mean is 3      0.09744
152    0.09296
504    0.09502
505    0.09575
Name: fractal_dimension_mean, dtype: float64


In [62]:
out.replace_with_statisvalue(np.nan)

In [63]:
out.detect('radius_se',3)

lower_tail for radius_se is -0.5071000000000001
upper_tail for radius_se is 1.2184
Outliers for this radius_se is 122    1.509
138    1.296
212    2.873
258    1.292
417    1.370
461    2.547
503    1.291
Name: radius_se, dtype: float64


In [64]:
out.replace_with_statisvalue(np.nan)

In [65]:
out.detect('texture_se',3)

lower_tail for texture_se is -1.0864000000000003
upper_tail for texture_se is 3.3943000000000003
Outliers for this texture_se is 12     3.568
192    4.885
473    3.647
561    3.896
Name: texture_se, dtype: float64


In [66]:
out.replace_with_statisvalue(np.nan)

In [67]:
out.detect('perimeter_se',3)

lower_tail for perimeter_se is -3.6470000000000002
upper_tail for perimeter_se is 8.61
Outliers for this perimeter_se is 12     11.070
42      8.830
78      8.649
108    10.050
122     9.807
212    21.980
258    10.120
272     8.867
417     9.424
461    18.650
503     9.635
563     8.758
Name: perimeter_se, dtype: float64


In [68]:
out.replace_with_statisvalue(np.nan)

In [69]:
out.detect('area_se',3)

lower_tail for area_se is -64.16999999999999
upper_tail for area_se is 127.20999999999998
Outliers for this area_se is 0      153.4
77     134.8
108    170.0
122    233.0
180    128.7
212    525.6
236    155.8
250    137.9
258    138.5
265    199.7
272    156.8
300    133.0
302    130.8
339    164.1
352    153.1
368    224.1
369    130.2
417    176.5
461    542.2
503    180.2
521    139.9
564    158.7
Name: area_se, dtype: float64


In [70]:
out.replace_with_statisvalue(np.nan)

In [71]:
out.detect('smoothness_se',3)

lower_tail for smoothness_se is -0.0037620000000000015
upper_tail for smoothness_se is 0.017077000000000002
Outliers for this smoothness_se is 71     0.01721
116    0.01835
122    0.02333
213    0.03113
314    0.02075
345    0.01736
505    0.02177
Name: smoothness_se, dtype: float64


In [72]:
out.replace_with_statisvalue(np.nan)

In [73]:
out.detect('compactness_se',3)

lower_tail for compactness_se is -0.045029999999999994
upper_tail for compactness_se is 0.09056
Outliers for this compactness_se is 42     0.10060
71     0.09368
122    0.09806
152    0.09586
190    0.13540
290    0.10640
Name: compactness_se, dtype: float64


In [74]:
out.replace_with_statisvalue(np.nan)

In [75]:
out.detect('concavity_se',3)

lower_tail for concavity_se is -0.06578999999999999
upper_tail for concavity_se is 0.12292999999999998
Outliers for this concavity_se is 68     0.3038
112    0.1435
122    0.1278
152    0.3960
213    0.1438
376    0.1535
Name: concavity_se, dtype: float64


In [76]:
out.replace_with_statisvalue(np.nan)

In [77]:
out.detect('concave_points_se',3)

lower_tail for concave_points_se is -0.013578000000000003
upper_tail for concave_points_se is 0.035926
Outliers for this concave_points_se is 12     0.04090
152    0.05279
213    0.03927
Name: concave_points_se, dtype: float64


In [78]:
out.replace_with_statisvalue(np.nan)

In [79]:
out.detect('symmetry_se',3)

lower_tail for symmetry_se is -0.009800000000000003
upper_tail for symmetry_se is 0.048440000000000004
Outliers for this symmetry_se is 3      0.05963
42     0.05333
78     0.07895
119    0.05014
138    0.05168
146    0.05628
190    0.05113
314    0.06146
351    0.05543
Name: symmetry_se, dtype: float64


In [80]:
out.replace_with_statisvalue(np.nan)

In [81]:
out.detect('fractal_dimension_se',3)

lower_tail for fractal_dimension_se is -0.004682000000000001
upper_tail for fractal_dimension_se is 0.011488000000000002
Outliers for this fractal_dimension_se is 12     0.01284
71     0.02193
112    0.01298
151    0.01178
152    0.02984
176    0.01792
190    0.01172
213    0.01256
290    0.02286
376    0.01220
388    0.01233
Name: fractal_dimension_se, dtype: float64


In [82]:
out.replace_with_statisvalue(np.nan)

In [83]:
out.detect('perimeter_worst',3)

lower_tail for perimeter_worst is -39.76000000000002
upper_tail for perimeter_worst is 249.27000000000004
Outliers for this perimeter_worst is 461    251.2
Name: perimeter_worst, dtype: float64


In [84]:
out.replace_with_statisvalue(np.nan)

In [85]:
out.detect('area_worst',3)

lower_tail for area_worst is -1190.8000000000002
upper_tail for area_worst is 2790.1000000000004
Outliers for this area_worst is 180    3216.0
236    2944.0
265    3432.0
339    2906.0
352    3234.0
368    3143.0
461    4254.0
Name: area_worst, dtype: float64


In [86]:
out.replace_with_statisvalue(np.nan)

In [87]:
out.detect('compactness_worst',3)

lower_tail for compactness_worst is -0.4285000000000001
upper_tail for compactness_worst is 0.9148000000000001
Outliers for this compactness_worst is 9      1.0580
190    0.9327
379    0.9379
Name: compactness_worst, dtype: float64


In [88]:
out.replace_with_statisvalue(np.nan)

In [89]:
out.detect('concavity_worst',3)

lower_tail for concavity_worst is -0.6907000000000001
upper_tail for concavity_worst is 1.1881000000000002
Outliers for this concavity_worst is 68    1.252
Name: concavity_worst, dtype: float64


In [90]:
out.replace_with_statisvalue(np.nan)

In [91]:
out.detect('symmetry_worst',3)

lower_tail for symmetry_worst is 0.0479
upper_tail for symmetry_worst is 0.5204
Outliers for this symmetry_worst is 3      0.6638
78     0.5440
146    0.5774
323    0.5558
Name: symmetry_worst, dtype: float64


In [92]:
out.replace_with_statisvalue(np.nan)

In [93]:
out.detect('fractal_dimension_worst',3)

lower_tail for fractal_dimension_worst is 0.009599999999999997
upper_tail for fractal_dimension_worst is 0.15394
Outliers for this fractal_dimension_worst is 3    0.1730
9    0.2075
Name: fractal_dimension_worst, dtype: float64


In [94]:
out.replace_with_statisvalue(np.nan)

In [95]:
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,,
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [96]:
df.info() # Replaced all outliers with null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              568 non-null    float64
 2   texture_mean             568 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                564 non-null    float64
 5   smoothness_mean          568 non-null    float64
 6   compactness_mean         568 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave_points_mean      569 non-null    float64
 9   symmetry_mean            568 non-null    float64
 10  fractal_dimension_mean   565 non-null    float64
 11  radius_se                562 non-null    float64
 12  texture_se               565 non-null    float64
 13  perimeter_se             557 non-null    float64
 14  area_se                  5

In [97]:
# filling all null values using knn imputer method
x=df.drop('diagnosis',axis=1)
y=df['diagnosis']
imputer=KNNImputer()
array=imputer.fit_transform(x)
df=pd.DataFrame(array,columns=x.columns)
df

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.078710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.46010,0.118900
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.056670,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.27500,0.089020
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.059990,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.36130,0.087580
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.063874,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.26736,0.080122
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.058830,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.23640,0.076780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.056230,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.20600,0.071150
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.055330,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.25720,0.066370
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.056480,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.22180,0.078200
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.070160,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.40870,0.124000


In [98]:
df.insert(loc = 0,column = 'diagnosis',value = y)
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.46010,0.118900
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.27500,0.089020
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.36130,0.087580
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.26736,0.080122
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.23640,0.076780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.20600,0.071150
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.25720,0.066370
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.22180,0.078200
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.40870,0.124000


In [99]:
df.info() # checked here all null values are replaced

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave_points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

# 5.Train test split

In [100]:
x=df.drop('diagnosis',axis=1)
y=df['diagnosis']

In [101]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

# 6.Model selection

In [102]:
logistic_model=LogisticRegression()
logistic_model.fit(x_train,y_train)

# 7.Model Evaluation

In [103]:
# tarining evaluation
y_train_pred=logistic_model.predict(x_train)

cm=confusion_matrix(y_train,y_train_pred)
print("Confusion Matrix:\n",cm)
print("*"*80)

accuracy=accuracy_score(y_train,y_train_pred)
print("Training Accuracy:",accuracy)
print("*"*80)

clf_report=classification_report(y_train,y_train_pred)
print("Training Classification Report:\n",clf_report)

Confusion Matrix:
 [[278   7]
 [ 15 155]]
********************************************************************************
Training Accuracy: 0.9516483516483516
********************************************************************************
Training Classification Report:
               precision    recall  f1-score   support

           B       0.95      0.98      0.96       285
           M       0.96      0.91      0.93       170

    accuracy                           0.95       455
   macro avg       0.95      0.94      0.95       455
weighted avg       0.95      0.95      0.95       455



In [104]:
# testing evaluation
y_test_pred=logistic_model.predict(x_test)

cm=confusion_matrix(y_test,y_test_pred)
print("Confusion Matrix:\n",cm)
print("*"*80)

accuracy=accuracy_score(y_test,y_test_pred)
print("Testing Accuracy:",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_test_pred)
print("Testing Classification Report:\n",clf_report)

Confusion Matrix:
 [[70  2]
 [ 7 35]]
********************************************************************************
Testing Accuracy: 0.9210526315789473
********************************************************************************
Testing Classification Report:
               precision    recall  f1-score   support

           B       0.91      0.97      0.94        72
           M       0.95      0.83      0.89        42

    accuracy                           0.92       114
   macro avg       0.93      0.90      0.91       114
weighted avg       0.92      0.92      0.92       114



# 8.preprocessing step

## 1.Normalization

In [105]:
normal_scalar = MinMaxScaler()
array = normal_scalar.fit_transform(x)
x1 = pd.DataFrame(array,columns=x.columns)
x1

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,0.538627,0.027801,0.545989,0.505155,0.714348,0.884255,0.703140,0.731113,0.736186,0.721637,...,0.620776,0.141525,0.750126,0.706177,0.601136,0.759161,0.608462,0.912027,0.843099,0.682557
1,0.664856,0.334440,0.615783,0.696613,0.348756,0.202931,0.203608,0.348757,0.407367,0.168424,...,0.606901,0.303571,0.605903,0.681916,0.347553,0.189472,0.206496,0.639175,0.329075,0.363189
2,0.621801,0.478838,0.595743,0.624153,0.618768,0.481200,0.462512,0.635686,0.546587,0.251757,...,0.556386,0.360075,0.570686,0.586799,0.483590,0.472414,0.384957,0.835052,0.568731,0.347798
3,0.217183,0.442739,0.233501,0.142916,0.976105,0.905828,0.565604,0.522863,0.832611,0.349247,...,0.248310,0.385928,0.270893,0.147297,0.915472,0.997859,0.587094,0.884880,0.307859,0.268085
4,0.651157,0.192116,0.630986,0.679529,0.517758,0.388398,0.463918,0.518390,0.405742,0.222641,...,0.519744,0.123934,0.569009,0.535197,0.437364,0.211356,0.341880,0.558419,0.221883,0.232364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.713293,0.526141,0.678668,0.786745,0.633974,0.330525,0.571462,0.690358,0.360780,0.157380,...,0.623266,0.383262,0.646710,0.709258,0.461137,0.218848,0.351026,0.761512,0.137462,0.172189
565,0.643329,0.769295,0.604036,0.658321,0.490605,0.287720,0.337395,0.486630,0.374865,0.134789,...,0.560655,0.699094,0.584661,0.595271,0.300007,0.196132,0.274786,0.559450,0.279645,0.121099
566,0.470620,0.762241,0.445788,0.420972,0.346693,0.283953,0.216753,0.263519,0.287107,0.163655,...,0.393099,0.589019,0.426463,0.361522,0.282177,0.335522,0.290855,0.487285,0.181339,0.247542
567,0.666324,0.814108,0.665538,0.660677,0.707831,0.882200,0.823336,0.755467,0.724269,0.507028,...,0.633582,0.730277,0.750126,0.629929,0.619626,1.000000,0.802308,0.910653,0.700361,0.737067


In [106]:
x_train,x_test,y_train,y_test=train_test_split(x1,y,test_size=0.2,random_state=42,stratify=y)

In [107]:
logistic_model_norm=LogisticRegression()
logistic_model_norm.fit(x_train,y_train)

In [108]:
# tarining evaluation
y_train_pred=logistic_model_norm.predict(x_train)

cm=confusion_matrix(y_train,y_train_pred)
print("Confusion Matrix:\n",cm)
print("*"*80)

accuracy=accuracy_score(y_train,y_train_pred)
print("Training Accuracy:",accuracy)
print("*"*80)

clf_report=classification_report(y_train,y_train_pred)
print("Training Classification Report:\n",clf_report)

Confusion Matrix:
 [[285   0]
 [ 13 157]]
********************************************************************************
Training Accuracy: 0.9714285714285714
********************************************************************************
Training Classification Report:
               precision    recall  f1-score   support

           B       0.96      1.00      0.98       285
           M       1.00      0.92      0.96       170

    accuracy                           0.97       455
   macro avg       0.98      0.96      0.97       455
weighted avg       0.97      0.97      0.97       455



In [109]:
# testing evaluation
y_test_pred=logistic_model_norm.predict(x_test)

cm=confusion_matrix(y_test,y_test_pred)
print("Confusion Matrix:\n",cm)
print("*"*80)

accuracy=accuracy_score(y_test,y_test_pred)
print("Testing Accuracy:",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_test_pred)
print("Testing Classification Report:\n",clf_report)

Confusion Matrix:
 [[72  0]
 [ 5 37]]
********************************************************************************
Testing Accuracy: 0.956140350877193
********************************************************************************
Testing Classification Report:
               precision    recall  f1-score   support

           B       0.94      1.00      0.97        72
           M       1.00      0.88      0.94        42

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



## 2.Standardization

In [110]:
std_scalar = StandardScaler()
array = std_scalar.fit_transform(x)

x2 = pd.DataFrame(array,columns=x.columns)
x2

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,1.103744,-2.105124,1.269934,1.059665,1.609956,3.350691,2.652874,2.532475,2.265750,2.458655,...,1.886690,-1.359293,2.323732,2.179709,1.307686,2.798272,2.160529,2.296076,3.048245,2.088240
1,1.840061,-0.353869,1.685955,2.040062,-0.833666,-0.489172,-0.023846,0.548144,0.009328,-0.909545,...,1.805927,-0.369203,1.549246,2.060234,-0.375612,-0.431829,-0.139935,1.087084,-0.228893,0.320379
2,1.588914,0.470808,1.566503,1.669019,0.971101,1.079119,1.363478,2.037231,0.964683,-0.402175,...,1.511870,-0.023974,1.360127,1.591815,0.527407,1.172432,0.881406,1.955000,1.299021,0.235181
3,-0.771295,0.264639,-0.592687,-0.795248,3.359549,3.472279,1.915897,1.451707,2.927436,0.191386,...,-0.281464,0.133984,-0.249784,-0.572584,3.394275,4.151677,2.038242,2.175786,-0.364157,-0.206074
4,1.760151,-1.166698,1.776573,1.952581,0.295947,0.556098,1.371011,1.428493,-0.001824,-0.579449,...,1.298575,-1.466770,1.351121,1.337693,0.220556,-0.307750,0.634876,0.729259,-0.912294,-0.403805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.122601,0.740961,2.060786,2.501603,1.072737,0.229934,1.947285,2.320965,-0.310363,-0.976787,...,1.901185,0.117700,1.768383,2.194880,0.378365,-0.265267,0.687215,1.629151,-1.450517,-0.736906
565,1.714488,2.129642,1.615931,1.843983,0.114454,-0.011312,0.693043,1.263669,-0.213712,-1.114326,...,1.536720,2.047399,1.435174,1.633536,-0.691230,-0.394066,0.250895,0.733827,-0.544037,-1.019716
566,0.707047,2.089356,0.672676,0.628591,-0.847460,-0.032542,0.046588,0.105777,-0.815920,-0.938581,...,0.561361,1.374854,0.585640,0.482402,-0.809587,0.396263,0.342854,0.414069,-1.170783,-0.319790
567,1.848623,2.385576,1.982524,1.856049,1.566398,3.339111,3.296944,2.658866,2.183969,1.152026,...,1.961239,2.237926,2.323732,1.804215,1.430427,4.163815,3.269916,2.289985,2.138224,2.389984


In [111]:
x_train,x_test,y_train,y_test=train_test_split(x2,y,test_size=0.2,random_state=42,stratify=y)

In [112]:
logistic_model_std=LogisticRegression()
logistic_model_std.fit(x_train,y_train)

In [113]:
# tarining evaluation
y_train_pred=logistic_model_std.predict(x_train)

cm=confusion_matrix(y_train,y_train_pred)
print("Confusion Matrix:\n",cm)
print("*"*80)

accuracy=accuracy_score(y_train,y_train_pred)
print("Training Accuracy:",accuracy)
print("*"*80)

clf_report=classification_report(y_train,y_train_pred)
print("Training Classification Report:\n",clf_report)

Confusion Matrix:
 [[285   0]
 [  6 164]]
********************************************************************************
Training Accuracy: 0.9868131868131869
********************************************************************************
Training Classification Report:
               precision    recall  f1-score   support

           B       0.98      1.00      0.99       285
           M       1.00      0.96      0.98       170

    accuracy                           0.99       455
   macro avg       0.99      0.98      0.99       455
weighted avg       0.99      0.99      0.99       455



In [114]:
# testing evaluation
y_test_pred=logistic_model_std.predict(x_test)

cm=confusion_matrix(y_test,y_test_pred)
print("Confusion Matrix:\n",cm)
print("*"*80)

accuracy=accuracy_score(y_test,y_test_pred)
print("Testing Accuracy:",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_test_pred)
print("Testing Classification Report:\n",clf_report)

Confusion Matrix:
 [[72  0]
 [ 2 40]]
********************************************************************************
Testing Accuracy: 0.9824561403508771
********************************************************************************
Testing Classification Report:
               precision    recall  f1-score   support

           B       0.97      1.00      0.99        72
           M       1.00      0.95      0.98        42

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [115]:
0.9868131868131869-0.9824561403508771

0.004357046462309722

# New Dataset prediction

In [116]:
x.head(1)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189


In [117]:
y.head(1)

0    M
Name: diagnosis, dtype: object

In [118]:
std_scalar.transform(x.head(1))

array([[ 1.10374445, -2.10512371,  1.26993369,  1.05966458,  1.60995633,
         3.35069061,  2.65287398,  2.53247522,  2.26575038,  2.45865532,
         3.0598136 , -0.59153984,  3.74646881,  2.00893414, -0.19505342,
         1.53294896,  1.10860594,  0.74151766,  1.49523324,  1.45016348,
         1.88668963, -1.35929347,  2.32373188,  2.17970899,  1.30768627,
         2.79827168,  2.16052865,  2.29607613,  3.04824475,  2.08824015]])

In [119]:
logistic_model_std.predict(std_scalar.transform(x.head(1)))

array(['M'], dtype=object)

## Model pickle

In [120]:
import pickle

In [121]:
pickle.dump(logistic_model_std,open('cancer.pkl','wb'))

In [122]:
pickle.dump(std_scalar,open('scaling.pkl','wb'))

## json data

In [123]:
import json
project_data={"columns" : list(x.columns)}

In [124]:
with open("project_data.json","w") as f:
    json.dump(project_data ,f)