# **Breast Cancer Detection**

## **Data PreProcessing**

### **Import Libraries and Dataset**

In [None]:
#  import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
sns.set_style("darkgrid")

In [None]:
#  import dataset
df = pd.read_csv('data.csv')

In [None]:
df.head()

### **Data Exploration**


In [None]:
# shape of the dataset
df.shape

In [None]:
# info of the dataset
df.info()

In [None]:
# checking the categorical columns
df.select_dtypes(include=["object"]).columns

In [None]:
# number of categorical (object-dtype) columns
len(df.select_dtypes(include=["object"]).columns)

In [None]:
# checking the numerical columns
df.select_dtypes(include=["float64","int64"]).columns


In [None]:
# number of numerical (float64 / int64) columns
len(df.select_dtypes(include=["float64","int64"]).columns)

In [None]:
# Statistical summary of the dataset
df.describe()

### **Dealing with missing Values**

In [None]:
# checking the missing values
df.isnull().sum()

The "Unnamed: 32" column contains no values at all, so we can drop it.

In [None]:
# Drop the empty "Unnamed: 32" column.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["Unnamed: 32"], errors="ignore")

### **Dealing with categorical data**

In [None]:
df.select_dtypes(include=["object"]).columns

In [None]:
df["diagnosis"].unique()

In [None]:
df.head()

In [None]:
# One-hot encode the diagnosis label into per-class indicator columns
df = pd.get_dummies(df, columns=["diagnosis"])

In [None]:
# Convert only the boolean indicator columns to 0/1 integers.
# A frame-wide replace({True: 1, False: 0}) depends on pandas implicitly
# downcasting the replaced columns, which newer pandas versions warn about;
# an explicit cast is stable across versions and safe to re-run.
bool_cols = df.select_dtypes(include="bool").columns
df = df.astype({col: "int64" for col in bool_cols})

In [None]:
df.head()

### **Countplot**

In [None]:
# Class balance of the target (0 = benign, 1 = malignant)
sns.countplot(x=df["diagnosis_M"],label="Count",palette = "Set1",edgecolor = "black")
plt.show()  # was `plt.show` without parentheses, which never called the function

In [None]:
# No of Benign
(df["diagnosis_M"]==0).sum()

In [None]:
# No of Malignant
(df["diagnosis_M"]==1).sum()

### **Correlation Matrix and Heatmap**

In [None]:
df2 = df.drop(["diagnosis_B","diagnosis_M"],axis=1)

In [None]:
# Bar plot of each remaining column's correlation with diagnosis_M
df2.corrwith(df["diagnosis_M"]).plot.bar(
    figsize = (30,15),title = "Correlation with Diagnosis",fontsize = 15,rot = 45,grid = True
)

In [None]:
# Correlation Matrix
corr = df.corr()

In [None]:
corr

In [None]:
# Heatmap
plt.figure(figsize=(30,20))
sns.heatmap(corr,annot=True,fmt=".2f",cmap="coolwarm")

### **Splitting the dataset into the Training set and Test set**

In [None]:
# Independent variables: every column except the identifier and the encoded labels
x = df.drop(columns=["diagnosis_B", "diagnosis_M", "id"])
x.shape

In [None]:
# Dependent variable (1 = malignant, 0 = benign).
# Kept as a 1-D Series: scikit-learn estimators expect y of shape (n_samples,)
# and emit a DataConversionWarning when handed a single-column DataFrame.
y = df["diagnosis_M"]
y.shape

In [None]:
# Splitting the dataset into the Training set and Test set
# (80 % train / 20 % test; random_state is fixed so the split is reproducible)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
SS = StandardScaler()

In [None]:
# Fit the scaler on the training data only, then apply that same fitted
# transform to the test data.
x_train = SS.fit_transform(x_train)
x_test = SS.transform(x_test)

## **Building Model**

### **Logistic Regression**

In [None]:
LR = LogisticRegression(random_state=0)

In [None]:
LR.fit(x_train,y_train)

In [None]:
y_pred = LR.predict(x_test)

In [None]:
def acc_test(y_test,y_pred):
    """Print accuracy, F1, precision and recall for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    acc = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)

    # print() returns None, so `return print(...)` only hid that nothing is returned.
    print("Accuracy:",acc,"\n F1 Score:",f1,"\n Precision:",prec,"\n Recall:",rec)

In [None]:
# Accuracy of the Logistic Regression Model
acc_test(y_test,y_pred)

In [None]:
# Confusion Matrix
conf=confusion_matrix(y_test,y_pred)
print(conf)

#### **Cross Validation for Logistic Regression**

In [None]:
# Cross Validation
cross_acc = cross_val_score(estimator=LR,X=x_train,y=y_train,cv=10)

In [None]:
# Mean and standard deviation of the cross-validation accuracies, as percentages
print(f"Accuracy is {cross_acc.mean()*100:.2f} %")
print(f"Standard Deviation is {cross_acc.std()*100:.2f} %")

### **Random Forest**

In [None]:
# Random Forest Classifier
RCL = RandomForestClassifier(random_state=0)
RCL.fit(x_train,y_train)

In [None]:
# Predicting the Test set results
y_pred = RCL.predict(x_test)

In [None]:
def acc_test(y_test,y_pred):
    """Print accuracy, F1, precision and recall for a set of predictions.

    Re-declaration of the metric helper already used in the Logistic
    Regression section.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    acc = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)

    # print() returns None, so `return print(...)` only hid that nothing is returned.
    print("Accuracy:",acc,"\n F1 Score:",f1,"\n Precision:",prec,"\n Recall:",rec)

In [None]:
# Test-set scores of the Random Forest classifier
acc_test(y_test,y_pred)

For comparison, the test-set scores of the Logistic Regression model:

Accuracy: 0.9649122807017544 

 F1 Score: 0.9574468085106383 

 Precision: 0.9574468085106383 
 
 Recall: 0.9574468085106383

In [None]:
# Confusion Matrix
conf=confusion_matrix(y_test,y_pred)
print(conf)

#### **Cross Validation for Random Forest**

In [None]:
# 10-fold cross validation for the Random Forest classifier (on the training set)
cross_acc = cross_val_score(estimator=RCL,X=x_train,y=y_train,cv=10)

In [None]:
# Mean and standard deviation of the cross-validation accuracies, as percentages
print(f"Accuracy is {cross_acc.mean()*100:.2f} %")
print(f"Standard Deviation is {cross_acc.std()*100:.2f} %")

For comparison, cross validation for Logistic Regression gave:

Accuracy is 97.81 %

Standard Deviation is 1.98 %

## **Finding the best hyperparameters for Logistic Regression**

In [None]:
# Search space for the randomized hyperparameter search.
# "No penalty" must be the None object: the string "None" is not an accepted
# value for LogisticRegression's `penalty`, so every candidate that sampled it
# failed to fit.
# NOTE(review): not every solver supports every penalty; per the scikit-learn
# docs, unsupported pairs fail at fit time — confirm how the search scores them.
Parameters = {"penalty":["l1", "l2", "elasticnet", None],"C":[0.25,0.5,0.75,1,1.25,1.5,1.75,2],
              "solver":["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]}

In [None]:
# random_state pins which 5 parameter combinations get sampled, so the search
# (and the "best" parameters it reports) is reproducible across runs.
random_search = RandomizedSearchCV(estimator=LR,param_distributions=Parameters,n_iter=5,scoring="roc_auc",n_jobs=-1,cv=5,verbose=3,random_state=0)

In [None]:
random_search.fit(x_train,y_train)

In [None]:
random_search.best_estimator_

In [None]:
random_search.best_score_

In [None]:
random_search.best_params_

### **Final Model (Logistic Regression)**

In [None]:
# Final Logistic Regression model, trained on the scaled training set.
# NOTE(review): C and solver are hard-coded — presumably copied from an earlier
# random_search.best_params_ run; confirm they still match the search output.
Logic_Reg = LogisticRegression(C=1.25, random_state=0, solver='newton-cg')
Logic_Reg.fit(x_train,y_train)

In [None]:
# Test-set predictions of the final model
y_pred = Logic_Reg.predict(x_test)
def acc_test(y_test,y_pred):
    """Print accuracy, F1, precision and recall for a set of predictions.

    Re-declaration of the metric helper already used in the earlier model
    sections.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    acc = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)

    # print() returns None, so `return print(...)` only hid that nothing is returned.
    print("Accuracy:",acc,"\n F1 Score:",f1,"\n Precision:",prec,"\n Recall:",rec)

In [None]:
acc_test(y_test,y_pred)

In [None]:
cross_acc = cross_val_score(estimator=Logic_Reg,X=x_train,y=y_train,cv=10)


In [None]:
# Mean and standard deviation of the cross-validation accuracies, as percentages
print(f"Accuracy is {cross_acc.mean()*100:.2f} %")
print(f"Standard Deviation is {cross_acc.std()*100:.2f} %")

## **Prediction**

In [None]:
df.head()

In [None]:
# Single observation to classify; values are listed in the same column order as `x`.
# Wrapping it in a DataFrame that carries the training column names lets
# SS.transform check the feature names instead of warning that they are missing
# (the scaler was fitted on a DataFrame with named columns).
tes_ob = pd.DataFrame(
    [[11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173]],
    columns=x.columns,
)

In [None]:
Logic_Reg.predict(SS.transform(tes_ob))

In [None]:
def cancer_pred(test):
    """Classify a single observation as "Malignant" or "Benign".

    Parameters
    ----------
    test : 2-D array-like with exactly one row
        Raw (unscaled) feature values; they are scaled with ``SS`` before
        being passed to ``Logic_Reg``.

    Returns
    -------
    str
        "Malignant" when the model predicts class 1, otherwise "Benign".

    Raises
    ------
    ValueError
        If ``test`` does not contain exactly one sample.
    """
    prediction = Logic_Reg.predict(SS.transform(test))
    # Be explicit about the single-sample contract instead of relying on the
    # truthiness of an array comparison, which fails obscurely for several rows.
    if len(prediction) != 1:
        raise ValueError(
            "cancer_pred expects exactly one sample, got {}".format(len(prediction))
        )
    return "Malignant" if prediction[0] == 1 else "Benign"

In [None]:
cancer_pred(tes_ob)