<a href="https://colab.research.google.com/github/charangowdamn/pythonproject/blob/main/pythonproject/Breast_cancer_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Breast Cancer Detection Capstone Project

 #Importing Libraries
"""

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
from sklearn.datasets import load_breast_cancer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# %matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

"""### Loading Data into variable"""

# Load the breast cancer dataset bundled with scikit-learn.
breast_cancer=load_breast_cancer()
# Show only the available fields: echoing the whole object dumps every array
# plus the full description text into the cell output.
breast_cancer.keys()

# print() renders the description's line breaks; the bare expression would
# show the string repr with escaped newlines.
print(breast_cancer.DESCR)

""" Storing features in x"""

# Feature matrix pulled out of the dataset object
x = breast_cancer["data"]
x

""" Storing labels in y"""

# Target vector pulled out of the dataset object
y = breast_cancer["target"]
y

""" Printing names of features"""

breast_cancer.feature_names

""" Checking Dimensions of x and y"""

# Shapes of the feature matrix and the label vector, side by side
print(x.shape, y.shape)

""" Storing data as pandas dataframe to perform further operations """

# One DataFrame holding every feature column, with the target appended as 'class'
data = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
data['class'] = breast_cancer.target

"""Printing the head of the dataset to have a look at the dataframe"""

# First ten rows of the frame
data.head(n=10)

"""Having a look at details of the data"""

# Summary statistics over all columns
data.describe(include='all')

#DATA VISUALIZATION

### Plotting a pair plot to see the relationships between the independent and dependent variables
The pair plot shows how the numeric features are distributed, using a scatter plot for each pair of features.

# Pair plot over every column of the frame, coloured by the 'class' label
g = sns.pairplot(data=data, hue='class')
# Resize the finished figure to 25 x 25 inches
g.fig.set_size_inches(25, 25)

""" Pair plot of sample features of the DataFrame
The pair plot shows the malignant and benign tumor data separated into two classes, which are easy to tell apart in the plot.


"""

# Pair plot restricted to five of the 'mean ...' features, coloured by 'class'
g = sns.pairplot(
    data=data,
    hue='class',
    vars=[
        'mean radius',
        'mean texture',
        'mean perimeter',
        'mean area',
        'mean smoothness',
    ],
)
# Resize the finished figure to 25 x 25 inches
g.fig.set_size_inches(25, 25)

"""Count plot
Shows the total count of malignant and benign tumor patients in a count plot.
"""

# Name the column via x= and pass the frame via data=: a positional Series is
# taken as `data` by current seaborn releases and is not counted per class.
sns.countplot(x='class', data=data)

""">In the below counterplot max samples mean radius is equal to 1."""

# count plot of feature mean radius
plt.figure(figsize = (20,8))
# Name the column via x= and pass the frame via data=: a positional Series is
# taken as `data` by current seaborn releases instead of as the variable to count.
sns.countplot(x='mean radius', data=data)

# Heatmap
# Heatmap of the breast cancer DataFrame
# The heatmap below shows the range of values taken by the different features. The values of ‘mean area’ and ‘worst area’ are greater than those of the other features; the values of ‘mean perimeter’, ‘area error’, and ‘worst perimeter’ are slightly smaller but still greater than those of the remaining features.


# Heatmap of the raw values in the frame on an 18 x 14 inch figure
plt.figure(figsize=(18, 14))
h = sns.heatmap(data=data)


#To find a correlation between each feature and target we visualize heatmap using the correlation matrix.

# Pairwise correlation matrix of every column (features and 'class')
data.corr()

# Annotated heatmap of the same correlation matrix
plt.figure(figsize=(20, 20))
sns.heatmap(
    data=data.corr(),
    annot=True,
    cmap='coolwarm',
    linewidths=2,
)

#As we can see, many of the features are highly correlated, so we will identify them and drop them

#Function to find correlated values
def correlation(data,threshold,use_abs=False):
  """Return the names of columns that are highly correlated with an earlier column.

  For every column, its correlation with each column positioned before it in
  ``data`` is compared against ``threshold``; the later column of any pair
  exceeding the threshold is reported.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame whose pairwise column correlations are computed with ``data.corr()``.
  threshold : float
      A column is reported when its correlation with an earlier column is
      strictly greater than this value.
  use_abs : bool, default False
      When True, compare absolute correlations so that strongly negatively
      correlated columns are reported too. The default keeps the original
      behaviour of flagging positive correlations only.

  Returns
  -------
  set
      Column names to drop; a set, so each name appears at most once.
  """
  corr_matrix=data.corr()
  if use_abs:
    corr_matrix=corr_matrix.abs()
  columns=corr_matrix.columns
  col_corr=set()
  # Start at 1: the first column has no earlier column to be compared with.
  for i in range(1,len(columns)):
    # Row i restricted to the columns before it (strict lower triangle).
    # NaN correlations compare as False, so they never flag a column.
    if (corr_matrix.iloc[i,:i] > threshold).any():
      col_corr.add(columns[i])
  return col_corr

# Feature-only frame: everything except the 'class' column
d = data.drop(columns='class')
# Names of the features flagged by correlation() at a 0.95 threshold
columnname = correlation(d, threshold=0.95)

columnname

len(columnname)

# Model inputs are the remaining features; the target is the 'class' column
x = d.drop(columns=columnname)
y = data['class']

x

#Checking Number of benign and malignant cases

# Number of rows per value of the 'class' column
print(data['class'].value_counts())

# Class names as stored on the dataset object
print(breast_cancer.target_names)

# Mean of every column within each class
data.groupby(by='class').mean()

"""0 - Malignant  1 - Benign """
print(x)

"""#### Dividing data into train and test data using sklearn's train_test_split()"""

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean and standard deviation (fitting on the full data leaks test information).
x_train_raw,x_test_raw,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=42,stratify=y)

#Standardisation on independent variables
scale=StandardScaler()
# Learn the scaling statistics from the training rows only ...
x_train=scale.fit_transform(x_train_raw)
# ... and reuse those statistics for the test rows.
x_test=scale.transform(x_test_raw)
# Later cells (shape/mean checks, cross_val_score) expect `x` to hold the scaled
# full feature matrix, so it is transformed with the training statistics too.
# NOTE(review): cross-validating on this pre-scaled `x` still shares scaling
# statistics across folds; a Pipeline(StandardScaler, model) would avoid that.
x=scale.transform(x)
x

# Shapes of the full, training and test targets
print(*(part.shape for part in (y, y_train, y_test)))

# Shapes of the full, training and test feature matrices
print(*(part.shape for part in (x, x_train, x_test)))

# Mean of the target in the full data and in each split
print(*(part.mean() for part in (y, y_train, y_test)))

# Overall mean of the feature values in the full data and in each split
print(*(part.mean() for part in (x, x_train, x_test)))

x_train

# Our data is now ready for a machine learning algorithm to be applied

# Logistic Regression

# Implementing Logistic Regression on the standardised features


# Fit a LogisticRegression with default settings on the training split
clf = LogisticRegression()
clf.fit(x_train, y_train)

# Predictions for the rows the model was fitted on
train_pred = clf.predict(x_train)
train_pred

# Predictions for the held-out rows
test_pred = clf.predict(x_test)
test_pred

# Accuracy of those predictions on each split
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_pred)

print("Training accuracy: ", train_accuracy)
print("Testing accuracy: ", test_accuracy)

# The estimator's own score() on each split
clf.score(x_train, y_train)

clf.score(x_test, y_test)

"""Lets Cross Validate and Check how the model performs."""

# 5-fold cross-validation scores over the full x and y
print(cross_val_score(estimator=clf, X=x, y=y, cv=5))

# Logistic Regression Model Has Performed Wonderfully After Feature Selection


# SVM

# Implementing Support Vector Machine Algorithm


# Fit an RBF-kernel support vector classifier on the training split
clf_svm = SVC(kernel='rbf')
clf_svm.fit(x_train, y_train)

# Predictions for the rows the model was fitted on
svmtrain_pred = clf_svm.predict(x_train)
svmtrain_pred

# Predictions for the held-out rows
svmtest_pred = clf_svm.predict(x_test)
svmtest_pred

# Accuracy of those predictions on each split
svmtrain_accuracy = accuracy_score(y_true=y_train, y_pred=svmtrain_pred)
svmtest_accuracy = accuracy_score(y_true=y_test, y_pred=svmtest_pred)

print("Training accuracy: ", svmtrain_accuracy)
print("Testing accuracy: ", svmtest_accuracy)

# The estimator's own score() on each split
clf_svm.score(x_train, y_train)

clf_svm.score(x_test, y_test)

# 5-fold cross-validation scores over the full x and y
print(cross_val_score(estimator=clf_svm, X=x, y=y, cv=5))


# Implementing K Nearest neighbors classifier


# Fit a 3-nearest-neighbours classifier on the training split
clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn.fit(x_train, y_train)

# Predictions for the rows the model was fitted on
trainknn_pred = clf_knn.predict(x_train)
trainknn_pred

# Predictions for the held-out rows
testknn_pred = clf_knn.predict(x_test)
testknn_pred

# Accuracy of those predictions on each split
trainknn_accuracy = accuracy_score(y_true=y_train, y_pred=trainknn_pred)
testknn_accuracy = accuracy_score(y_true=y_test, y_pred=testknn_pred)

print("Training accuracy: ", trainknn_accuracy)
print("Testing accuracy: ", testknn_accuracy)

# The estimator's own score() on each split
clf_knn.score(x_train, y_train)

clf_knn.score(x_test, y_test)

# 5-fold cross-validation scores over the full x and y
print(cross_val_score(estimator=clf_knn, X=x, y=y, cv=5))

# Comparing Out-of-Sample Accuracies

# Test-split accuracy of each classifier, keyed by model name
acc_dict = dict(
    LogisticRegression=test_accuracy,
    SVM=svmtest_accuracy,
    KNN=testknn_accuracy,
)

acc_dict



# Save the model to disk (serialize model on disk)
import pickle
# NOTE(review): only the SVM is saved. It was fitted on standardised features
# with the correlated columns removed, so the fitted scaler and the dropped
# column list are needed as well to use this file on new data.
# The with-block closes the file handle even if pickling fails; a bare open()
# passed straight to pickle.dump() is never explicitly closed.
with open("/content/drive/MyDrive/Projects/Breast Cancer Detection Capstone/model.pkl", 'wb') as model_file:
  pickle.dump(clf_svm, model_file)
