In [None]:
#Business Problem :-
#There are a lot of assumptions in the diagnosis pertaining to cancer. In a few cases radiologists, pathologists & oncologists go
#wrong in diagnosing whether a tumor is benign (non-cancerous) or malignant (cancerous). Hence a team of physicians wants us to
#build an AI application which will predict with confidence the presence of cancer in a patient. This will serve as a
#complement to the physicians.

# Business Objectives :- Maximize Cancer Detection.
# Business Constraints:- Minimize Treatment cost & Maximize patient convenience.

#Success Criteria :-
# 1. Business Success Criteria :- Increase the correct diagnosis of cancer in at least 96% of patients.
# 2. Machine Learning Success Criteria :- Achieve an accuracy of at least 98%.
# 3. Economic Success Criteria :- Reducing medical expenses will improve the trust of patients & thereby hospitals will see
#                                 an increase in revenue by at least 12%.

# Data Collection :-
#                  Data is collected from the hospitals for 569 patients. 30 features & 1 label comprise the data set. The
#  following real-valued features are computed for each cell nucleus :-
# a] Radius[Mean of distance from center to points on the perimeter]
# b] Texture [Standard deviation of gray-scale values]
# c] Perimeter
# d] Area
# E] Smoothness.[Local Variation]
# F] Compactness[Perimeter ^2/area-1.0]
# G] Concavity[Severity of concave portions of the contour]
# H] Concave Points [Number of concave portions of the contour]
# I] Symmetry
# J] Fractal dimension["Coastline Approximation"-1]  

In [None]:
#Import the required libraries :-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from feature_engine.outliers import Winsorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

import sklearn.metrics as skmet
import pickle


In [None]:
# PostgreSQL :- psycopg2 is a PostgreSQL database driver; it is used to perform operations on PostgreSQL from
# Python and is designed for multi-threaded applications.

import psycopg2
from sqlalchemy import create_engine



In [None]:
# Load the raw cancer dataset from disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists.
DATA_PATH = r"D:\Cancer Data\Cancer_Data.csv"
can = pd.read_csv(DATA_PATH)
can

In [None]:
# Data preprocessing and EDA :-
# Spell out the diagnosis codes: 'B' becomes 'Benign' and 'M' becomes 'Malignant'.
# Values matching neither code are left as they are.
for code, label in (('B', 'Benign'), ('M', 'Malignant')):
    can['diagnosis'] = np.where(can['diagnosis'] == code, label, can['diagnosis'])

can

In [None]:
# Exclude the id column from the data.
# A non-in-place drop with errors='ignore' keeps this cell re-runnable: once 'id'
# is gone, running the cell again is a no-op instead of a KeyError.
can = can.drop(columns = ['id'], errors = 'ignore')
can.info()

In [None]:
# Summary statistics of the columns of `can`, as a first look at the data.
can.describe()

In [None]:
# Numeric predictors: the names of all columns whose dtype is not 'object'.
non_object_columns = can.select_dtypes(exclude = ['object'])
numeric_features = non_object_columns.columns
numeric_features

In [None]:
# Numeric pipeline: fill missing values with the column mean.
# Pipeline expects a list of (name, estimator) tuples; the bare list
# ['impute', SimpleImputer(...)] is rejected when the steps are validated.
num_pipeline = Pipeline([('impute', SimpleImputer(strategy = 'mean'))])

In [None]:
# Categorical predictors that will be one-hot encoded :-
# Taken from the data instead of being hard-coded: every 'object' column except the
# target 'diagnosis', which must never be encoded as a predictor.
# NOTE(review): the previous hard-coded ['sex'] does not appear in the data description
# at the top of this notebook (only numeric nucleus measurements plus the label), and a
# column that is missing from the frame makes the ColumnTransformer fit fail. If the CSV
# really has an object-typed 'sex' column it is still picked up here - please confirm.
categorical_features = [
    col for col in can.select_dtypes(include = ['object']).columns
    if col != 'diagnosis'
]
categorical_features

In [None]:
# Categorical pipeline: DataFrameMapper pairs each categorical column with its own
# OneHotEncoder (drop='if_binary' is passed to every encoder).
encoder_specs = []
for cat_feature in categorical_features:
    encoder_specs.append(([cat_feature], OneHotEncoder(drop='if_binary')))

categ_pipeline = Pipeline([('Label', DataFrameMapper(encoder_specs))])

In [None]:
# ColumnTransformer applies a different pipeline to different groups of columns of a
# pandas DataFrame: the categorical pipeline to the categorical columns and the
# numeric pipeline to the numeric columns.
column_steps = [
    ('categorical', categ_pipeline, categorical_features),
    ('numerical', num_pipeline, numeric_features),
]

preprocess_pipeline = ColumnTransformer(column_steps)

In [None]:
# Fit the preprocessing pipeline on the raw data and keep the fitted transformer in
# `processed`, the same way `processed2` holds the fitted scaler further down.
# A fitted transformer can be saved and re-applied with .transform(); the plain array
# returned by fit_transform() cannot.
processed = preprocess_pipeline.fit(can)
processed

In [None]:
# Save the `processed` object to a file named 'processed 1' (relative path, so it is
# written to the current working directory).
import joblib
joblib.dump(processed,'processed 1')

In [None]:
# Show the current working directory, i.e. where the file above was written.
import os
os.getcwd()

In [None]:
#Clean & processed data for modelling :-
# NOTE: this cell replaces `can`; to repeat it, re-run the notebook from the loading cell.

# The ColumnTransformer only returns the columns it was given, so the target is set
# aside first and re-attached afterwards; otherwise the later can['diagnosis'] lookup fails.
diagnosis = can['diagnosis']

# Use the fitted pipeline object itself to transform: it is fitted whether the fitting
# cell called fit() or fit_transform().
clean_values = preprocess_pipeline.transform(can)

# The transformer output is a bare array, so give the columns string names: a frame
# mixing integer labels with the string label 'diagnosis' is rejected by scikit-learn.
clean_columns = ['feature_{}'.format(i) for i in range(clean_values.shape[1])]
can = pd.DataFrame(clean_values, columns = clean_columns, index = can.index)
can['diagnosis'] = diagnosis

can.info()

In [None]:
# Keep only the non-object columns of the cleaned frame; any object-typed column
# is left out of the list and is therefore not scaled.
scalable_columns = can.select_dtypes(exclude = ['object'])
new_features = scalable_columns.columns

new_features

In [None]:
# Min-max scaling step for the numeric columns.
scale_pipeline = Pipeline([('scale',MinMaxScaler())])

# The keyword is `remainder`; the misspelt `reminde` raises a TypeError.
# Columns outside new_features are object-typed by construction. They are dropped rather
# than passed through, because every column of the scaled output is later used as a KNN
# predictor. When all columns are numeric, 'drop' and 'passthrough' give the same result.
preprocess_pipeline2 = ColumnTransformer([('scale',scale_pipeline,new_features)],
                                         remainder = 'drop')

# Keep the fitted transformer so it can be saved and re-applied.
processed2 = preprocess_pipeline2.fit(can)

processed2

In [None]:
# Save the fitted scaling transformer to a file named 'processed2' (relative path).
joblib.dump(processed2,'processed2')

In [None]:
# Show the current working directory. The call parentheses are required: a bare
# `os.getcwd` only displays the function object, not the path.
import os
os.getcwd()

In [None]:
# Normalised data frame: apply the fitted scaling transformer to `can` and wrap the
# result in a DataFrame, then summarise it.
scaled_values = processed2.transform(can)
can_n = pd.DataFrame(scaled_values)

can_n.describe()

In [None]:
#Separating the input & output from the dataset :-

X = np.array(can_n.iloc[:,:]) # Predictors: every column of the scaled frame

# NOTE(review): this requires `can` to still hold the 'diagnosis' column at this point.
Y = np.array(can['diagnosis']) # Target

# Hold out 20% of the rows for testing; random_state fixes the split for reproducibility.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 0)

# Show both shapes together: a bare expression that is not the last line of a cell
# is never displayed.
X_train.shape, Y_train.shape

In [None]:
# Model building :-
# Train a KNN classifier with 21 neighbours on the training split. fit() returns the
# estimator itself, so `KNN` and `knn` name the same fitted model.
knn = KNeighborsClassifier(n_neighbors = 21)
knn.fit(X_train,Y_train)
KNN = knn

In [None]:
# Evaluate the model with train data :-
# Predictions on the same rows the model was fitted on.

pred_train = knn.predict(X_train) # Predict on train data.
pred_train

In [None]:
# Training accuracy and cross table (actual vs predicted) :-
print(skmet.accuracy_score(Y_train,pred_train))

# crosstab names its axes with rownames/colnames. `columns` is the second positional
# argument (pred_train here), so passing it again by keyword raises a TypeError.
# The table is the last expression so that the notebook renders it.
pd.crosstab(Y_train,pred_train,rownames = ['Actual'],colnames = ['Predictions'])

In [None]:
# Predict the class on test data :-
pred = knn.predict(X_test)
pred

In [None]:
#Evaluate the model with test data :-
print(skmet.accuracy_score(Y_test,pred))

# colnames (not columns) names the prediction axis; printed because it is not the last
# expression of the cell.
print(pd.crosstab(Y_test,pred,rownames = ['Actual'],colnames = ['Predictions']))

# Compute the confusion matrix first, then wrap it in a display object. The display is
# kept in `cmplot`, the name the plotting calls below use.
# Labels are listed in sorted order ('Benign' before 'Malignant') to match the row and
# column order of confusion_matrix.
cm = skmet.confusion_matrix(Y_test,pred)
cmplot = skmet.ConfusionMatrixDisplay(confusion_matrix = cm,display_labels = ['Benign','Malignant'])
cmplot.plot()

cmplot.ax_.set(title = 'Cancer Detection - Confusion Matrix',xlabel = 'Predicted Value',ylabel = 'Actual Value')

In [None]:
# Empty list that collects one [diff, train_acc, test_acc] entry per tested value of k.
acc =[]

In [None]:
# Run KNN for the odd neighbour counts 3, 5, ..., 49 and store the accuracy values :-
# range(3,50,2): start = 3, stop = 50 (exclusive), step = 2.

# Start from an empty list on every run, so re-running this cell does not append a
# second copy of the results.
acc = []

for i in range(3,50,2):
    neigh = KNeighborsClassifier(n_neighbors = i)
    neigh.fit(X_train,Y_train)
    train_acc = np.mean(neigh.predict(X_train) == Y_train)
    test_acc = np.mean(neigh.predict(X_test) == Y_test)
    diff = train_acc-test_acc  # gap between train and test accuracy
    acc.append([diff,train_acc,test_acc])

# Display the collected results once, after the loop (inside the loop body the bare
# expression did nothing).
acc

In [None]:
# Accuracy against the number of neighbours :-
# The NumPy function is np.arange (single 'r'); np.arrange does not exist.
k_values = np.arange(3,50,2)

# Train data accuracy plot (red) :-
plt.plot(k_values,[i[1]for i in acc],"ro-",label = 'Train accuracy')

# Test data accuracy plot (blue) :-
plt.plot(k_values,[i[2]for i in acc],"bo-",label = 'Test accuracy')

plt.xlabel('Number of neighbours (k)')
plt.ylabel('Accuracy')
plt.legend();

In [None]:
# GridSearchCV runs a cross-validated search over a parameter grid.
# (The interactive help(knn) call was removed: it only printed the estimator's full
# documentation into the notebook output.)
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbour counts: the odd numbers 3, 5, ..., 49.
K_range = list(range(3,50,2))

# The key must match the estimator's parameter name exactly. KNeighborsClassifier takes
# `n_neighbors`; the spelling `n_neighbours` is rejected as an invalid parameter.
param_grid = dict(n_neighbors = K_range)

In [None]:
# Grid search over the parameter range with 5-fold cross-validation, scored on accuracy :-
grid = GridSearchCV(knn,param_grid,cv = 5, scoring = 'accuracy',return_train_score = False, verbose = 1)

knn_new = grid.fit(X_train,Y_train)
print(knn_new.best_params_)

# best_score_ is the best cross-validated score found by the search, scaled to percent.
accuracy = knn_new.best_score_*100

# .format() must be called on the string, not written inside it, and a two-decimal
# float spec is written {:.2f}.
print("Accuracy for our training dataset with tuning is: {:.2f}%".format(accuracy))

In [None]:
# Predict the class on test data with the tuned model, then count actual vs predicted
# classes in a confusion matrix.
pred = knn_new.predict(X_test)

cm = skmet.confusion_matrix(y_true = Y_test, y_pred = pred)

In [None]:
# Draw the confusion matrix with readable class names and axis titles.
class_names = ['Benign','Malignant']
cmplot = skmet.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_names)

cmplot.plot()

axis_text = dict(title = 'Cancer Detection - Confusion Matrix',
                 xlabel = 'Predicted Value',
                 ylabel = 'Actual Value')
cmplot.ax_.set(**axis_text)

In [None]:
# Save the model :-
# The fitted grid search is stored in `knn_new` (lower case); `KNN_new` is undefined.
knn_best = knn_new.best_estimator_

# The with-block closes the file even if pickling fails.
with open('knn.pkl','wb') as model_file:
    pickle.dump(knn_best,model_file)

In [None]:
# Show the current working directory, i.e. where 'knn.pkl' is written and read.
import os
os.getcwd()

In [None]:
# Load a saved model :-
# The file must be opened in binary read mode 'rb'; 'rd' is not a valid mode and
# raises a ValueError.
# NOTE: only unpickle files you created yourself - pickle can execute arbitrary code.
with open('knn.pkl','rb') as model_file:
    model = pickle.load(model_file)