# Breast Cancer Prediction

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default plot styling globally for any figures drawn later.
sns.set()

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split    
from sklearn.model_selection import GridSearchCV

In [3]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas SettingWithCopyWarning and
# scikit-learn convergence/deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

### Load the datasets

In [6]:
# Read both sheets through a context manager so the workbook handle is closed
# as soon as parsing finishes (a bare pd.ExcelFile keeps the file open).
# skiprows=[0] discards the first spreadsheet row (presumably a title/banner
# row — confirm against the workbook) so the header is read from the next row.
with pd.ExcelFile('./Dataset.xlsx') as xls:
    pathogenic_df = pd.read_excel(xls, sheet_name='Pathogenic', skiprows=[0])
    benign_df = pd.read_excel(xls, sheet_name='Benign', skiprows=[0])

In [7]:
# Preview the first five rows of the pathogenic sheet to check the parsed columns.
pathogenic_df.head()

Unnamed: 0,Brca1_Chromosome,Brca1_Reference,Brca1_Alternate,Brca2_Chromosome,Brca2_Reference,Brca2_Alternate,ATM_Chromosome,ATM_Reference,ATM_Alternate
0,17,C,T,13,G,A,11,T,C
1,17,A,G,13,G,A,11,G,A
2,17,G,C,13,GAT,G,11,G,A
3,17,A,G,13,AC,A,11,C,T
4,17,T,C,13,G,T,11,CA,C


In [8]:
# Preview the first five rows of the benign sheet to check the parsed columns.
benign_df.head()

Unnamed: 0,Brca1_Chromosome,Brca1_Reference,Brca1_Alternate,Brca2_Chromosome,Brca2_Reference,Brca2_Alternate,ATM_Chromosome,ATM_Reference,ATM_Alternate
0,17,G,C,13,C,G,11,A,G
1,17,G,C,13,T,C,11,A,G
2,17,T,G,13,C,T,11,G,A
3,17,A,C,13,C,T,11,G,T
4,17,G,T,13,A,G,11,G,C


## Checking Null Values

In [9]:
# Count missing values per column in the pathogenic sheet.
pathogenic_df.isna().sum()

Brca1_Chromosome    0
Brca1_Reference     0
Brca1_Alternate     0
Brca2_Chromosome    0
Brca2_Reference     0
Brca2_Alternate     0
ATM_Chromosome      0
ATM_Reference       0
ATM_Alternate       0
dtype: int64

In [10]:
# Count missing values per column in the benign sheet.
benign_df.isna().sum()

Brca1_Chromosome    0
Brca1_Reference     0
Brca1_Alternate     0
Brca2_Chromosome    0
Brca2_Reference     0
Brca2_Alternate     0
ATM_Chromosome      0
ATM_Reference       0
ATM_Alternate       0
dtype: int64

In [11]:
# Compare (rows, columns) of the two sheets before any filtering.
pathogenic_df.shape, benign_df.shape

((3205, 9), (3205, 9))

## Feature Engineering

### Pathogenic

In [12]:
# List the column names that the length filters below refer to.
pathogenic_df.columns

Index(['Brca1_Chromosome', 'Brca1_Reference', 'Brca1_Alternate',
       'Brca2_Chromosome', 'Brca2_Reference', 'Brca2_Alternate',
       'ATM_Chromosome', 'ATM_Reference', 'ATM_Alternate'],
      dtype='object')

In [13]:
# Keep only single-nucleotide substitutions: every reference/alternate allele
# must be exactly one character long, so multi-base entries such as 'GAT' or
# 'AC' are dropped. One loop replaces six copy-pasted filter cells.
allele_columns = [
    'Brca1_Reference', 'Brca1_Alternate',
    'Brca2_Reference', 'Brca2_Alternate',
    'ATM_Reference', 'ATM_Alternate',
]
for allele_column in allele_columns:
    pathogenic_df = pathogenic_df[pathogenic_df[allele_column].str.len() == 1]

# Take an explicit copy so that adding columns later works on an independent
# frame rather than a slice of the loaded sheet (avoids SettingWithCopyWarning).
# The original row index is intentionally retained.
pathogenic_df = pathogenic_df.copy()

In [19]:
# Sanity check: after filtering, every ATM_Alternate value should have length 1.
pathogenic_df['ATM_Alternate'].str.len().value_counts()

1    2321
Name: ATM_Alternate, dtype: int64

In [20]:
# Rows remaining in the pathogenic frame after the single-character filters.
pathogenic_df.shape

(2321, 9)

In [21]:
# Display the filtered frame; the original row index is retained, so gaps in
# the index are expected where rows were dropped.
pathogenic_df

Unnamed: 0,Brca1_Chromosome,Brca1_Reference,Brca1_Alternate,Brca2_Chromosome,Brca2_Reference,Brca2_Alternate,ATM_Chromosome,ATM_Reference,ATM_Alternate
0,17,C,T,13,G,A,11,T,C
1,17,A,G,13,G,A,11,G,A
5,17,G,C,13,G,A,11,C,T
6,17,C,T,13,G,A,11,G,A
9,17,G,A,13,A,C,11,C,G
...,...,...,...,...,...,...,...,...,...
3191,17,G,T,13,A,G,11,C,T
3193,17,T,A,13,A,G,11,C,T
3194,17,T,C,13,C,T,11,G,A
3199,17,G,T,13,A,G,11,C,T


### Benign

In [22]:
# List the benign sheet's column names (same schema as the pathogenic sheet).
benign_df.columns

Index(['Brca1_Chromosome', 'Brca1_Reference', 'Brca1_Alternate',
       'Brca2_Chromosome', 'Brca2_Reference', 'Brca2_Alternate',
       'ATM_Chromosome', 'ATM_Reference', 'ATM_Alternate'],
      dtype='object')

In [23]:
# Keep only single-nucleotide substitutions in the benign sheet: every
# reference/alternate allele must be exactly one character long.
# One loop replaces six copy-pasted filter cells.
allele_columns = [
    'Brca1_Reference', 'Brca1_Alternate',
    'Brca2_Reference', 'Brca2_Alternate',
    'ATM_Reference', 'ATM_Alternate',
]
for allele_column in allele_columns:
    benign_df = benign_df[benign_df[allele_column].str.len() == 1]

# Take an explicit copy so that adding columns later works on an independent
# frame rather than a slice of the loaded sheet (avoids SettingWithCopyWarning).
# The original row index is intentionally retained.
benign_df = benign_df.copy()

In [29]:
# Rows remaining in the benign frame after the single-character filters.
benign_df.shape

(2609, 9)

### Adding result column

In [30]:
# Label the classes: 0 = benign, 1 = pathogenic.
# DataFrame.assign returns a new frame, so the label is never written into a
# filtered slice of the loaded sheet (which raises SettingWithCopyWarning), and
# re-running this cell is idempotent.
benign_df = benign_df.assign(result=0)
pathogenic_df = pathogenic_df.assign(result=1)

### Merging dataset

In [31]:
# Stack the benign rows first, then the pathogenic rows, and renumber the
# index from zero so the retained per-sheet indices do not collide.
labelled_frames = [benign_df, pathogenic_df]
df = pd.concat(labelled_frames, ignore_index=True)

In [32]:
# Expect benign rows + pathogenic rows, and one extra column for the label.
df.shape

(4930, 10)

### Separating Dependent and Independent Variables 

In [33]:
# Separate the label vector from the feature matrix.
y = df['result']
X = df.drop(columns=['result'])

## Train Test Split

In [34]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

## Converting Categorical Variables into Numerical Features

In [35]:
# One-hot encoder with default settings: the output is a sparse matrix, and
# categories not seen during fit raise an error at transform time.
ohe = OneHotEncoder()

In [36]:
# Learn the category vocabulary from the training split only, so the test
# split does not influence the encoding.
ohe.fit(X_train)

OneHotEncoder()

In [37]:
# Encode the training features.
ohe_X_train = ohe.transform(X_train)

In [38]:
# Encode the held-out features with the categories learned from the training split.
ohe_X_test = ohe.transform(X_test)

In [39]:
# Inspect the learned categories, one array per input column. The three
# chromosome columns each hold a single constant value.
ohe.categories_

[array([17], dtype=int64),
 array(['A', 'C', 'G', 'T'], dtype=object),
 array(['A', 'C', 'G', 'T'], dtype=object),
 array([13], dtype=int64),
 array(['A', 'C', 'G', 'T'], dtype=object),
 array(['A', 'C', 'G', 'T'], dtype=object),
 array([11], dtype=int64),
 array(['A', 'C', 'G', 'T'], dtype=object),
 array(['A', 'C', 'G', 'T'], dtype=object)]

## Model Training

In [40]:
# Candidate estimators and the hyperparameter grid searched for each one.
# Keys are display names used in the results table; an empty 'params' dict
# means the estimator is cross-validated with its defaults only.
# The tree-based estimators are randomised, so they get a fixed random_state to
# keep the grid-search scores reproducible between runs.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {},
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {},
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
}

In [41]:
# The one-hot output is sparse, and some estimators in the grid (e.g. GaussianNB)
# need a dense array. The conversion does not depend on the model, so do it
# once here instead of on every loop iteration.
X_train_dense = ohe_X_train.toarray()

# Run a 5-fold grid search per candidate and record its best mean CV score.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train_dense, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# One row per candidate model, displayed as the cell output.
score_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
score_df

Unnamed: 0,model,best_score,best_params
0,svm,0.573781,"{'C': 20, 'kernel': 'rbf'}"
1,random_forest,0.552735,{'n_estimators': 5}
2,logistic_regression,0.557303,{'C': 1}
3,naive_bayes_gaussian,0.558576,{}
4,naive_bayes_multinomial,0.559842,{}
5,decision_tree,0.548422,{'criterion': 'entropy'}


In [44]:
# Hyperparameters copied by hand from the best row of the grid-search table.
# NOTE(review): they are hard-coded, so they will not follow a re-run of the
# search if a different model or setting wins.
model = svm.SVC(gamma='auto',C=20,kernel='rbf')

In [45]:
# Fit on the one-hot encoded training split (the sparse matrix is passed directly).
model.fit(ohe_X_train,y_train)

SVC(C=20, gamma='auto')

In [46]:
# Predict labels for the held-out split.
prediction = model.predict(ohe_X_test)

In [47]:
# Report held-out accuracy as a percentage.
test_accuracy_pct = accuracy_score(y_test, prediction) * 100
print('Accuracy Score : ', test_accuracy_pct, "%")

Accuracy Score :  53.85395537525355 %


## Training on Complete Dataset

In [48]:
# Encode the full dataset with the encoder that was fitted on the training split.
ohe_X = ohe.transform(X)

In [49]:
# Fresh estimator with the same hyperparameters for the final fit; this rebinds
# `model`, replacing the estimator that was evaluated on the test split.
model = svm.SVC(gamma='auto',C=20,kernel='rbf')

In [50]:
# Fit on every row (train + test). From here on, X_test is no longer unseen
# data for `model`.
model.fit(ohe_X,y)

SVC(C=20, gamma='auto')

## Model Saving

In [51]:
import joblib

In [52]:
# Persist the final classifier to model.pkl in the working directory.
joblib.dump(model,'model.pkl')

['model.pkl']

In [53]:
# Persist the fitted encoder as well, so new inputs can be encoded with the
# same categories the model was trained on.
joblib.dump(ohe,'encoder.pkl')

['encoder.pkl']

## Prediction Function

In [54]:
# Show the distinct values present in each feature column.
for col, column_values in X.items():
    print(col, " \t", column_values.unique())

Brca1_Chromosome  	 [17]
Brca1_Reference  	 ['G' 'T' 'A' 'C']
Brca1_Alternate  	 ['C' 'G' 'T' 'A']
Brca2_Chromosome  	 [13]
Brca2_Reference  	 ['C' 'T' 'A' 'G']
Brca2_Alternate  	 ['G' 'C' 'T' 'A']
ATM_Chromosome  	 [11]
ATM_Reference  	 ['A' 'G' 'T' 'C']
ATM_Alternate  	 ['G' 'A' 'T' 'C']


In [58]:
# Display the held-out feature rows (pandas truncates the middle of the frame).
X_test

Unnamed: 0,Brca1_Chromosome,Brca1_Reference,Brca1_Alternate,Brca2_Chromosome,Brca2_Reference,Brca2_Alternate,ATM_Chromosome,ATM_Reference,ATM_Alternate
1288,17,G,A,13,G,A,11,A,G
983,17,C,A,13,T,G,11,G,C
4319,17,G,A,13,G,A,11,C,T
4544,17,C,A,13,G,T,11,T,C
3905,17,A,G,13,T,C,11,A,G
...,...,...,...,...,...,...,...,...,...
1522,17,C,T,13,C,T,11,G,A
528,17,A,G,13,G,A,11,T,C
1984,17,C,T,13,G,A,11,C,T
235,17,A,C,13,A,G,11,G,C


In [56]:
# NOTE(review): `model` was refit on the full dataset, which includes these
# rows, so these predictions are not a held-out evaluation.
model.predict(ohe_X_test)

array([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,