**Importing the Dependencies**

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

**About Dataset**

The data set contains patient records from a 1984-1989 trial conducted by the German Breast Cancer Study Group (GBSG) of 720 patients with node positive breast cancer; it retains the 686 patients with complete data for the prognostic variables.

These data sets are used in the paper by Royston and Altman(2013). The Rotterdam data is used to create a fitted model, and the GBSG data for validation of the model. The paper gives references for the data source.

**Dataset Format**

A data set with 686 observations and 11 variables.

Columns    |   	Description
-----------|---------------
pid	       | patient identifier
age	       | age, years
meno	     | menopausal status (0= premenopausal, 1= postmenopausal)
size	     | tumor size, mm
grade	     | tumor grade
nodes	     | number of positive lymph nodes
pgr	       | progesterone receptors (fmol/l)
er	       | estrogen receptors (fmol/l)
hormon	   | hormonal therapy, 0= no, 1= yes
rfstime	   | recurrence free survival time; days to first of recurrence, death or last follow-up
status	   | 0= alive without recurrence, 1= recurrence or death

**References**

Patrick Royston and Douglas Altman, External validation of a Cox prognostic model: principles and methods. BMC Medical Research Methodology 2013, 13:33

**Data Collection and Processing**

In [14]:
# loading the csv data to a Pandas DataFrame
data = pd.read_csv('gbsg.csv')

In [15]:
# print first 5 rows of the dataset
data.head()

Unnamed: 0.1,Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status
0,1,132,49,0,18,2,2,0,0,0,1838,0
1,2,1575,55,1,20,3,16,0,0,0,403,1
2,3,1140,56,1,40,3,3,0,0,0,1603,0
3,4,769,45,0,25,3,1,0,4,0,177,0
4,5,130,65,1,30,2,5,0,36,1,1855,0


In [16]:
# number of rows and columns in the dataset
data.shape

(686, 12)

In [17]:
data.columns

Index(['Unnamed: 0', 'pid', 'age', 'meno', 'size', 'grade', 'nodes', 'pgr',
       'er', 'hormon', 'rfstime', 'status'],
      dtype='object')

In [18]:
# getting some info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  686 non-null    int64
 1   pid         686 non-null    int64
 2   age         686 non-null    int64
 3   meno        686 non-null    int64
 4   size        686 non-null    int64
 5   grade       686 non-null    int64
 6   nodes       686 non-null    int64
 7   pgr         686 non-null    int64
 8   er          686 non-null    int64
 9   hormon      686 non-null    int64
 10  rfstime     686 non-null    int64
 11  status      686 non-null    int64
dtypes: int64(12)
memory usage: 64.4 KB


In [19]:
# removing less relevant parameters from the dataset.
data = data.drop(columns={'Unnamed: 0','pid'},axis=1)

In [20]:
# checking for missing values
data.isnull().sum()

age        0
meno       0
size       0
grade      0
nodes      0
pgr        0
er         0
hormon     0
rfstime    0
status     0
dtype: int64

In [21]:
# checking number of rows and columns in the dataset again
data.shape

(686, 10)

In [22]:
# statistical measures about the data
data.describe()

Unnamed: 0,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status
count,686.0,686.0,686.0,686.0,686.0,686.0,686.0,686.0,686.0,686.0
mean,53.052478,0.577259,29.329446,2.116618,5.010204,109.995627,96.252187,0.358601,1124.489796,0.43586
std,10.120739,0.494355,14.296217,0.582808,5.475483,202.331552,153.083963,0.47994,642.791948,0.496231
min,21.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,8.0,0.0
25%,46.0,0.0,20.0,2.0,1.0,7.0,8.0,0.0,567.75,0.0
50%,53.0,1.0,25.0,2.0,3.0,32.5,36.0,0.0,1084.0,0.0
75%,61.0,1.0,35.0,2.0,7.0,131.75,114.0,1.0,1684.75,1.0
max,80.0,1.0,120.0,3.0,51.0,2380.0,1144.0,1.0,2659.0,1.0


In [23]:
# checking the distribution of Target Variable
data['status'].value_counts()

status
0    387
1    299
Name: count, dtype: int64

0 --> alive without recurrence

1 --> recurrence or death

**Splitting the Features and Target**

In [24]:
X = data.drop(columns='status', axis=1)
Y = data['status']

In [25]:
print(X)

     age  meno  size  grade  nodes   pgr    er  hormon  rfstime
0     49     0    18      2      2     0     0       0     1838
1     55     1    20      3     16     0     0       0      403
2     56     1    40      3      3     0     0       0     1603
3     45     0    25      3      1     0     4       0      177
4     65     1    30      2      5     0    36       1     1855
..   ...   ...   ...    ...    ...   ...   ...     ...      ...
681   51     0    30      3      2  1152    38       1     1760
682   64     1    26      2      2  1356  1144       1     1152
683   57     1    35      3      1  1490   209       1     1342
684   44     0    21      2      3  1600    70       0      629
685   80     1     7      2      7  2380   972       1      758

[686 rows x 9 columns]


In [26]:
print(Y)

0      0
1      1
2      0
3      0
4      0
      ..
681    0
682    0
683    0
684    0
685    0
Name: status, Length: 686, dtype: int64


**Splitting the Data into Training data & Test Data**

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=2)

In [28]:
print(X.shape, X_train.shape, X_test.shape)

(686, 9) (617, 9) (69, 9)


**Model Training**

*Logistic Regression*

In [29]:
model = LogisticRegression()

In [30]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

*Naive Bayes*

In [31]:
gnb = GaussianNB()

In [32]:
# training the Naive Bayes model with Training data
gnb.fit(X_train, Y_train)

*Decision Tree Classifier*

In [33]:
criterions = ['gini', 'entropy']
parameters = dict(criterion=criterions)
dtc = GridSearchCV(
    DecisionTreeClassifier(), parameters, cv=5, scoring='accuracy'
)
dtc.fit(X, Y.ravel())
dtc_opt = dtc.best_estimator_
print(dtc.best_params_)
print(dtc.best_score_)

{'criterion': 'gini'}
0.50880143869671


In [34]:
dtc = DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, Y_train.ravel())
dtc_pred = dtc.predict(X_test)
score = accuracy_score(dtc_pred, Y_test)
print(score)

0.6666666666666666


*Random Forest Classifier*

In [35]:
parameters = {
    'n_estimators': [10, 100, 250, 500]
}
rfc = GridSearchCV(
    RandomForestClassifier(), parameters, cv=5, scoring='accuracy'
)
rfc.fit(X, Y.ravel())
rfc_opt = rfc.best_estimator_
print(rfc.best_params_)
print(rfc.best_score_)

{'n_estimators': 500}
0.49134666243520575


In [36]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, Y_train.ravel())
rfc_pred = rfc.predict(X_test)
score = accuracy_score(rfc_pred, Y_test)
print(score)

0.7971014492753623


*Support Vector Machine*

In [37]:
#svm_model = svm.SVC(kernel='linear')

In [38]:
# training the SVM model with training data
#svm_model.fit(X_train, Y_train)

**Model Evaluation**

**Accuracy Score**

In [39]:
# accuracy on training data in Logistic Regression
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [40]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.7147487844408428


In [41]:
# accuracy on test data in Logistic Regression
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [42]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.7681159420289855


In [43]:
# accuracy on training data in Naive Bayes
X_tr_predict_nb = gnb.predict(X_train)
training_data_accuracy_nb = accuracy_score(X_tr_predict_nb, Y_train)

In [44]:
print('Accuracy on Training data : ', training_data_accuracy_nb)

Accuracy on Training data :  0.6920583468395461


In [45]:
# accuracy on test data in Naive Bayes
X_test_predict_nb = gnb.predict(X_test)
test_data_accuracy_nb = accuracy_score(X_test_predict_nb, Y_test)

In [46]:
print('Accuracy on Test data : ', test_data_accuracy_nb)

Accuracy on Test data :  0.6811594202898551


In [47]:
# accuracy on training data in Decision Tree Classifier
X_train_prediction_dtc = dtc.predict(X_train)
training_data_accuracy_dtc = accuracy_score(X_train_prediction_dtc, Y_train)

In [48]:
print('Accuracy on Training data : ', training_data_accuracy_dtc)

Accuracy on Training data :  1.0


In [49]:
# accuracy on testing data in Decision Tree Classifier
X_test_prediction_dtc = dtc.predict(X_test)
test_data_accuracy_dtc = accuracy_score(X_test_prediction_dtc, Y_test)

In [50]:
print('Accuracy on Test data : ', test_data_accuracy_dtc)

Accuracy on Test data :  0.6666666666666666


In [51]:
# accuracy on training data in Random Forest Classifier
X_train_prediction_rfc = rfc.predict(X_train)
training_data_accuracy_rfc = accuracy_score(X_train_prediction_rfc, Y_train)

In [52]:
print('Accuracy on Training data : ', training_data_accuracy_rfc)

Accuracy on Training data :  1.0


In [53]:
# accuracy on testing data in Random Forest Classifier
X_test_prediction_rfc = rfc.predict(X_test)
test_data_accuracy_rfc = accuracy_score(X_test_prediction_rfc, Y_test)

In [54]:
print('Accuracy on Test data : ', test_data_accuracy_rfc)

Accuracy on Test data :  0.7971014492753623


In [55]:
# accuracy score on testing data in SVM
#X_test_prediction_svm = svm_model.predict(X_test)
#test_data_accuracy_svm = accuracy_score(X_test_prediction_svm, Y_test)

In [56]:
#print('Accuracy score of test data : ', test_data_accuracy_svm)

So, It's better to work with Random Forest Classifier in this case.

Building a Predictive System

In [57]:
input_data = (61,1,50,2,4,10,10,0,2456)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = rfc.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('alive without recurrence')
else:
  print('recurrence or death')

[1]
recurrence or death


**Saving The Trained Model**

In [58]:
import pickle

In [59]:
file = 'trained_model_breast_cancer.sav'
pickle.dump(rfc, open(file, 'wb'))

In [60]:
#loading the saved model
loaded_model = pickle.load(open('trained_model_breast_cancer.sav', 'rb'))

In [61]:
input_data = (61,1,50,2,4,10,10,0,2456)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('alive without recurrence')
else:
  print('recurrence or death')

[1]
recurrence or death
