In [3]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# Column labels for the CSV; they are passed to read_csv via `names=`.
winsconsin_headers = [
    'sample_code',
    'c_thickness',
    'uni_cell_size',
    'uni_cell_shape',
    'marg_adhesion',
    'epi_cell_size',
    'nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'tumor_class',
]

# Load the raw data with the labels declared above.
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv", names=winsconsin_headers)
print(wins_data.shape)

# Remove the sample identifier column before any numeric processing.
wins_data = wins_data.drop(columns='sample_code')
print(wins_data.shape)

# Columns that remain after the drop: everything except the leading
# 'sample_code' entry. Derived from the full list so the two cannot drift.
new_winsconsin_headers = winsconsin_headers[1:]

# Coerce every column to a numeric dtype; any entry that cannot be parsed
# as a number (e.g. a '?' placeholder) becomes NaN instead of raising.
wins_data = wins_data.apply(pd.to_numeric, errors='coerce')
# print(wins_data.info())  # uncomment to inspect the dtypes after coercion

# NaN cannot live in an integer column, so store every column as float.
# astype(float) is the vectorised replacement for the deprecated
# DataFrame.applymap(float).
wins_data = wins_data.astype(float)
# print(wins_data.dtypes)

# Replace missing values with the per-column median.
# (Swap in 'mean' or 'most_frequent' to compare strategies.)
# NOTE(review): the imputer is fitted on every row and on 'tumor_class' too,
# i.e. before the train/test split further down. Consider fitting it on the
# training features only to avoid leaking test-set statistics.
imputer = SimpleImputer(strategy='median')
new_data = imputer.fit_transform(wins_data)

# The imputer returns a plain array; wrap it back into a labelled frame.
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

# Sanity check: rows that still contain a missing value (expected: none).
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
# print('\n These are the missing data \n ', win_empty_data)

# Label column (what the model predicts) and feature columns (what it sees).
target_header = ['tumor_class']
train_headers = [
    'c_thickness',
    'uni_cell_size',
    'uni_cell_shape',
    'marg_adhesion',
    'epi_cell_size',
    'nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
]

# Slice the cleaned frame into the label frame and the feature matrix.
y = wins_data[target_header]
X = wins_data[train_headers]

# 60:40 train/test split. The seed is fixed so the partition is repeatable,
# and stratifying on y keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
    stratify=y,
)

# Report the dimensions of the two partitions.
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

# Instantiate the classifier. The seed is fixed (matching the one used for
# the train/test split) because the tree permutes features at each split;
# without it, repeated runs can report different metrics.
my_model = DecisionTreeClassifier(random_state=1)

# Fit the tree on the training partition.
my_model.fit(X_train, y_train)

# Predict on both partitions so train and test performance can be compared.
y_pred_train = my_model.predict(X_train)
y_pred = my_model.predict(X_test)

# Accuracy on the data the model was trained on.
model_acc = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train data: {:.2f}".format(model_acc), '\n')

# Accuracy on the held-out test data.
model_accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on Test data: {:.2f}".format(model_accuracy), '\n')

# Confusion matrix for the test predictions
# (rows: true class, columns: predicted class).
matrix_info = confusion_matrix(y_test, y_pred)
print("The Confusion Matrix: \n", matrix_info, '\n')

# Per-class precision, recall and F1 for the test predictions.
class_report = classification_report(y_test, y_pred)
print("Report of classification: \n", class_report)

(699, 11)
(699, 10)

 The total of training dataset (419, 9)

 The total of test dataset (280, 9)
Model accuracy on Train data: 1.00 

Model accuracy on Test data: 0.94 

The Confusion Matrix: 
 [[180   3]
 [ 14  83]] 

Report of classification: 
               precision    recall  f1-score   support

         2.0       0.93      0.98      0.95       183
         4.0       0.97      0.86      0.91        97

    accuracy                           0.94       280
   macro avg       0.95      0.92      0.93       280
weighted avg       0.94      0.94      0.94       280

