In [2]:
import pandas as pd
import numpy as np

# Load the raw table, then discard rows whose 'node-caps' or 'breast-quad'
# value is the '?' placeholder (presumably marking a missing value).
df = pd.read_csv('breast-cancer.csv')

node_caps_known = df['node-caps'] != '?'
df2 = df[node_caps_known]

breast_quad_known = df2['breast-quad'] != '?'
clean_data = df2[breast_quad_known]

# Only the final expression of a cell is rendered, so this preview is
# evaluated but not shown.
clean_data.head()
clean_data['age'].value_counts(sort=True)

50-59    91
40-49    89
60-69    55
30-39    36
70-79     5
20-29     1
Name: age, dtype: int64

In [3]:
from sklearn import preprocessing

# Label-encode each categorical column for ML processing.
# Every column gets its own fitted encoder (kept in <name>_le) plus the
# integer-coded column it produced (kept in <name>_transform).
def _fit_label_encoder(column):
    """Fit a fresh LabelEncoder on `column`.

    Returns a `(encoder, codes)` pair: the fitted encoder and the encoded
    values of `column` in the same row order.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(column)
    return encoder, codes


age_le, age_transform = _fit_label_encoder(clean_data['age'])
meno_le, meno_transform = _fit_label_encoder(clean_data['menopause'])
tumor_le, tumor_transform = _fit_label_encoder(clean_data['tumor-size'])
inv_le, inv_transform = _fit_label_encoder(clean_data['inv-nodes'])
node_le, node_transform = _fit_label_encoder(clean_data['node-caps'])
breast_le, breast_transform = _fit_label_encoder(clean_data['breast'])
breastq_le, breastq_transform = _fit_label_encoder(clean_data['breast-quad'])
irradiat_le, irradiat_transform = _fit_label_encoder(clean_data['irradiat'])



In [4]:
# Assemble the modelling table: the label-encoded categorical columns plus
# 'deg-malig' and 'Class', which are taken from clean_data without encoding.
model_columns = {
    'age': age_transform,
    'menopause': meno_transform,
    'tumor-size': tumor_transform,
    'inv-nodes': inv_transform,
    'node-caps': node_transform,
    'deg-malig': clean_data['deg-malig'],
    'breast': breast_transform,
    'breast-quad': breastq_transform,
    'irradiat': irradiat_transform,
    'Class': clean_data['Class'],
}
ds_transform = pd.DataFrame(model_columns)

In [5]:
# Reorder the columns so that the target, 'Class', comes last
cols = ds_transform.columns.tolist()
cols.remove('Class')
ds_transform = ds_transform[cols + ['Class']]
ds_transform

Unnamed: 0,age,breast,breast-quad,deg-malig,inv-nodes,irradiat,menopause,node-caps,tumor-size,Class
0,2,1,2,3,0,0,2,1,2,recurrence-events
1,3,1,0,1,0,0,0,0,2,no-recurrence-events
2,3,0,1,2,0,0,0,0,6,recurrence-events
3,2,1,1,3,0,1,2,1,6,no-recurrence-events
4,2,0,4,2,4,0,2,1,5,recurrence-events
5,3,1,2,2,4,1,2,0,4,no-recurrence-events
6,3,0,2,3,0,0,0,0,7,no-recurrence-events
7,2,0,2,2,0,0,2,0,1,no-recurrence-events
8,2,1,3,2,0,0,2,0,0,no-recurrence-events
9,2,1,2,2,2,1,0,1,7,no-recurrence-events


In [6]:
# split data set into attributes and target
from sklearn import tree, linear_model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from matplotlib import pyplot as plt

# Separate features from target by column name instead of by position, so the
# split does not depend on how many columns there are or where 'Class' sits.
# Taking the features from a frame without the string 'Class' column also
# keeps X numeric rather than an object-dtype array.
X = ds_transform.drop('Class', axis=1).values
y = ds_transform['Class'].values

# Hold out 33% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5)


In [7]:
# Fit a decision tree (entropy split criterion, fixed seed) on the training split
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='entropy', random_state=50).fit(X_train, y_train)

In [8]:
# Predict on the held-out split and report the decision tree's accuracy in percent
y_pred = dt.predict(X_test)
dt_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy is ', dt_accuracy_pct)

Accuracy is  67.3913043478


In [9]:
# k-nearest-neighbours classifier (k=3), fitted on a fresh 60/40 split.
# Note: this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=125
)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

In [10]:
# Predict on the held-out split and report the KNN accuracy in percent
knn_y_pred = knn.predict(X_test)
knn_accuracy_pct = accuracy_score(y_test, knn_y_pred) * 100
print('Accuracy is ', knn_accuracy_pct)

Accuracy is  75.6756756757


In [11]:
# SelectKBest (chi-squared scoring) keeps the 5 highest-scoring features, then
# a 3-nearest-neighbours classifier is trained and scored on those features.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=12)

# Fit the selector on the training split ONLY and reuse it for the test split.
# Fitting a second selector on (X_test, y_test) would use the test labels and
# could keep a different set of columns than the classifier was trained on.
kbest_chi2 = SelectKBest(chi2, k=5)
X_ktrain = kbest_chi2.fit_transform(X_train, y_train)
X_ktest = kbest_chi2.transform(X_test)

# knn
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(X_ktrain, y_train)

# knn accuracy with Kbest
knn_y_pred = knn.predict(X_ktest)
print('Accuracy is ', accuracy_score(y_test, knn_y_pred)*100)

Accuracy is  72.0720720721


In [29]:
# SelectKBest with ANOVA F-value scoring; k='all' keeps every feature, so this
# cell trains the decision tree on the full feature set passed through the
# selector.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=12)

# Fit the selector on the training split only and apply that same fitted
# selector to the test split, so the test labels are never used for selection
# and both splits are guaranteed to keep the same columns for any value of k.
kbest_f = SelectKBest(f_classif, k='all')
X_ktrain = kbest_f.fit_transform(X_train, y_train)
X_ktest = kbest_f.transform(X_test)

# decision tree
dt = tree.DecisionTreeClassifier(criterion='entropy', random_state=50)
dt = dt.fit(X_ktrain, y_train)

# decision tree prediction accuracy using KBest
print('Accuracy is ', dt.score(X_ktest, y_test)*100)


Accuracy is  64.8648648649
