In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

df = pd.read_csv('breast-cancer.csv')
cols = df.columns
# Remore missing values
df2 = df[df['node-caps'] != '?']
clean_data = df2[df2['breast-quad'] != '?']

cols

Index(['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
       'breast', 'breast-quad', 'irradiat', 'Class'],
      dtype='object')

In [5]:


# Taking each categorical column and Label encoding for ML processing
age_le = preprocessing.LabelEncoder()
age_le.fit(clean_data['age'])
age_transform = age_le.transform(clean_data['age'])

meno_le = preprocessing.LabelEncoder()
meno_le.fit(clean_data['menopause'])
meno_transform = meno_le.transform(clean_data['menopause'])

tumor_le = preprocessing.LabelEncoder()
tumor_le.fit(clean_data['tumor-size'])
tumor_transform = tumor_le.transform(clean_data['tumor-size'])

inv_le = preprocessing.LabelEncoder()
inv_le.fit(clean_data['inv-nodes'])
inv_transform = inv_le.transform(clean_data['inv-nodes'])

node_le = preprocessing.LabelEncoder()
node_le.fit(clean_data['node-caps'])
node_transform = node_le.transform(clean_data['node-caps'])

breast_le = preprocessing.LabelEncoder()
breast_le.fit(clean_data['breast'])
breast_transform = breast_le.transform(clean_data['breast'])

breastq_le = preprocessing.LabelEncoder()
breastq_le.fit(clean_data['breast-quad'])
breastq_transform = breastq_le.transform(clean_data['breast-quad'])

irradiat_le = preprocessing.LabelEncoder()
irradiat_le.fit(clean_data['irradiat'])
irradiat_transform = irradiat_le.transform(clean_data['irradiat'])

# transformed categorical data sets combined with deg-malig to create final data set for ML
ds_transform = pd.DataFrame(
    {'age': age_transform,
     'menopause': meno_transform,
     'tumor-size': tumor_transform,
     'inv-nodes': inv_transform,
     'node-caps': node_transform,
     'deg-malig': clean_data['deg-malig'],
     'breast': breast_transform,
     'breast-quad': breastq_transform,
     'irradiat': irradiat_transform,
     'Class': clean_data['Class'],
    })

# Move Class column to the end
cols = list(ds_transform.columns.values)
cols.pop(cols.index('Class'))
ds_transform = ds_transform[cols+['Class']]

# Seperate attributes from Class and create train/test split
X = ds_transform.values[:, 0:9]
y = ds_transform.values[:,9]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5) 


In [6]:
from sklearn import ensemble
import sklearn.pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Use Kbest and randomforest pipeline to classify
select = sklearn.feature_selection.SelectKBest(f_classif,k='all')
clf = sklearn.ensemble.RandomForestClassifier()

steps = [('feature_selection', select),
        ('random_forest', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_train, y_train)

print('Accuracy is ', score * 100)
y_pred = pipeline.predict(X_test)
report = sklearn.metrics.classification_report(y_test, y_pred)
print(report)

Accuracy is  96.7567567568
                      precision    recall  f1-score   support

no-recurrence-events       0.78      0.92      0.84        64
   recurrence-events       0.69      0.39      0.50        28

         avg / total       0.75      0.76      0.74        92



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# GridsearchCV with knn
parameters = {'n_neighbors':[1,10],
             'algorithm':['auto', 'ball_tree','kd_tree'],
             }
knn = KNeighborsClassifier()

clf = GridSearchCV(knn, parameters, scoring='accuracy')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)






In [7]:
ohe =pd.get_dummies(clean_data, columns=['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
       'breast', 'breast-quad', 'irradiat']).head()
cols = list(ohe.columns.values)
cols.pop(cols.index('Class'))
ohe = ohe[cols+['Class']]

X = ohe.values[:, 0:9]
y = ohe.values[:,9]

# Use Kbest and randomforest pipeline to classify
select = sklearn.feature_selection.SelectKBest(f_classif,k=7)
clf = sklearn.ensemble.RandomForestClassifier()

steps = [('feature_selection', select),
        ('random_forest', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_train, y_train)

print('Accuracy is ', score * 100)
y_pred = pipeline.predict(X_test)
report = sklearn.metrics.classification_report(y_test, y_pred)
print(report)

Accuracy is  95.1351351351
                      precision    recall  f1-score   support

no-recurrence-events       0.75      0.88      0.81        64
   recurrence-events       0.53      0.32      0.40        28

         avg / total       0.68      0.71      0.68        92

