In [49]:
# import libraries
import pandas as pd
pd.set_option('display.max_colwidth', None) #setting max colwidth to view the entire dataset when using the print() command
import matplotlib.pyplot as plt
import numpy as np


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from IPython.display import display

# import the files
data = open("./dataset/breast-cancer.data")
feat = open("./dataset/breast-cancer.names")

data = data.read()
feat = feat.read()

# replace missing dataset attributes to NAN
data = data.replace('?','')

from io import StringIO

# convert data from str to dataframe
data = StringIO(data)
data = pd.read_csv(data, sep=",")

data.columns = ['class', 'age', 'menopause', 'tumour_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irrad']

data['class'] = data['class'].replace(['no-recurrence-events','recurrence-events'], [0,1])
data['age'] = data['age'].replace(['20-29', '30-39','40-49','50-59','60-69','70-79'],[0,1,2,3,4,5])
data['menopause'] = data['menopause'].replace(['premeno','ge40','lt40'],[0,1,2])
data['tumour_size'] = data['tumour_size'].replace(['0-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54'],[0,1,2,3,4,5,6,7,8,9,10])
data['inv_nodes'] = data['inv_nodes'].replace(['0-2','3-5','6-8','9-11','12-14','15-17','24-26'],[0,1,2,3,4,5,6])
data['node_caps'] = data['node_caps'].replace(['no','yes'],[0,1])
data['breast'] = data['breast'].replace(['left','right'],[0,1])
data['breast_quad'] = data['breast_quad'].replace(['left_low','left_up','right_up','right_low','central'],[0,1,2,3,4])
data['irrad'] = data['irrad'].replace(['no','yes'],[0,1])

# replace missing data 
data['node_caps'] = data['node_caps'].fillna(5)
data['breast_quad'] = data['breast_quad'].fillna(8)

X = data.drop(['class'], axis = 1)
y = data['class']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_train['size_agg'] = X_train['tumour_size'] + X_train['deg_malig'] 

X_test['size_agg'] = X_test['tumour_size'] + X_test['deg_malig']

In [50]:
"""
def model(X_train, X_test, y_train, y_test, feat, feat2, feat3):

    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
    model = DecisionTreeClassifier() 

    model.fit(X_resampled, y_resampled)

    y_score = model.predict_proba(X_test)[:, 1]

    y_pred = model.predict(X_test)

    acc = model.score(X_train, y_train)
    f1 = f1_score(y_test, y_pred, average='binary')
    roc = roc_auc_score(y_test, y_score)
    rec = recall_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)

    if f1 > 0.68:
        print('\n','Analysed feature:',feat,'//', feat2,'//', feat3)
        print(X_train.columns)
        print(ros)
        print('Acc:',acc)
        print('F1:',f1)
        print('ROC-AUC:',roc)
        print('Recc:',rec)
        print('Prec:',pre)  
   

"""

"\ndef model(X_train, X_test, y_train, y_test, feat, feat2, feat3):\n\n    ros = RandomOverSampler(random_state=42)\n    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)\n    model = DecisionTreeClassifier() \n\n    model.fit(X_resampled, y_resampled)\n\n    y_score = model.predict_proba(X_test)[:, 1]\n\n    y_pred = model.predict(X_test)\n\n    acc = model.score(X_train, y_train)\n    f1 = f1_score(y_test, y_pred, average='binary')\n    roc = roc_auc_score(y_test, y_score)\n    rec = recall_score(y_test, y_pred)\n    pre = precision_score(y_test, y_pred)\n\n    if f1 > 0.68:\n        print('\n','Analysed feature:',feat,'//', feat2,'//', feat3)\n        print(X_train.columns)\n        print(ros)\n        print('Acc:',acc)\n        print('F1:',f1)\n        print('ROC-AUC:',roc)\n        print('Recc:',rec)\n        print('Prec:',pre)  \n   \n\n"

In [51]:
"""
features = ['age', 'menopause', 'tumour_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irrad']
def mix(features, X_train, X_test, y_train, y_test):
    for feat in features:
        for feat2 in features:
            for feat3 in features:
                important = ['tumour_size', 'breast_quad', 'size_agg']
                important.append(feat)
                important.append(feat2)
                important.append(feat3)
                X_train_n = X_train[important]
                X_test_n = X_test[important]
                model(X_train_n, X_test_n, y_train, y_test, feat, feat2, feat3)

"""

"\nfeatures = ['age', 'menopause', 'tumour_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irrad']\ndef mix(features, X_train, X_test, y_train, y_test):\n    for feat in features:\n        for feat2 in features:\n            for feat3 in features:\n                important = ['tumour_size', 'breast_quad', 'size_agg']\n                important.append(feat)\n                important.append(feat2)\n                important.append(feat3)\n                X_train_n = X_train[important]\n                X_test_n = X_test[important]\n                model(X_train_n, X_test_n, y_train, y_test, feat, feat2, feat3)\n\n"

In [52]:
#mix(features, X_train, X_test, y_train, y_test)

In [53]:
important_features = ['tumour_size', 'breast_quad', 'size_agg']
X_train = X_train[important_features]
X_test = X_test[important_features]

In [54]:
def mix(X_train, y_train):
    k = 1
    while k < 68:
        print(k)
        smote = SMOTE(random_state=42,  k_neighbors=k)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
        model = DecisionTreeClassifier()
        model.fit(X_resampled, y_resampled)
        y_score = model.predict_proba(X_test)[:, 1]
        y_pred = model.predict(X_test)
        acc = model.score(X_train, y_train)
        f1 = f1_score(y_test, y_pred, average='binary')
        roc = roc_auc_score(y_test, y_score)
        rec = recall_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred)        
        if f1 > 0.58:
            print('Acc:',acc)
            print('F1:',f1)
            print('ROC-AUC:',roc)
            print('Recc:',rec)
            print('Prec:',pre)  
        k += 1




In [55]:
mix(X_train, y_train)

1
Acc: 0.8070175438596491
F1: 0.6315789473684211
ROC-AUC: 0.7330882352941176
Recc: 0.7058823529411765
Prec: 0.5714285714285714
2
3
Acc: 0.8201754385964912
F1: 0.5882352941176471
ROC-AUC: 0.7264705882352941
Recc: 0.5882352941176471
Prec: 0.5882352941176471
4
5
6
7
Acc: 0.8201754385964912
F1: 0.6060606060606061
ROC-AUC: 0.7404411764705883
Recc: 0.5882352941176471
Prec: 0.625
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
