In [100]:
# import libraries
import pandas as pd
pd.set_option('display.max_colwidth', None) #setting max colwidth to view the entire dataset when using the print() command
import matplotlib.pyplot as plt
import numpy as np


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from IPython.display import display

# Load the raw dataset and its description file as text.
from io import StringIO

# Context managers close both file handles as soon as their text is in memory.
with open("./dataset/breast-cancer.data") as data_file:
    data = data_file.read()
with open("./dataset/breast-cancer.names") as names_file:
    feat = names_file.read()

# Blank out the '?' placeholders so read_csv parses those fields as NaN.
data = data.replace('?','')

# Convert the text to a DataFrame. Column names are assigned by hand after
# loading, so the first line is parsed as a record (header=None) instead of
# being consumed as a header row and silently dropped.
data = pd.read_csv(StringIO(data), sep=",", header=None)

# Name the columns in the order they appear in the file.
data.columns = ['class', 'age', 'menopause', 'tumour_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irrad']

# Ordinal encoding table: every label is replaced by its position in the list.
# 'deg_malig' is not listed, so it is left exactly as loaded.
category_levels = {
    'class': ['no-recurrence-events', 'recurrence-events'],
    'age': ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79'],
    'menopause': ['premeno', 'ge40', 'lt40'],
    'tumour_size': ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54'],
    'inv_nodes': ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26'],
    'node_caps': ['no', 'yes'],
    'breast': ['left', 'right'],
    'breast_quad': ['left_low', 'left_up', 'right_up', 'right_low', 'central'],
    'irrad': ['no', 'yes'],
}
for column_name, levels in category_levels.items():
    data[column_name] = data[column_name].replace(levels, list(range(len(levels))))

# Missing entries get a code outside the range used by the real categories.
missing_codes = {'node_caps': 5, 'breast_quad': 8}
for column_name, code in missing_codes.items():
    data[column_name] = data[column_name].fillna(code)

from sklearn.model_selection import train_test_split


def _add_aggregate_features(frame):
    """Return a new frame with the 'size_agg' and 'breast_tumour' columns appended.

    'size_agg' is tumour_size + deg_malig and 'breast_tumour' is
    breast_quad + inv_nodes. The input frame is not modified.
    """
    return frame.assign(
        size_agg=frame['tumour_size'] + frame['deg_malig'],
        breast_tumour=frame['breast_quad'] + frame['inv_nodes'],
    )


# Separate the predictors from the target column.
X = data.drop(['class'], axis = 1)
y = data['class']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Build the aggregate columns on fresh frames: writing columns directly into
# the objects returned by train_test_split can raise SettingWithCopyWarning,
# and one helper keeps the train and test definitions identical.
X_train = _add_aggregate_features(X_train)
X_test = _add_aggregate_features(X_test)

In [101]:
# Columns that may be summed into the extra 'meta' feature by mix().
features = [
    'age',
    'menopause',
    'tumour_size',
    'inv_nodes',
    'node_caps',
    'deg_malig',
    'breast',
    'breast_quad',
    'irrad',
]

def mix(X_train, y_train, X_test, y_test, features, threshold=0.68, random_state=42):
    """Search sums of four feature columns for a useful extra ``meta`` feature.

    For every way of picking four entries of ``features`` (repeats allowed,
    order ignored) the picked columns are added into a ``meta`` column. A
    random forest is then fitted on ``['size_agg', 'breast_tumour', 'meta']``
    after random oversampling of the training rows, and scored on the test
    rows. Picks whose F1 score is at least ``threshold`` are printed together
    with their ROC-AUC, recall and precision.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain 'size_agg', 'breast_tumour' and every column named in
        ``features``. Neither frame is modified.
    y_train, y_test : array-like
        Labels for the corresponding rows (scored with ``average='binary'``).
    features : iterable of str
        Candidate column names for the sum.
    threshold : float, default 0.68
        Minimum F1 score for a pick to be printed.
    random_state : int, default 42
        Seed for both the oversampler and the forest, so repeated runs print
        the same picks.

    Returns
    -------
    None
        Results are reported with ``print`` only.

    Notes
    -----
    Picks are selected by their score on ``X_test``, so the printed scores
    are optimistic estimates of performance on unseen data.
    """
    from itertools import combinations_with_replacement

    base_features = ['size_agg', 'breast_tumour']

    # Addition does not depend on operand order, and the model is seeded, so
    # every ordering of the same four features would give the same result.
    # Each unordered pick is therefore evaluated exactly once.
    for feat, feat2, feat3, feat4 in combinations_with_replacement(features, 4):
        # Build fresh frames so no 'meta' column is written into the caller's data.
        X_newtrain = X_train[base_features].assign(
            meta=X_train[feat2] + X_train[feat] + X_train[feat3] + X_train[feat4]
        )
        X_newtest = X_test[base_features].assign(
            meta=X_test[feat2] + X_test[feat] + X_test[feat3] + X_test[feat4]
        )

        # Balance the classes in the training rows only.
        ros = RandomOverSampler(random_state=random_state)
        X_resampled, y_resampled = ros.fit_resample(X_newtrain, y_train)

        model = RandomForestClassifier(random_state=random_state)
        model.fit(X_resampled, y_resampled)

        y_pred = model.predict(X_newtest)
        f1 = f1_score(y_test, y_pred, average='binary')

        if f1 >= threshold:
            # The remaining metrics are only needed for picks that get reported.
            y_score = model.predict_proba(X_newtest)[:, 1]
            roc = roc_auc_score(y_test, y_score)
            rec = recall_score(y_test, y_pred)
            pre = precision_score(y_test, y_pred)
            print('YES WITH:', feat, feat2, feat3, feat4)
            print('F1:',f1)
            print('ROC-AUC:',roc)
            print('Recc:',rec)
            print('Prec:',pre)
    

In [102]:
# Run the search over the candidate columns in `features`; mix() prints each
# feature combination it accepts together with its scores.
mix(X_train, y_train, X_test, y_test, features)