In [None]:
# importing libraries

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from gmdhpy.gmdh import Regressor as model
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import joblib
from sklearn import preprocessing

In [None]:
# Load the three datasets, encode the 'Class' labels as 0/1 and min-max scale them.

data_wbcd = pd.read_excel('Data/wbcd.xlsx')
data_wdbc = pd.read_excel('Data/wdbc.xlsx')
data_wpbc = pd.read_excel('Data/wpbc.xlsx')

# Recode the textual labels in place: 'benign' -> 0, 'malignant' -> 1.
for _frame in (data_wbcd, data_wdbc, data_wpbc):
    for _label, _code in (('benign', 0), ('malignant', 1)):
        _frame.loc[_frame['Class'] == _label, 'Class'] = _code

# One scaler object, refit from scratch on each dataset in turn (wbcd, wdbc, wpbc).
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# For each dataset: drop the first column, scale every remaining column
# (including 'Class') to [0, 1], and rebuild a DataFrame with the kept column names.
# NOTE(review): the dropped first column is presumably a sample identifier — confirm.
_kept_wbcd = data_wbcd.iloc[:, 1:]
names_data_wbcd = _kept_wbcd.columns
new_dataframe_wbcd = scaler.fit_transform(_kept_wbcd)
data_wbcd = pd.DataFrame(new_dataframe_wbcd, columns=names_data_wbcd)

_kept_wdbc = data_wdbc.iloc[:, 1:]
names_data_wdbc = _kept_wdbc.columns
new_dataframe_wdbc = scaler.fit_transform(_kept_wdbc)
data_wdbc = pd.DataFrame(new_dataframe_wdbc, columns=names_data_wdbc)

_kept_wpbc = data_wpbc.iloc[:, 1:]
names_data_wpbc = _kept_wpbc.columns
new_dataframe_wpbc = scaler.fit_transform(_kept_wpbc)
data_wpbc = pd.DataFrame(new_dataframe_wpbc, columns=names_data_wpbc)

In [None]:
# Feature selection: fit a logistic-regression-driven selector on each dataset.


def _fit_selector(features, target):
    """Fit a SelectFromModel wrapper around a default LogisticRegression.

    Returns whatever ``SelectFromModel.fit`` returns (the fitted selector).
    """
    return SelectFromModel(estimator=LogisticRegression()).fit(features, target)


# Predictors are every column after the first; the target is the 'Class' column.
# NOTE(review): this only excludes the label from the predictors if 'Class' is the
# first column of each normalized frame — TODO confirm against the workbooks.
x_wbcd, y_wbcd = data_wbcd.iloc[:, 1:], data_wbcd['Class']
x_wdbc, y_wdbc = data_wdbc.iloc[:, 1:], data_wdbc['Class']
x_wpbc, y_wpbc = data_wpbc.iloc[:, 1:], data_wpbc['Class']

selector_wbcd = _fit_selector(x_wbcd, y_wbcd)
#pd.DataFrame(selector_wbcd.estimator_.coef_).to_excel('Outputs/feature selection/features_wbcd.xlsx')

selector_wdbc = _fit_selector(x_wdbc, y_wdbc)
#pd.DataFrame(selector_wdbc.estimator_.coef_).to_excel('Outputs/feature selection/features_wdbc.xlsx')

selector_wpbc = _fit_selector(x_wpbc, y_wpbc)
#pd.DataFrame(selector_wpbc.estimator_.coef_).to_excel('Outputs/feature selection/features_wpbc.xlsx')

In [None]:
# creating train and test samples

# Fixed seed so the 80/20 splits — and every metric computed from them — are
# identical after a kernel restart instead of changing on each run.
SPLIT_SEED = 42

# Positional indices (within the normalized frames) of the predictor columns used
# for each dataset. Defined once so the train and test matrices cannot drift apart.
# NOTE(review): presumably derived from the feature-selection step — confirm.
FEATURE_COLS_WBCD = [6, 1, 7, 3, 8]
FEATURE_COLS_WDBC = [28, 21, 22, 23, 8, 1, 3, 24, 2, 4, 25, 7, 27, 29, 11]
FEATURE_COLS_WPBC = [26, 33, 22, 32, 24, 6, 25, 14, 17, 21, 12, 31, 20, 29, 28]

train_wbcd, test_wbcd = train_test_split(data_wbcd, test_size=0.2, random_state=SPLIT_SEED)
train_wdbc, test_wdbc = train_test_split(data_wdbc, test_size=0.2, random_state=SPLIT_SEED)
train_wpbc, test_wpbc = train_test_split(data_wpbc, test_size=0.2, random_state=SPLIT_SEED)

# Predictor matrices and 'Class' targets as plain NumPy arrays.
x_train_wbcd = train_wbcd.iloc[:, FEATURE_COLS_WBCD].values
y_train_wbcd = train_wbcd['Class'].values
x_test_wbcd = test_wbcd.iloc[:, FEATURE_COLS_WBCD].values
y_test_wbcd = test_wbcd['Class'].values

x_train_wdbc = train_wdbc.iloc[:, FEATURE_COLS_WDBC].values
y_train_wdbc = train_wdbc['Class'].values
x_test_wdbc = test_wdbc.iloc[:, FEATURE_COLS_WDBC].values
y_test_wdbc = test_wdbc['Class'].values

x_train_wpbc = train_wpbc.iloc[:, FEATURE_COLS_WPBC].values
y_train_wpbc = train_wpbc['Class'].values
x_test_wpbc = test_wpbc.iloc[:, FEATURE_COLS_WPBC].values
y_test_wpbc = test_wpbc['Class'].values

In [None]:
# Classification: train one GMDH regressor per dataset on its training sample.


def _train_gmdh(features, target):
    """Fit a fresh gmdhpy ``Regressor`` with default settings.

    Returns the result of ``fit``, which the later cells use as the trained model.
    """
    return model().fit(features, target)


model_wbcd = _train_gmdh(x_train_wbcd, y_train_wbcd)
#joblib.dump(model_wbcd, 'Outputs/models/model_wbcd.sav')

model_wdbc = _train_gmdh(x_train_wdbc, y_train_wdbc)
#joblib.dump(model_wdbc, 'Outputs/models/model_wdbc.sav')

model_wpbc = _train_gmdh(x_train_wpbc, y_train_wpbc)
#joblib.dump(model_wpbc, 'Outputs/models/model_wpbc.sav')

In [None]:
# validation with train data

# Keep the raw regressor outputs as continuous scores; hard class labels are
# obtained by thresholding those scores at 0.5.
y_score_wbcd = model_wbcd.predict(x_train_wbcd)
y_score_wdbc = model_wdbc.predict(x_train_wdbc)
y_score_wpbc = model_wpbc.predict(x_train_wpbc)

y_pred_wbcd = (y_score_wbcd >= 0.5).astype(int)
y_pred_wdbc = (y_score_wdbc >= 0.5).astype(int)
y_pred_wpbc = (y_score_wpbc >= 0.5).astype(int)

# AUC is a ranking metric, so it is computed from the continuous scores.
# Feeding it the thresholded 0/1 labels would collapse the ROC curve to a single
# operating point. Precision, recall and F1 need hard labels and use y_pred_*.
AUC_wbcd = roc_auc_score(y_train_wbcd, y_score_wbcd)
precision_wbcd = precision_score(y_train_wbcd, y_pred_wbcd)
recall_wbcd = recall_score(y_train_wbcd, y_pred_wbcd)
f1_wbcd = f1_score(y_train_wbcd, y_pred_wbcd)

AUC_wdbc = roc_auc_score(y_train_wdbc, y_score_wdbc)
precision_wdbc = precision_score(y_train_wdbc, y_pred_wdbc)
recall_wdbc = recall_score(y_train_wdbc, y_pred_wdbc)
f1_wdbc = f1_score(y_train_wdbc, y_pred_wdbc)

AUC_wpbc = roc_auc_score(y_train_wpbc, y_score_wpbc)
precision_wpbc = precision_score(y_train_wpbc, y_pred_wpbc)
recall_wpbc = recall_score(y_train_wpbc, y_pred_wpbc)
f1_wpbc = f1_score(y_train_wpbc, y_pred_wpbc)

# One row per dataset, labelled so the table is readable on its own.
validation_result = pd.DataFrame(
    [
        [AUC_wbcd, precision_wbcd, recall_wbcd, f1_wbcd],
        [AUC_wdbc, precision_wdbc, recall_wdbc, f1_wdbc],
        [AUC_wpbc, precision_wpbc, recall_wpbc, f1_wpbc],
    ],
    index=['wbcd', 'wdbc', 'wpbc'],
    columns=['AUC', 'Precision', 'Recall', 'F1 score'],
)

#validation_result.to_excel('Outputs/validation results/validation train.xlsx')

# sklearn's confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns, in sorted label order: 0 (benign, negative) first,
# then 1 (malignant, positive).
CM_ROW_LABELS = ['Actual negative', 'Actual positive']
CM_COL_LABELS = ['Predicted negative', 'Predicted positive']

confusion_matrix_wbcd = pd.DataFrame(
    confusion_matrix(y_train_wbcd, y_pred_wbcd),
    index=CM_ROW_LABELS,
    columns=CM_COL_LABELS,
)
#confusion_matrix_wbcd.to_excel('Outputs/validation results/confusion matrix wbcd train.xlsx')

confusion_matrix_wdbc = pd.DataFrame(
    confusion_matrix(y_train_wdbc, y_pred_wdbc),
    index=CM_ROW_LABELS,
    columns=CM_COL_LABELS,
)
#confusion_matrix_wdbc.to_excel('Outputs/validation results/confusion matrix wdbc train.xlsx')

confusion_matrix_wpbc = pd.DataFrame(
    confusion_matrix(y_train_wpbc, y_pred_wpbc),
    index=CM_ROW_LABELS,
    columns=CM_COL_LABELS,
)
#confusion_matrix_wpbc.to_excel('Outputs/validation results/confusion matrix wpbc train.xlsx')

# Show the last confusion matrix as the cell output.
confusion_matrix_wpbc

In [None]:
# validation with test data

# Keep the raw regressor outputs as continuous scores; hard class labels are
# obtained by thresholding those scores at 0.5.
y_score_wbcd = model_wbcd.predict(x_test_wbcd)
y_score_wdbc = model_wdbc.predict(x_test_wdbc)
y_score_wpbc = model_wpbc.predict(x_test_wpbc)

y_pred_wbcd = (y_score_wbcd >= 0.5).astype(int)
y_pred_wdbc = (y_score_wdbc >= 0.5).astype(int)
y_pred_wpbc = (y_score_wpbc >= 0.5).astype(int)

# AUC is a ranking metric, so it is computed from the continuous scores.
# Feeding it the thresholded 0/1 labels would collapse the ROC curve to a single
# operating point. Precision, recall and F1 need hard labels and use y_pred_*.
AUC_wbcd = roc_auc_score(y_test_wbcd, y_score_wbcd)
precision_wbcd = precision_score(y_test_wbcd, y_pred_wbcd)
recall_wbcd = recall_score(y_test_wbcd, y_pred_wbcd)
f1_wbcd = f1_score(y_test_wbcd, y_pred_wbcd)

AUC_wdbc = roc_auc_score(y_test_wdbc, y_score_wdbc)
precision_wdbc = precision_score(y_test_wdbc, y_pred_wdbc)
recall_wdbc = recall_score(y_test_wdbc, y_pred_wdbc)
f1_wdbc = f1_score(y_test_wdbc, y_pred_wdbc)

AUC_wpbc = roc_auc_score(y_test_wpbc, y_score_wpbc)
precision_wpbc = precision_score(y_test_wpbc, y_pred_wpbc)
recall_wpbc = recall_score(y_test_wpbc, y_pred_wpbc)
f1_wpbc = f1_score(y_test_wpbc, y_pred_wpbc)

# One row per dataset. The labels are passed to the constructor because
# DataFrame.set_index returns a new frame, so calling it without assigning the
# result leaves the table unlabelled.
validation_result = pd.DataFrame(
    [
        [AUC_wbcd, precision_wbcd, recall_wbcd, f1_wbcd],
        [AUC_wdbc, precision_wdbc, recall_wdbc, f1_wdbc],
        [AUC_wpbc, precision_wpbc, recall_wpbc, f1_wpbc],
    ],
    index=['wbcd', 'wdbc', 'wpbc'],
    columns=['AUC', 'Precision', 'Recall', 'F1 score'],
)

#validation_result.to_excel('Outputs/validation results/validation test.xlsx')

# sklearn's confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns, in sorted label order: 0 (benign, negative) first,
# then 1 (malignant, positive).
CM_ROW_LABELS = ['Actual negative', 'Actual positive']
CM_COL_LABELS = ['Predicted negative', 'Predicted positive']

confusion_matrix_wbcd = pd.DataFrame(
    confusion_matrix(y_test_wbcd, y_pred_wbcd),
    index=CM_ROW_LABELS,
    columns=CM_COL_LABELS,
)
#confusion_matrix_wbcd.to_excel('Outputs/validation results/confusion matrix wbcd test.xlsx')

confusion_matrix_wdbc = pd.DataFrame(
    confusion_matrix(y_test_wdbc, y_pred_wdbc),
    index=CM_ROW_LABELS,
    columns=CM_COL_LABELS,
)
#confusion_matrix_wdbc.to_excel('Outputs/validation results/confusion matrix wdbc test.xlsx')

confusion_matrix_wpbc = pd.DataFrame(
    confusion_matrix(y_test_wpbc, y_pred_wpbc),
    index=CM_ROW_LABELS,
    columns=CM_COL_LABELS,
)
#confusion_matrix_wpbc.to_excel('Outputs/validation results/confusion matrix wpbc test.xlsx')

# Show the last confusion matrix as the cell output.
confusion_matrix_wpbc