In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
#loading training and testing sets
projectDirPath = os.path.abspath("")

X_train = pd.read_csv(projectDirPath + "\\ready data\\X_train.csv").values
X_test = pd.read_csv(projectDirPath + "\\ready data\\X_test.csv").values
y_train = pd.read_csv(projectDirPath + "\\ready data\\y_train.csv").values.reshape(-1,)
y_test = pd.read_csv(projectDirPath + "\\ready data\\y_test.csv").values.reshape(-1,)

In [3]:
#training the model with linear border
from sklearn.linear_model import LogisticRegression

logisticRegression = LogisticRegression()
logisticRegression.fit(X_train, y_train)

LogisticRegression()

In [4]:
#applying transformation on features to achive non-linear decision border
from sklearn.preprocessing import PolynomialFeatures

transformation = PolynomialFeatures(degree = 2)
X_train_t = transformation.fit_transform(X_train)
X_test_t = transformation.fit_transform(X_test)

#training new model on transformed features
polyLogisticRegression = LogisticRegression(max_iter = 1800)
polyLogisticRegression.fit(X_train_t, y_train)

LogisticRegression(max_iter=1800)

In [5]:
#validation of model with linear border using k-cross validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, recall_score

accuracies = cross_val_score(estimator = logisticRegression, X = X_train, y = y_train, cv = 10)

recall_scorer =  make_scorer(recall_score, pos_label = 4)
recalls = cross_val_score(estimator = logisticRegression, X = X_train, y = y_train, scoring = recall_scorer, cv = 10)

accMean = accuracies.mean()
accStdDev = accuracies.std()
recMean = recalls.mean()
recStdDev = recalls.std()

print("mean of 10 accuracies: ", accMean)
print("standard deviation of accuracies: ", recStdDev)
print("mean of 10 recalls: ", accMean)
print("standard deviation of recalls: ", recStdDev)

mean of 10 accuracies:  0.9674891146589258
standard deviation of accuracies:  0.045369716056529404
mean of 10 recalls:  0.9674891146589258
standard deviation of recalls:  0.045369716056529404


In [6]:
#validation of model with non-linear border using k-cross validation
accuraciesP = cross_val_score(estimator = polyLogisticRegression, X = X_train_t, y = y_train, cv = 10)

recallsP = cross_val_score(estimator = polyLogisticRegression, X = X_train_t, y = y_train, scoring = recall_scorer, cv = 10)

accMeanP = accuraciesP.mean()
accStdDevP = accuraciesP.std()
recMeanP = recalls.mean()
recStdDevP = recallsP.std()

print("mean of 10 accuracies: ", accMeanP)
print("standard deviation of accuracies: ", recStdDevP)
print("mean of 10 recalls: ", accMeanP)
print("standard deviation of recalls: ", recStdDevP)

mean of 10 accuracies:  0.9425979680696661
standard deviation of accuracies:  0.07587986902175793
mean of 10 recalls:  0.9425979680696661
standard deviation of recalls:  0.07587986902175793


In [29]:
#testing of model with linear border on the test set and computing : accuracy, recall and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = logisticRegression.predict(X_test)
y_pred_p = polyLogisticRegression.predict(X_test_t)

acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred, pos_label = 4)
cm = confusion_matrix(y_test, y_pred)


print("accuracy on the test set: ", acc)
print("recall on the test set: ", rec)
print("confusion matrix:\n ", cm)

accuracy on the test set:  0.9657142857142857
recall on the test set:  0.9310344827586207
confusion matrix:
  [[115   2]
 [  4  54]]


In [30]:
#testing of model with non-linear border on the test set and computing : accuracy, recall and confusion matrix
y_pred_p = polyLogisticRegression.predict(X_test_t)

accP = accuracy_score(y_test, y_pred_p)
recP = recall_score(y_test, y_pred_p, pos_label = 4)
cmP = confusion_matrix(y_test, y_pred_p)


print("accuracy on the test set: ", accP)
print("recall on the test set: ", recP)
print("confusion matrix:\n ", cmP)

accuracy on the test set:  0.9371428571428572
recall on the test set:  0.8793103448275862
confusion matrix:
  [[113   4]
 [  7  51]]


In [31]:
#saving the valid model based on higher recall
from joblib import dump

dump(logisticRegression, projectDirPath + "\\models\\logisticRegression.joblib")

['C:\\Users\\misla\\Desktop\\breast cancer tumor classification\\models\\logisticRegression.joblib']

In [33]:
#saving evaluation data for all tried out models
import json

#evaluation data for all the used models (will be used in streamlit app)
linearData = {"mean10Acc" : accMean, "accStd" : accStdDev, "mean10Rec" : recMean, "recStd" : recStdDev, "acc": acc, "rec": rec}
polyData = {"mean10Acc" : accMeanP, "accStd" : accStdDevP, "mean10Rec" : recMeanP, "recStd" : recStdDevP, "acc": accP, "rec": recP}
linearCm = {"tn" : int(cm[0, 0]), "fn" : int(cm[1, 0]), "tp" : int(cm[1, 1]), "fp" : int(cm[0, 1])}
polyCm = {"tn" : int(cmP[0, 0]), "fn" : int(cmP[1, 0]), "tp" : int(cmP[1, 1]), "fp" : int(cmP[0, 1])}

with open(projectDirPath + "\\modelsData\\logisticRegression.json", "w") as file:
    json.dump(linearData, file)

with open(projectDirPath + "\\modelsData\\logisticRegressionPoly.json", "w") as file:
    json.dump(polyData, file)

with open(projectDirPath + "\\modelsData\\logisticRegressionCm.json", "w") as file:
    json.dump(linearCm, file)
    
with open(projectDirPath + "\\modelsData\\logisticRegressionPolyCm.json", "w") as file:
    json.dump(polyCm, file)