In [8]:
# Dependencies
# Data Processing
import pandas as pd
import numpy as np
import pickle

# Modelling
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [9]:
# Load the dataset
with open('wdbc.pkl', 'rb') as file:  # read binary
    data = pickle.load(file)

df = pd.DataFrame(data)

In [10]:
# Extract features and labels
features = df.iloc[:, 2:]
label = df['malignant']# to be predicted


In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)


In [12]:
# Rule-based classifier
def rule_based_classifier(data, threshold_size, threshold_shape, threshold_texture, threshold_concavity, threshold_homogeneity, threshold_perimeter):
    # Rules
    # 1) significantly larger cells
    abnormal_size = data['radius_0'] > threshold_size
    # 2) arbitrary structures
    abnormal_shape = data['concavity_0'] > threshold_shape
    # 3) colour of cell nucleus
    abnormal_texture = data['texture_0'] > threshold_texture
    # 4) lumpy texture
    abnormal_concavity = data['concave points_0'] > threshold_concavity
    # 5) smooth texture
    abnormal_homogeneity = data['fractal dimension_0'] < threshold_homogeneity 
    # 6) spread out
    abnormal_width = data['smoothness_0'] > threshold_perimeter

    
    # Apply rules with a boolean mask to identify instances where the diagnosis is malignant
    malignant_mask = abnormal_size | abnormal_shape | abnormal_texture | abnormal_concavity | abnormal_homogeneity | abnormal_width
    
    # Assign diagnosis based on rules
    # 1) Initialize all instances as benign
    diagnosis = pd.Series('benign', index=data.index)
    # 2) When malignant_mask is true, diagnosis updated to malignant
    diagnosis[malignant_mask] = 'malignant'
    
    return diagnosis

In [13]:
# Set threshold values (you may need to adjust these based on your data)
threshold_size = df['radius_0'].mean()
threshold_shape = df['concavity_0'].mean()
threshold_texture = df['texture_0'].mean()
threshold_concavity = df['concave points_0'].mean()
threshold_homogeneity = df['fractal dimension_0'].mean()
threshold_perimeter = df['smoothness_0'].mean()

In [14]:
# Apply the rule-based classifier on the training set
train_predictions = rule_based_classifier(features, threshold_size, threshold_shape, threshold_texture, threshold_concavity, threshold_homogeneity, threshold_perimeter)

# Convert string labels to numerical labels
train_predictions_numeric = train_predictions.map({'benign': 0, 'malignant': 1})

# Evaluate the classifier on the training set
train_accuracy = accuracy_score(label, train_predictions_numeric)
train_precision = precision_score(label, train_predictions_numeric)
train_recall = recall_score(label, train_predictions_numeric)
train_f1 = f1_score(label, train_predictions_numeric)

# Display evaluation metrics for the training set
print("Training Set Metrics:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"F1-Score: {train_f1:.2f}")

# Apply the rule-based classifier on the test set
test_predictions = rule_based_classifier(X_test, threshold_size, threshold_shape, threshold_texture, threshold_concavity, threshold_homogeneity, threshold_perimeter)

# Convert string labels to numerical labels
test_predictions_numeric = test_predictions.map({'benign': 0, 'malignant': 1})

# Evaluate the classifier on the test set
test_accuracy = accuracy_score(y_test, test_predictions_numeric)
test_precision = precision_score(y_test, test_predictions_numeric)
test_recall = recall_score(y_test, test_predictions_numeric)
test_f1 = f1_score(y_test, test_predictions_numeric)

# Display evaluation metrics for the test set
print("\nTest Set Metrics:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1-Score: {test_f1:.2f}")


Training Set Metrics:
Accuracy: 0.41
Precision: 0.39
Recall: 1.00
F1-Score: 0.56

Test Set Metrics:
Accuracy: 0.39
Precision: 0.38
Recall: 1.00
F1-Score: 0.55
