In [None]:
import mysklearn.mypytable
import importlib
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier, MyRandomForestClassifier, MyDecisionTreeClassifier, MyDummyClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import matplotlib.pyplot
from collections import Counter

: 

### Load and Prepare Data

In [None]:
cancer_table = MyPyTable()
cancer_table.load_from_file('input_data/cancer_augmented.csv')

X = []
y = []

all_features = []
for row in cancer_table.data:
    if row[1] in ['M', 'B']:
        feature_values = [float(row[i]) for i in range(2, 32)]  # 30 features
        all_features.append(feature_values)

# quartiles for each feature
import numpy as np
quartiles = []
for feature_idx in range(30):
    feature_column = [row[feature_idx] for row in all_features]
    q1 = np.percentile(feature_column, 25)
    q2 = np.percentile(feature_column, 50)
    q3 = np.percentile(feature_column, 75)
    quartiles.append((q1, q2, q3))

# discretize features based on quartiles
for row in cancer_table.data:
    if row[1] in ['M', 'B']:
        features = []
        for feature_idx in range(30):
            value = float(row[feature_idx + 2])  
            q1, q2, q3 = quartiles[feature_idx]

            # 4 bins based on quartiles
            if value <= q1:
                features.append(f"f{feature_idx}_low")
            elif value <= q2:
                features.append(f"f{feature_idx}_med_low")
            elif value <= q3:
                features.append(f"f{feature_idx}_med_high")
            else:
                features.append(f"f{feature_idx}_high")

        X.append(features)
        y.append(row[1])  # M or B

print(f"Loaded {len(X)} instances from cancer dataset")
print(f"Sample X: {X[0]}, y: {y[0]}")
print(f"Sample X: {X[1]}, y: {y[1]}")

Loaded 1000 instances from cancer dataset
Sample X: ['f0_high', 'f1_low', 'f2_high', 'f3_high', 'f4_high', 'f5_high', 'f6_high', 'f7_high', 'f8_high', 'f9_high', 'f10_high', 'f11_med_low', 'f12_high', 'f13_high', 'f14_med_high', 'f15_high', 'f16_high', 'f17_high', 'f18_high', 'f19_high', 'f20_high', 'f21_low', 'f22_high', 'f23_high', 'f24_high', 'f25_high', 'f26_high', 'f27_high', 'f28_high', 'f29_high'], y: M
Sample X: ['f0_high', 'f1_med_low', 'f2_high', 'f3_high', 'f4_low', 'f5_med_low', 'f6_med_high', 'f7_med_high', 'f8_med_high', 'f9_low', 'f10_med_high', 'f11_low', 'f12_med_high', 'f13_high', 'f14_low', 'f15_low', 'f16_med_low', 'f17_med_high', 'f18_low', 'f19_med_high', 'f20_high', 'f21_med_low', 'f22_high', 'f23_high', 'f24_med_low', 'f25_med_low', 'f26_med_low', 'f27_high', 'f28_med_low', 'f29_med_high'], y: M


### Visualizing Class Distriubtion

In [None]:
class_counts = Counter(y)
labels = ['Benign (B)', 'Malignant (M)']
sizes = [class_counts['B'], class_counts['M']]

# Pie chart distribution
plt.figure(figsize=(6,6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title("Distribution of Tumor Types")
plt.show()

### K-fold

In [None]:
folds = myevaluation.kfold_split(X, n_splits=10, random_state=0, shuffle=True)

### Naive Bayes

In [None]:
nb_all_y_true = []
nb_all_y_pred = []

print("Running Naive Bayes with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    nb_clf = MyNaiveBayesClassifier()
    nb_clf.fit(X_train, y_train)

    y_pred = nb_clf.predict(X_test)

    nb_all_y_true.extend(y_test)
    nb_all_y_pred.extend(y_pred)

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

nb_accuracy = myevaluation.accuracy_score(nb_all_y_true, nb_all_y_pred)
nb_error_rate = 1 - nb_accuracy


print(f"\nOverall Accuracy: {nb_accuracy:.4f}")
print(f"Overall Error Rate: {nb_error_rate:.4f}")

labels = ["B", "M"]

nb_precision = myevaluation.binary_precision_score(nb_all_y_true, nb_all_y_pred,
                                                     labels=labels, pos_label="M")
nb_recall = myevaluation.binary_recall_score(nb_all_y_true, nb_all_y_pred,
                                               labels=labels, pos_label="M")
nb_f1 = myevaluation.binary_f1_score(nb_all_y_true, nb_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {nb_precision:.4f}")
print(f"Recall (Malignant): {nb_recall:.4f}")
print(f"F1 Score (Malignant): {nb_f1:.4f}")

Running Naive Bayes with 10-fold cross-validation...
Fold 1: Accuracy = 0.9600
Fold 2: Accuracy = 0.9400
Fold 3: Accuracy = 0.9100
Fold 4: Accuracy = 0.9500
Fold 5: Accuracy = 0.9400
Fold 6: Accuracy = 0.9200
Fold 7: Accuracy = 0.9500
Fold 8: Accuracy = 0.9000
Fold 9: Accuracy = 0.9400
Fold 10: Accuracy = 0.9500

Overall Accuracy: 0.9360
Overall Error Rate: 0.0640

Precision (Malignant): 0.9504
Recall (Malignant): 0.9200
F1 Score (Malignant): 0.9350


### Random Forest

In [None]:
rf_all_y_true = []
rf_all_y_pred = []

print("Running Random Forest with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    rf_clf = MyRandomForestClassifier(n_trees=20, m_trees=7, f_attributes=10, random_state=0)
    rf_clf.fit(X_train, y_train)

    y_pred = rf_clf.predict(X_test)

    rf_all_y_true.extend(y_test)
    rf_all_y_pred.extend(y_pred)

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

rf_accuracy = myevaluation.accuracy_score(rf_all_y_true, rf_all_y_pred)
rf_error_rate = 1 - rf_accuracy

print(f"\nOverall Accuracy: {rf_accuracy:.4f}")
print(f"Overall Error Rate: {rf_error_rate:.4f}")

rf_precision = myevaluation.binary_precision_score(rf_all_y_true, rf_all_y_pred,
                                                     labels=labels, pos_label="M")
rf_recall = myevaluation.binary_recall_score(rf_all_y_true, rf_all_y_pred,
                                               labels=labels, pos_label="M")
rf_f1 = myevaluation.binary_f1_score(rf_all_y_true, rf_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {rf_precision:.4f}")
print(f"Recall (Malignant): {rf_recall:.4f}")
print(f"F1 Score (Malignant): {rf_f1:.4f}")

Running Random Forest with 10-fold cross-validation...
Fold 1: Accuracy = 0.9800
Fold 2: Accuracy = 0.9800
Fold 3: Accuracy = 0.9200
Fold 4: Accuracy = 0.9400
Fold 5: Accuracy = 0.9800
Fold 6: Accuracy = 0.9700
Fold 7: Accuracy = 0.9800
Fold 8: Accuracy = 0.9600
Fold 9: Accuracy = 0.9900
Fold 10: Accuracy = 0.9600

Overall Accuracy: 0.9660
Overall Error Rate: 0.0340

Precision (Malignant): 0.9755
Recall (Malignant): 0.9560
F1 Score (Malignant): 0.9657


### Decision Tree

In [None]:
dt_all_y_true = []
dt_all_y_pred = []

print("Running Decision Tree with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    dt_clf = MyDecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)

    y_pred = dt_clf.predict(X_test)

    dt_all_y_true.extend(y_test)
    dt_all_y_pred.extend(y_pred)

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

# Calculate overall metrics for Decision Tree
dt_accuracy = myevaluation.accuracy_score(dt_all_y_true, dt_all_y_pred)
dt_error_rate = 1 - dt_accuracy

print(f"\nOverall Accuracy: {dt_accuracy:.4f}")
print(f"Overall Error Rate: {dt_error_rate:.4f}")

dt_precision = myevaluation.binary_precision_score(dt_all_y_true, dt_all_y_pred,
                                                     labels=labels, pos_label="M")
dt_recall = myevaluation.binary_recall_score(dt_all_y_true, dt_all_y_pred,
                                               labels=labels, pos_label="M")
dt_f1 = myevaluation.binary_f1_score(dt_all_y_true, dt_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {dt_precision:.4f}")
print(f"Recall (Malignant): {dt_recall:.4f}")
print(f"F1 Score (Malignant): {dt_f1:.4f}")

Running Decision Tree with 10-fold cross-validation...
Fold 1: Accuracy = 0.9800
Fold 2: Accuracy = 0.9700
Fold 3: Accuracy = 0.9400
Fold 4: Accuracy = 0.9200
Fold 5: Accuracy = 0.9400
Fold 6: Accuracy = 0.9900
Fold 7: Accuracy = 0.9600
Fold 8: Accuracy = 0.9300
Fold 9: Accuracy = 0.9600
Fold 10: Accuracy = 0.9600

Overall Accuracy: 0.9550
Overall Error Rate: 0.0450

Precision (Malignant): 0.9559
Recall (Malignant): 0.9540
F1 Score (Malignant): 0.9550


### Dummy Classifier

In [7]:
dummy_all_y_true = []
dummy_all_y_pred = []

print("Running Dummy Classifier with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    dummy_clf = MyDummyClassifier()
    dummy_clf.fit(X_train, y_train)

    y_pred = dummy_clf.predict(X_test)

    dummy_all_y_true.extend(y_test)
    dummy_all_y_pred.extend(y_pred)

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

# Calculate overall metrics for Dummy Classifier
dummy_accuracy = myevaluation.accuracy_score(dummy_all_y_true, dummy_all_y_pred)
dummy_error_rate = 1 - dummy_accuracy

print(f"\nOverall Accuracy: {dummy_accuracy:.4f}")
print(f"Overall Error Rate: {dummy_error_rate:.4f}")

dummy_precision = myevaluation.binary_precision_score(dummy_all_y_true, dummy_all_y_pred,
                                                     labels=labels, pos_label="M")
dummy_recall = myevaluation.binary_recall_score(dummy_all_y_true, dummy_all_y_pred,
                                               labels=labels, pos_label="M")
dummy_f1 = myevaluation.binary_f1_score(dummy_all_y_true, dummy_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {dummy_precision:.4f}")
print(f"Recall (Malignant): {dummy_recall:.4f}")
print(f"F1 Score (Malignant): {dummy_f1:.4f}")

Running Dummy Classifier with 10-fold cross-validation...
Fold 1: Accuracy = 0.4900
Fold 2: Accuracy = 0.4600
Fold 3: Accuracy = 0.4800
Fold 4: Accuracy = 0.4400
Fold 5: Accuracy = 0.3900
Fold 6: Accuracy = 0.4400
Fold 7: Accuracy = 0.4900
Fold 8: Accuracy = 0.5000
Fold 9: Accuracy = 0.4800
Fold 10: Accuracy = 0.4700

Overall Accuracy: 0.4640
Overall Error Rate: 0.5360

Precision (Malignant): 0.4550
Recall (Malignant): 0.3640
F1 Score (Malignant): 0.4044


### Confusion Matrix

In [8]:
dt_confusion_matrix = myevaluation.confusion_matrix(dt_all_y_true, dt_all_y_pred, labels)
dummy_confusion_matrix = myevaluation.confusion_matrix(dummy_all_y_true, dummy_all_y_pred, labels)

# Comparison Summary
print("\n\n" + "="*60)
print("CLASSIFIER COMPARISON SUMMARY")
print("="*60)
print(f"{'Metric':<25} {'Naive Bayes':>12} {'Random Forest':>14} {'Decision Tree':>14} {'Dummy':>12}")
print("-"*90)
print(f"{'Accuracy':<25} {nb_accuracy:>12.4f} {rf_accuracy:>14.4f} {dt_accuracy:>14.4f} {dummy_accuracy:>12.4f}")
print(f"{'Error Rate':<25} {nb_error_rate:>12.4f} {rf_error_rate:>14.4f} {dt_error_rate:>14.4f} {dummy_error_rate:>12.4f}")
print(f"{'Precision (Malignant)':<25} {nb_precision:>12.4f} {rf_precision:>14.4f} {dt_precision:>14.4f} {dummy_precision:>12.4f}")
print(f"{'Recall (Malignant)':<25} {nb_recall:>12.4f} {rf_recall:>14.4f} {dt_recall:>14.4f} {dummy_recall:>12.4f}")
print(f"{'F1 Score (Malignant)':<25} {nb_f1:>12.4f} {rf_f1:>14.4f} {dt_f1:>14.4f} {dummy_f1:>12.4f}")
print("="*90)



CLASSIFIER COMPARISON SUMMARY
Metric                     Naive Bayes  Random Forest  Decision Tree        Dummy
------------------------------------------------------------------------------------------
Accuracy                        0.9360         0.9660         0.9550       0.4640
Error Rate                      0.0640         0.0340         0.0450       0.5360
Precision (Malignant)           0.9504         0.9755         0.9559       0.4550
Recall (Malignant)              0.9200         0.9560         0.9540       0.3640
F1 Score (Malignant)            0.9350         0.9657         0.9550       0.4044


### More Data Visualization

In [None]:
AREA_WORST_IDX = 23
CONCAVE_PTS_IDX = 27

m_area_worst = [row[AREA_WORST_IDX] for row, label in zip(all_features, y) if label == 'M']
m_concave_pts = [row[CONCAVE_PTS_IDX] for row, label in zip(all_features, y) if label == 'M']

b_area_worst = [row[AREA_WORST_IDX] for row, label in zip(all_features, y) if label == 'B']
b_concave_pts = [row[CONCAVE_PTS_IDX] for row, label in zip(all_features, y) if label == 'B']

# Graph 1: Scatter Plot of Top 2 Features
plt.figure(figsize=(10, 6))
plt.scatter(b_area_worst, b_concave_pts, color='green', label='Benign', alpha=0.5)
plt.scatter(m_area_worst, m_concave_pts, color='red', label='Malignant', alpha=0.5)

plt.title("Scatter Plot: Area Worst vs. Concave Points Worst")
plt.xlabel("Area Worst")
plt.ylabel("Concave Points Worst")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

# Histogram of Concave Points Worst
plt.figure(figsize=(10, 6))
plt.hist(b_concave_pts, bins=20, color='green', alpha=0.5, label='Benign', density=True)
plt.hist(m_concave_pts, bins=20, color='red', alpha=0.5, label='Malignant', density=True)

plt.title("Distribution of Concave Points Worst by Diagnosis")
plt.xlabel("Concave Points Worst")
plt.ylabel("Density")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()