In [None]:
import importlib

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import (
    MyDecisionTreeClassifier,
    MyNaiveBayesClassifier,
    MyRandomForestClassifier,
)

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

### Load and Prepare Data

In [None]:
cancer_table = MyPyTable()
cancer_table.load_from_file('input_data/cancer.csv')

# Binning rules used to turn three numeric columns into categorical features.
# Each entry is (column index, [(exclusive upper bound, label), ...], label used
# when the value is not below any bound). Column 2 is treated as radius,
# column 3 as texture and column 4 as perimeter.
BINNING_RULES = [
    (2, [(12, "small"), (17, "medium")], "large"),
    (3, [(18, "smooth"), (22, "moderate")], "rough"),
    (4, [(85, "small_p"), (115, "medium_p")], "large_p"),
]


def discretize(value, bounds, top_label):
    """Map a numeric value to a categorical label.

    Args:
        value (float): the number to bin.
        bounds (list of (float, str)): ascending (exclusive upper bound, label) pairs.
        top_label (str): label returned when value is not below any bound.

    Returns:
        str: label of the first bin whose upper bound is greater than value,
            or top_label if there is none.
    """
    for upper_bound, label in bounds:
        if value < upper_bound:
            return label
    return top_label


X = []
y = []

for row in cancer_table.data:
    # Keep only rows that have every column we read and a valid diagnosis
    # label in column 1; this skips header, empty and truncated rows instead
    # of raising IndexError on them.
    if len(row) > 4 and row[1] in ('M', 'B'):
        X.append([
            discretize(float(row[column]), bounds, top_label)
            for column, bounds, top_label in BINNING_RULES
        ])
        y.append(row[1])  # M or B

print(f"Loaded {len(X)} instances from cancer dataset")
# Show up to two samples without assuming at least two instances were loaded.
for sample_features, sample_label in zip(X[:2], y[:2]):
    print(f"Sample X: {sample_features}, y: {sample_label}")

### Naive Bayes

In [None]:
# Build the shuffled 10-fold train/test index splits once; the same `folds`
# object is iterated again by the other classifier cells.
folds = myevaluation.kfold_split(X, n_splits=10, random_state=0, shuffle=True)

# Pooled true/predicted labels across every fold.
nb_all_y_true, nb_all_y_pred = [], []

print("Running Naive Bayes with 10-fold cross-validation...")
print("=" * 60)

for fold_number, (train_indices, test_indices) in enumerate(folds, start=1):
    # Slice the dataset by the index lists of this fold.
    X_train, y_train = [X[i] for i in train_indices], [y[i] for i in train_indices]
    X_test, y_test = [X[i] for i in test_indices], [y[i] for i in test_indices]

    # A fresh classifier per fold so no state carries over between folds.
    nb_clf = MyNaiveBayesClassifier()
    nb_clf.fit(X_train, y_train)
    y_pred = nb_clf.predict(X_test)

    # Accumulate results for the pooled metrics computed after the loop.
    nb_all_y_true += y_test
    nb_all_y_pred += y_pred

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_number}: Accuracy = {fold_accuracy:.4f}")

print("=" * 60)

# Pooled accuracy over all folds and its complement.
nb_accuracy = myevaluation.accuracy_score(nb_all_y_true, nb_all_y_pred)
nb_error_rate = 1 - nb_accuracy

print(f"\nOverall Accuracy: {nb_accuracy:.4f}")
print(f"Overall Error Rate: {nb_error_rate:.4f}")

labels = ["B", "M"]

# Binary metrics with 'M' (malignant) as the positive class.
nb_metric_kwargs = {"labels": labels, "pos_label": "M"}
nb_precision = myevaluation.binary_precision_score(
    nb_all_y_true, nb_all_y_pred, **nb_metric_kwargs)
nb_recall = myevaluation.binary_recall_score(
    nb_all_y_true, nb_all_y_pred, **nb_metric_kwargs)
nb_f1 = myevaluation.binary_f1_score(
    nb_all_y_true, nb_all_y_pred, **nb_metric_kwargs)

print(f"\nPrecision (Malignant): {nb_precision:.4f}")
print(f"Recall (Malignant): {nb_recall:.4f}")
print(f"F1 Score (Malignant): {nb_f1:.4f}")

### Random Forest

In [None]:
# Pooled true/predicted labels across every fold.
rf_all_y_true, rf_all_y_pred = [], []

print("Running Random Forest with 10-fold cross-validation...")
print("=" * 60)

# `folds` and `labels` are the objects created in the Naive Bayes cell, so all
# classifiers are scored on identical splits.
for fold_number, fold in enumerate(folds, start=1):
    train_indices, test_indices = fold

    # Slice the dataset by the index lists of this fold.
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    # Forest hyperparameters are passed explicitly; random_state is fixed so
    # every fold (and every re-run) uses the same seed.
    rf_clf = MyRandomForestClassifier(
        n_trees=20, m_trees=7, f_attributes=2, random_state=0)
    rf_clf.fit(X_train, y_train)
    y_pred = rf_clf.predict(X_test)

    # Accumulate results for the pooled metrics computed after the loop.
    rf_all_y_true += y_test
    rf_all_y_pred += y_pred

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_number}: Accuracy = {fold_accuracy:.4f}")

print("=" * 60)

# Pooled accuracy over all folds and its complement.
rf_accuracy = myevaluation.accuracy_score(rf_all_y_true, rf_all_y_pred)
rf_error_rate = 1 - rf_accuracy

print(f"\nOverall Accuracy: {rf_accuracy:.4f}")
print(f"Overall Error Rate: {rf_error_rate:.4f}")

# Binary metrics with 'M' (malignant) as the positive class.
rf_metric_kwargs = {"labels": labels, "pos_label": "M"}
rf_precision = myevaluation.binary_precision_score(
    rf_all_y_true, rf_all_y_pred, **rf_metric_kwargs)
rf_recall = myevaluation.binary_recall_score(
    rf_all_y_true, rf_all_y_pred, **rf_metric_kwargs)
rf_f1 = myevaluation.binary_f1_score(
    rf_all_y_true, rf_all_y_pred, **rf_metric_kwargs)

print(f"\nPrecision (Malignant): {rf_precision:.4f}")
print(f"Recall (Malignant): {rf_recall:.4f}")
print(f"F1 Score (Malignant): {rf_f1:.4f}")

### Decision Tree

In [None]:
# Decision Tree evaluated on the same 10 folds as the other classifiers.
# Requires MyDecisionTreeClassifier from the notebook's import cell; `folds`
# and `labels` are the objects created in the Naive Bayes cell.

# Pooled true/predicted labels across every fold.
dt_all_y_true = []
dt_all_y_pred = []

print("Running Decision Tree with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    # Slice the dataset by the index lists of this fold.
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    # A fresh classifier per fold so no state carries over between folds.
    dt_clf = MyDecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)

    y_pred = dt_clf.predict(X_test)

    # Accumulate results for the pooled metrics computed after the loop.
    dt_all_y_true.extend(y_test)
    dt_all_y_pred.extend(y_pred)

    # Per-fold accuracy, reported with a 1-based fold number.
    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

# Pooled accuracy over all folds and its complement.
dt_accuracy = myevaluation.accuracy_score(dt_all_y_true, dt_all_y_pred)
dt_error_rate = 1 - dt_accuracy

print(f"\nOverall Accuracy: {dt_accuracy:.4f}")
print(f"Overall Error Rate: {dt_error_rate:.4f}")

# Binary metrics with 'M' (malignant) as the positive class.
dt_precision = myevaluation.binary_precision_score(dt_all_y_true, dt_all_y_pred,
                                                     labels=labels, pos_label="M")
dt_recall = myevaluation.binary_recall_score(dt_all_y_true, dt_all_y_pred,
                                               labels=labels, pos_label="M")
dt_f1 = myevaluation.binary_f1_score(dt_all_y_true, dt_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {dt_precision:.4f}")
print(f"Recall (Malignant): {dt_recall:.4f}")
print(f"F1 Score (Malignant): {dt_f1:.4f}")

### Confusion Matrix

In [None]:
# Confusion matrix of the pooled Decision Tree predictions. The printout below
# treats rows as actual classes and columns as predicted classes, both in the
# order given by `labels` (["B", "M"]).
dt_confusion_matrix = myevaluation.confusion_matrix(dt_all_y_true, dt_all_y_pred, labels)

print("\nConfusion Matrix:")
print(f"{'':12} {'Predicted B':>15} {'Predicted M':>15}")
for actual_label, matrix_row in zip(labels, dt_confusion_matrix):
    row_title = f"Actual {actual_label}"
    print(f"{row_title:12} {matrix_row[0]:>15} {matrix_row[1]:>15}")
print("=" * 60)

# Comparison Summary: one table row per metric, one column per classifier.
summary_rows = [
    ("Accuracy", nb_accuracy, rf_accuracy, dt_accuracy),
    ("Error Rate", nb_error_rate, rf_error_rate, dt_error_rate),
    ("Precision (Malignant)", nb_precision, rf_precision, dt_precision),
    ("Recall (Malignant)", nb_recall, rf_recall, dt_recall),
    ("F1 Score (Malignant)", nb_f1, rf_f1, dt_f1),
]

print("\n\n" + "=" * 60)
print("CLASSIFIER COMPARISON SUMMARY")
print("=" * 60)
print(f"{'Metric':<25} {'Naive Bayes':>15} {'Random Forest':>15} {'Decision Tree':>15}")
print("-" * 75)
for metric_name, nb_value, rf_value, dt_value in summary_rows:
    print(f"{metric_name:<25} {nb_value:>15.4f} {rf_value:>15.4f} {dt_value:>15.4f}")
print("=" * 75)