In [None]:
import importlib
from collections import Counter

import matplotlib.pyplot
import matplotlib.pyplot as plt
import numpy as np

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier, MyRandomForestClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

: 

### Load and Prepare Data

In [None]:
# Column layout of the CSV as this notebook reads it: column 1 holds the
# diagnosis label and the next 30 columns (2..31) hold the numeric features.
LABEL_COL = 1
FIRST_FEATURE_COL = 2
N_FEATURES = 30
VALID_LABELS = ('M', 'B')
QUARTILE_PERCENTS = (25, 50, 75)


def quartile_bin(value, q1, q2, q3):
    """Return the name of the quartile bin that ``value`` falls into.

    Args:
        value: numeric feature value.
        q1, q2, q3: the 25th, 50th and 75th percentile cut points of the feature.

    Returns:
        "low" if value <= q1, "med_low" if value <= q2, "med_high" if
        value <= q3, otherwise "high" (upper bounds are inclusive).
    """
    if value <= q1:
        return "low"
    if value <= q2:
        return "med_low"
    if value <= q3:
        return "med_high"
    return "high"


cancer_table = MyPyTable()
cancer_table.load_from_file('input_data/cancer.csv')

# Single pass over the table: keep only rows whose diagnosis is 'M' or 'B',
# storing the numeric features and the label side by side so that
# all_features[i] and y[i] always describe the same instance.
all_features = []
y = []
for row in cancer_table.data:
    if row[LABEL_COL] in VALID_LABELS:
        all_features.append(
            [float(row[FIRST_FEATURE_COL + offset]) for offset in range(N_FEATURES)]
        )
        y.append(row[LABEL_COL])  # M or B

# Fail early with a readable message instead of an IndexError further down.
assert len(all_features) >= 2, "expected at least two labelled rows in input_data/cancer.csv"

# Quartiles for each feature: one (q1, q2, q3) tuple per feature column.
quartiles = []
for feature_idx in range(N_FEATURES):
    feature_column = [features[feature_idx] for features in all_features]
    quartiles.append(
        tuple(np.percentile(feature_column, percent) for percent in QUARTILE_PERCENTS)
    )

# Discretize every feature into 4 quartile bins. Bin names are prefixed with
# the feature index ("f3_low", "f17_high", ...) so values of different
# features never compare equal.
X = [
    [
        f"f{feature_idx}_{quartile_bin(value, *quartiles[feature_idx])}"
        for feature_idx, value in enumerate(features)
    ]
    for features in all_features
]

assert len(X) == len(y) == len(all_features)

print(f"Loaded {len(X)} instances from cancer dataset")
print(f"Sample X: {X[0]}, y: {y[0]}")
print(f"Sample X: {X[1]}, y: {y[1]}")

### Visualizing Class Distribution

In [None]:
# Count how many instances carry each diagnosis label.
class_counts = Counter(y)

# Display names get their own variable. The name `labels` is used by the
# classifier cells for the raw class labels ("B", "M") handed to the metric
# functions, so binding display strings to it here would corrupt those metrics
# whenever this cell is re-run out of order.
pie_labels = ['Benign (B)', 'Malignant (M)']
sizes = [class_counts['B'], class_counts['M']]

# Pie chart distribution
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=pie_labels, autopct='%1.1f%%')
ax.set_title("Distribution of Tumor Types")
plt.show()

### K-Fold Cross-Validation Splits

In [None]:
# Build the 10 shuffled train/test index splits once, with a fixed seed, so
# every classifier below is evaluated on exactly the same folds.
# NOTE(review): the later cells loop over `folds` three times, which assumes
# kfold_split returns a reusable sequence rather than a one-shot iterator.
folds = myevaluation.kfold_split(
    X,
    n_splits=10,
    random_state=0,
    shuffle=True,
)

### Naive Bayes

In [None]:
# Ground truth and predictions pooled over all test folds; the overall
# metrics below are computed on these pooled lists, not averaged per fold.
nb_all_y_true = []
nb_all_y_pred = []

print("Running Naive Bayes with 10-fold cross-validation...")
print("="*60)

for fold_number, (train_indices, test_indices) in enumerate(folds, start=1):
    # Materialize this fold's train/test partitions from the index lists.
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    # A fresh classifier per fold so nothing learned carries over.
    nb_clf = MyNaiveBayesClassifier()
    nb_clf.fit(X_train, y_train)
    y_pred = nb_clf.predict(X_test)

    nb_all_y_true += y_test
    nb_all_y_pred += y_pred

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_number}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

nb_accuracy = myevaluation.accuracy_score(nb_all_y_true, nb_all_y_pred)
nb_error_rate = 1 - nb_accuracy

print(f"\nOverall Accuracy: {nb_accuracy:.4f}")
print(f"Overall Error Rate: {nb_error_rate:.4f}")

# Raw class labels; malignant ("M") is treated as the positive class.
labels = ["B", "M"]
metric_options = {"labels": labels, "pos_label": "M"}

nb_precision = myevaluation.binary_precision_score(
    nb_all_y_true, nb_all_y_pred, **metric_options)
nb_recall = myevaluation.binary_recall_score(
    nb_all_y_true, nb_all_y_pred, **metric_options)
nb_f1 = myevaluation.binary_f1_score(
    nb_all_y_true, nb_all_y_pred, **metric_options)

print(f"\nPrecision (Malignant): {nb_precision:.4f}")
print(f"Recall (Malignant): {nb_recall:.4f}")
print(f"F1 Score (Malignant): {nb_f1:.4f}")

### Random Forest

In [None]:
# Raw class labels for the metric functions, with malignant ("M") as the
# positive class. Defined in this cell so it does not depend on whichever
# other cell last assigned the global `labels`.
labels = ["B", "M"]

# Ground truth and predictions pooled over all test folds; the overall
# metrics below are computed on these pooled lists, not averaged per fold.
rf_all_y_true = []
rf_all_y_pred = []

print("Running Random Forest with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    # Materialize this fold's train/test partitions from the index lists.
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    # A fresh, identically seeded forest per fold.
    # NOTE(review): the meaning of n_trees / m_trees / f_attributes is defined
    # by MyRandomForestClassifier, which is not visible from this notebook.
    rf_clf = MyRandomForestClassifier(n_trees=20, m_trees=7, f_attributes=10, random_state=0)
    rf_clf.fit(X_train, y_train)

    y_pred = rf_clf.predict(X_test)

    rf_all_y_true.extend(y_test)
    rf_all_y_pred.extend(y_pred)

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

rf_accuracy = myevaluation.accuracy_score(rf_all_y_true, rf_all_y_pred)
rf_error_rate = 1 - rf_accuracy

print(f"\nOverall Accuracy: {rf_accuracy:.4f}")
print(f"Overall Error Rate: {rf_error_rate:.4f}")

rf_precision = myevaluation.binary_precision_score(rf_all_y_true, rf_all_y_pred,
                                                     labels=labels, pos_label="M")
rf_recall = myevaluation.binary_recall_score(rf_all_y_true, rf_all_y_pred,
                                               labels=labels, pos_label="M")
rf_f1 = myevaluation.binary_f1_score(rf_all_y_true, rf_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {rf_precision:.4f}")
print(f"Recall (Malignant): {rf_recall:.4f}")
print(f"F1 Score (Malignant): {rf_f1:.4f}")

### Decision Tree

In [None]:
# Raw class labels for the metric functions, with malignant ("M") as the
# positive class. Defined in this cell so it does not depend on whichever
# other cell last assigned the global `labels`.
labels = ["B", "M"]

# Ground truth and predictions pooled over all test folds; the overall
# metrics below are computed on these pooled lists, not averaged per fold.
dt_all_y_true = []
dt_all_y_pred = []

print("Running Decision Tree with 10-fold cross-validation...")
print("="*60)

for fold_idx, (train_indices, test_indices) in enumerate(folds):
    # Materialize this fold's train/test partitions from the index lists.
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    # A fresh tree per fold so nothing learned carries over.
    dt_clf = MyDecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)

    y_pred = dt_clf.predict(X_test)

    dt_all_y_true.extend(y_test)
    dt_all_y_pred.extend(y_pred)

    fold_accuracy = myevaluation.accuracy_score(y_test, y_pred)
    print(f"Fold {fold_idx + 1}: Accuracy = {fold_accuracy:.4f}")

print("="*60)

# Calculate overall metrics for Decision Tree
dt_accuracy = myevaluation.accuracy_score(dt_all_y_true, dt_all_y_pred)
dt_error_rate = 1 - dt_accuracy

print(f"\nOverall Accuracy: {dt_accuracy:.4f}")
print(f"Overall Error Rate: {dt_error_rate:.4f}")

dt_precision = myevaluation.binary_precision_score(dt_all_y_true, dt_all_y_pred,
                                                     labels=labels, pos_label="M")
dt_recall = myevaluation.binary_recall_score(dt_all_y_true, dt_all_y_pred,
                                               labels=labels, pos_label="M")
dt_f1 = myevaluation.binary_f1_score(dt_all_y_true, dt_all_y_pred,
                                      labels=labels, pos_label="M")

print(f"\nPrecision (Malignant): {dt_precision:.4f}")
print(f"Recall (Malignant): {dt_recall:.4f}")
print(f"F1 Score (Malignant): {dt_f1:.4f}")

### Visualizations of Classifier Performance

In [None]:
# Classifier comparison bar chart: one group of bars per metric, one bar per
# classifier within each group.
metrics = ['Accuracy', 'Precision (M)', 'Recall (M)', 'F1 Score (M)']
nb_vals = [nb_accuracy, nb_precision, nb_recall, nb_f1]
rf_vals = [rf_accuracy, rf_precision, rf_recall, rf_f1]
dt_vals = [dt_accuracy, dt_precision, dt_recall, dt_f1]
x = np.arange(len(metrics))
width = 0.25  # bar width; the three bars of a group sit at x - width, x, x + width

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width, nb_vals, width, label='Naive Bayes')
ax.bar(x, rf_vals, width, label='Random Forest')
ax.bar(x + width, dt_vals, width, label='Decision Tree')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_ylabel("Score")
ax.set_title("Classifier Performance Comparison")
ax.legend()
ax.set_ylim(0, 1)
plt.show()

# Hard-coded feature selection for the boxplots. Indices are positions inside
# each all_features row (i.e. CSV column index minus 2).
# NOTE(review): how these five were ranked as "top" is not shown in this notebook.
top_5_indices = [23, 27, 7, 20, 6]
top_5_names = ['Area Worst', 'Concave Pts Worst', 'Concave Pts Mean', 'Radius Worst', 'Concavity Mean']

# One subplot per selected feature, arranged in a single row.
fig, axes = plt.subplots(1, len(top_5_indices), figsize=(20, 6), squeeze=False)

for ax, feature_idx, feature_name in zip(axes[0], top_5_indices, top_5_names):
    # Extract data for Benign and Malignant for the current feature
    b_data = [row[feature_idx] for row, label in zip(all_features, y) if label == 'B']
    m_data = [row[feature_idx] for row, label in zip(all_features, y) if label == 'M']

    # Tick labels are set separately rather than through boxplot(labels=...),
    # which is deprecated since Matplotlib 3.9 in favour of tick_labels.
    ax.boxplot([b_data, m_data], patch_artist=True)
    ax.set_xticklabels(['Benign', 'Malignant'])
    ax.set_title(feature_name)
    ax.grid(True, linestyle='--', alpha=0.3)

fig.tight_layout()
plt.show()

### Confusion Matrix and Classifier Comparison Summary

In [None]:
# Confusion matrix for the Decision Tree's pooled cross-validation predictions.
# Rows are indexed by actual class and columns by predicted class, in the
# order given by `labels`.
dt_confusion_matrix = myevaluation.confusion_matrix(dt_all_y_true, dt_all_y_pred, labels)

print("\nConfusion Matrix:")
print(f"{'':12} {'Predicted B':>15} {'Predicted M':>15}")
for row_title, counts in zip(("Actual B", "Actual M"), dt_confusion_matrix):
    print(f"{row_title:12} {counts[0]:>15} {counts[1]:>15}")
print("="*60)

# Comparison Summary: one row per metric, one column per classifier.
summary_rows = (
    ("Accuracy", nb_accuracy, rf_accuracy, dt_accuracy),
    ("Error Rate", nb_error_rate, rf_error_rate, dt_error_rate),
    ("Precision (Malignant)", nb_precision, rf_precision, dt_precision),
    ("Recall (Malignant)", nb_recall, rf_recall, dt_recall),
    ("F1 Score (Malignant)", nb_f1, rf_f1, dt_f1),
)

print("\n\n" + "="*60)
print("CLASSIFIER COMPARISON SUMMARY")
print("="*60)
print(f"{'Metric':<25} {'Naive Bayes':>15} {'Random Forest':>15} {'Decision Tree':>15}")
print("-"*75)
for metric_name, nb_score, rf_score, dt_score in summary_rows:
    print(f"{metric_name:<25} {nb_score:>15.4f} {rf_score:>15.4f} {dt_score:>15.4f}")
print("="*75)

### More Data Visualization

In [None]:
# Positions of the two plotted features inside each all_features row.
AREA_WORST_IDX = 23
CONCAVE_PTS_IDX = 27

# Split the numeric feature rows by diagnosis once, then pull out the columns.
malignant_rows = [row for row, label in zip(all_features, y) if label == 'M']
benign_rows = [row for row, label in zip(all_features, y) if label == 'B']

m_area_worst = [row[AREA_WORST_IDX] for row in malignant_rows]
m_concave_pts = [row[CONCAVE_PTS_IDX] for row in malignant_rows]

b_area_worst = [row[AREA_WORST_IDX] for row in benign_rows]
b_concave_pts = [row[CONCAVE_PTS_IDX] for row in benign_rows]

# Graph 1: Scatter Plot of Top 2 Features
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(b_area_worst, b_concave_pts, color='green', label='Benign', alpha=0.5)
scatter_ax.scatter(m_area_worst, m_concave_pts, color='red', label='Malignant', alpha=0.5)

scatter_ax.set_title("Scatter Plot: Area Worst vs. Concave Points Worst")
scatter_ax.set_xlabel("Area Worst")
scatter_ax.set_ylabel("Concave Points Worst")
scatter_ax.legend()
scatter_ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# Histogram of Concave Points Worst; density=True normalizes each class
# separately so the two differently sized groups are comparable.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(b_concave_pts, bins=20, color='green', alpha=0.5, label='Benign', density=True)
hist_ax.hist(m_concave_pts, bins=20, color='red', alpha=0.5, label='Malignant', density=True)

hist_ax.set_title("Distribution of Concave Points Worst by Diagnosis")
hist_ax.set_xlabel("Concave Points Worst")
hist_ax.set_ylabel("Density")
hist_ax.legend()
hist_ax.grid(True, linestyle='--', alpha=0.5)
plt.show()