In [9]:
import pandas as pd
import numpy as np


class BernoulliNaiveBayesFromScratch:
    """Bernoulli Naive Bayes classifier for binary (0/1) features.

    Every feature is modelled as an independent Bernoulli variable per class.
    The per-class feature probabilities are estimated with additive smoothing:

        P(x_j = 1 | c) = (count_1 + alpha) / (n_c + 2 * alpha)

    where ``count_1`` is the number of class-``c`` training rows whose feature
    ``j`` equals 1 and ``n_c`` is the number of class-``c`` training rows.
    Any feature value that is not equal to 1 is treated as 0, both while
    counting in ``fit`` and while scoring in ``predict``.

    Attributes set by ``fit``:
        classes: sorted unique labels found in ``y``.
        n_classes: number of distinct labels.
        n_features: number of feature columns in the training matrix.
        class_priors: shape (n_classes,), fraction of training rows per class.
        feature_prob_1: shape (n_classes, n_features), P(x_j = 1 | class).
        feature_prob_0: shape (n_classes, n_features), 1 - feature_prob_1.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted model.

        Args:
            alpha: additive smoothing strength. The default 1.0 is Laplace
                smoothing; 0 disables smoothing (probabilities may then be
                exactly 0 or 1).
        """
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Args:
            X: array-like of shape (n_samples, n_features) holding 0/1 values.
            y: array-like of shape (n_samples,) holding the class labels.

        Returns:
            The fitted model itself, so calls can be chained.

        Raises:
            ValueError: if ``X`` is not 2-D, ``y`` does not have one label per
                row of ``X``, the training set is empty, or ``alpha`` is
                negative.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got {X.ndim}-D.")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                "y must be 1-D with exactly one label per row of X.")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set.")
        alpha = self.alpha
        if alpha < 0:
            raise ValueError("alpha must be non-negative.")

        n_samples = X.shape[0]
        self.classes = np.unique(y)
        self.n_classes = len(self.classes)
        self.n_features = X.shape[1]
        self.class_priors = np.zeros(self.n_classes, dtype=np.float64)
        # Probability of feature being 1 given class (P(X_i=1|y))
        self.feature_prob_1 = np.zeros(
            (self.n_classes, self.n_features), dtype=np.float64)

        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            n_c = X_c.shape[0]
            self.class_priors[i] = n_c / n_samples
            # Count the ones of every feature column in a single pass.
            count_1 = np.sum(X_c == 1, axis=0)
            # The "2 *" is the number of outcomes a Bernoulli feature has.
            self.feature_prob_1[i] = (count_1 + alpha) / (n_c + 2 * alpha)

        # Probability of feature being 0 given class (P(X_i=0|y))
        self.feature_prob_0 = 1.0 - self.feature_prob_1
        return self

    def predict(self, X):
        """Predict the most probable class label for each row of ``X``.

        Args:
            X: array-like of shape (n_samples, n_features).

        Returns:
            1-D array of predicted labels, one per row, taken from
            ``self.classes``. Ties go to the class that sorts first.

        Raises:
            AttributeError: if the model has not been fitted yet.
            ValueError: if ``X`` is not 2-D or its column count differs from
                the training data.
        """
        self._check_is_fitted()
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got {X.ndim}-D.")
        if X.shape[1] != self.n_features:
            raise ValueError(
                f"X has {X.shape[1]} features, but the model was fitted "
                f"with {self.n_features}.")
        scores = self._joint_log_likelihood(X)
        # np.argmax returns the first maximum, i.e. the lowest class index.
        return self.classes[np.argmax(scores, axis=1)]

    def _predict_sample(self, x):
        """Predict the label of a single 1-D feature vector ``x``."""
        return self.predict(np.asarray(x).reshape(1, -1))[0]

    def _joint_log_likelihood(self, X):
        """Return log P(c) + sum_j log P(x_j | c) for every row and class.

        Works in log space to avoid underflow when many small probabilities
        are multiplied. The result has shape (n_samples, n_classes).
        """
        is_one = X == 1
        # log(0) can only occur when smoothing is disabled; the resulting
        # -inf score is meaningful, so silence the warning instead of failing.
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.class_priors)
            log_p1 = np.log(self.feature_prob_1)
            log_p0 = np.log(self.feature_prob_0)

        scores = np.empty((X.shape[0], self.n_classes), dtype=np.float64)
        for i in range(self.n_classes):
            # Select (rather than multiply by) the matching log-probability so
            # that a -inf entry never turns into NaN via 0 * -inf.
            scores[:, i] = np.where(is_one, log_p1[i], log_p0[i]).sum(axis=1)
        return scores + log_prior

    def _check_is_fitted(self):
        """Raise a descriptive AttributeError when ``fit`` was never called."""
        if not hasattr(self, "feature_prob_1"):
            raise AttributeError(
                "This BernoulliNaiveBayesFromScratch instance is not fitted "
                "yet; call fit(X, y) before predict.")



# --- Data Loading and Preprocessing ---
# Load the breast cancer dataset from a local CSV file.
# The file can be downloaded from the UCI Machine Learning Repository or from
# a source such as Kaggle. The path passed to pd.read_csv below is specific to
# one machine; point it at wherever your copy of data.csv is stored.
# Locations tried, in order, when looking for the dataset. The first entry is
# the original machine-specific location; the second is 'data.csv' in the
# current working directory, which is where the message below tells the user
# to put the file.
data_path_candidates = (
    r"C:\Users\User\OneDrive\Desktop\project semister 1\breast cancer\data.csv",
    "data.csv",
)

df = None  # Stays None when no candidate could be read; checked further down
for data_path in data_path_candidates:
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        continue  # Try the next candidate location
    break  # Loaded successfully, stop searching
else:
    # Runs only when the loop finished without a successful load.
    print("Dataset file not found. Please ensure 'data.csv' is in the directory.")
    # NOTE: the data could instead be fetched from a URL with pd.read_csv, but
    # the column layout would differ and the code below would need adjusting.
    # No need to exit here: the 'if df is not None' check below handles it.

# Only proceed if df was successfully loaded
if df is not None:
    # Keep a separate copy that retains the continuous feature values, because
    # df itself is binarized in place further down.
    df_continuous = df.copy()

    # Drop the ID column and the unnamed column from both frames.
    # errors='ignore' keeps this working for copies of the CSV that do not
    # contain one of these columns instead of failing with a KeyError.
    non_feature_columns = ['id', 'Unnamed: 32']
    df_continuous = df_continuous.drop(
        non_feature_columns, axis=1, errors='ignore')
    df = df.drop(non_feature_columns, axis=1, errors='ignore')

    # Convert diagnosis to binary in both frames:
    # 'M' (Malignant) = 1, 'B' (Benign) = 0
    label_map = {'M': 1, 'B': 0}
    df_continuous['diagnosis'] = df_continuous['diagnosis'].map(label_map)
    df['diagnosis'] = df['diagnosis'].map(label_map)

    # Binarize the features for Bernoulli NB: 1 if value > column mean, else 0.
    # Features are selected by name so the target is never binarized by
    # accident, wherever the 'diagnosis' column sits in the frame.
    # NOTE(review): each mean is taken over all rows, i.e. before the
    # train/test split, so test rows influence the thresholds. Confirm whether
    # that is acceptable for the reported accuracy.
    feature_columns = [col for col in df.columns if col != 'diagnosis']
    for column in feature_columns:
        mean_val = df[column].mean()
        df[column] = (df[column] > mean_val).astype(int)

    # Split data for Bernoulli NB into features (X) and target (y)
    X = df.drop('diagnosis', axis=1).values
    y = df['diagnosis'].values


    # Split into training and testing sets (manual split as no sklearn is allowed)
    def train_test_split_manual(X_data, y_data, test_size=0.2, random_state=42):
        """Shuffle the samples and split them into train and test subsets.

        Args:
            X_data: array-like whose first axis indexes the samples.
            y_data: array-like of labels with one entry per sample.
            test_size: fraction of the samples (between 0 and 1) placed in the
                test subset; the count is truncated towards zero.
            random_state: seed for the shuffle. The same seed always produces
                the same split; None draws a fresh, unpredictable seed.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

        Raises:
            ValueError: if ``X_data`` and ``y_data`` differ in length, or if
                ``test_size`` lies outside [0, 1].
        """
        X_data = np.asarray(X_data)
        y_data = np.asarray(y_data)
        n_samples = X_data.shape[0]
        if y_data.shape[0] != n_samples:
            raise ValueError(
                "X_data and y_data must contain the same number of samples.")
        if not 0.0 <= test_size <= 1.0:
            raise ValueError("test_size must be between 0 and 1.")

        # A private legacy RandomState produces exactly the permutation that
        # np.random.seed(random_state) followed by np.random.shuffle would,
        # but leaves the process-wide NumPy random state untouched.
        rng = np.random.RandomState(random_state)
        indices = np.arange(n_samples)
        rng.shuffle(indices)

        # The first test_samples shuffled indices form the test subset.
        test_samples = int(n_samples * test_size)
        test_indices = indices[:test_samples]
        train_indices = indices[test_samples:]
        X_train, X_test = X_data[train_indices], X_data[test_indices]
        y_train, y_test = y_data[train_indices], y_data[test_indices]
        return X_train, X_test, y_train, y_test

    # For Bernoulli Naive Bayes
    X_train_bnb, X_test_bnb, y_train_bnb, y_test_bnb = train_test_split_manual(
        X, y, test_size=0.2)



    # --- Train and Evaluate Bernoulli NB ---
    bnb = BernoulliNaiveBayesFromScratch()
    bnb.fit(X_train_bnb, y_train_bnb)
    y_pred_bnb = bnb.predict(X_test_bnb)

    # Calculate accuracy for Bernoulli NB
    accuracy_bnb = np.sum(y_pred_bnb == y_test_bnb) / len(y_test_bnb)
    print(f"Bernoulli Naive Bayes Accuracy: {accuracy_bnb * 100:.2f}%")

else:
    print("Cannot proceed with model training and evaluation due to missing 'data.csv'.")


Bernoulli Naive Bayes Accuracy: 98.23%
