In [1]:
import numpy as np
import pandas as pd

In [2]:
class NaiveBayes:
    """Classifier whose ``fit`` records class labels and two probability tables."""

    def fit(self, X, y):
        """Learn the label set and probability tables from training data.

        Stores three attributes on the instance, in this order:

        * ``classes`` -- the sorted distinct values of ``y``.
        * ``class_probabilities`` -- result of
          ``self.calculate_class_probabilities(y)``.
        * ``feature_probabilities`` -- result of
          ``self.calculate_feature_probabilities(X, y)``.

        Both helper methods must be available on the instance; they are not
        defined in this class body.

        Args:
            X: Training features, forwarded unchanged to the helper.
            y: Training labels.

        Returns:
            None.
        """
        # The label set is stored first so that it is already in place when
        # the helper methods run.
        labels = np.unique(y)
        self.classes = labels

        priors = self.calculate_class_probabilities(y)
        self.class_probabilities = priors

        likelihoods = self.calculate_feature_probabilities(X, y)
        self.feature_probabilities = likelihoods

In [3]:
def calculate_class_probabilities(self, y):
        """Return the relative frequency of every integer label in ``y``.

        Args:
            y: Array of non-negative integer labels (a requirement of
                ``np.bincount``).

        Returns:
            A float array in which position ``k`` holds the fraction of
            entries of ``y`` equal to ``k``. Labels below the maximum that
            never occur get ``0.0``.
        """
        counts = np.bincount(y)
        total = float(counts.sum())
        return counts / total

In [4]:
def calculate_feature_probabilities(self, X, y):
        """Estimate add-one smoothed value probabilities per feature and class.

        For every column of ``X`` and every label in ``self.classes``, counts
        how often each integer value occurs among the rows of that class and
        converts the counts to ``(count + 1) / (n_class_rows + n_distinct)``,
        where ``n_distinct`` is the number of distinct values in the whole
        column.

        Args:
            X: 2-D array of non-negative integer feature values (a requirement
                of ``np.bincount``), one row per sample.
            y: 1-D array of labels aligned with the rows of ``X``.

        Returns:
            ``{feature_index: {cls: table}}`` where ``table[v]`` is the
            smoothed probability of value ``v`` for that feature and class.
            Every table of a feature has the same length: the column's
            largest value plus one.
        """
        X = np.asarray(X)
        # A plain list would make ``y == cls`` a single bool instead of a mask.
        y = np.asarray(y)
        feature_probabilities = {}

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            n_distinct = len(np.unique(column))
            # Size every class's table from the whole column. Without this a
            # class that never saw the column's larger values would get a
            # shorter table, and looking those values up would raise
            # IndexError instead of returning the smoothed probability.
            table_size = int(column.max()) + 1 if column.size else 0

            probabilities = {}
            for cls in self.classes:
                value_counts = np.bincount(column[y == cls], minlength=table_size)
                denominator = float(value_counts.sum() + n_distinct)
                probabilities[cls] = (value_counts + 1) / denominator

            feature_probabilities[feature_index] = probabilities

        return feature_probabilities


In [5]:
def predict(self, X):
        """Predict the most probable class label for every row of ``X``.

        Each class is scored as the log of its entry in
        ``self.class_probabilities`` plus the logs of the per-feature entries
        in ``self.feature_probabilities``; the label with the highest score
        is returned.

        Args:
            X: 2-D array, one row per sample. Each value is used directly as
                an index into the fitted probability table for its feature,
                so values must be integers covered by those tables.

        Returns:
            A 1-D array holding one entry of ``self.classes`` per row of
            ``X`` (empty when ``X`` has no rows).
        """
        X = np.asarray(X)
        predictions = []

        for row in X:
            log_scores = []

            for cls in self.classes:
                # Sum logs rather than multiply probabilities: with many
                # features the raw product underflows to 0.0 for every class
                # and the argmax degenerates into "first class wins".
                log_score = np.log(self.class_probabilities[cls])

                for feature_index, feature_value in enumerate(row):
                    table = self.feature_probabilities[feature_index][cls]
                    log_score += np.log(table[feature_value])

                log_scores.append(log_score)

            # argmax yields a position within self.classes; map it back to the
            # label so label sets other than 0..k-1 are reported correctly.
            predictions.append(self.classes[np.argmax(log_scores)])

        return np.array(predictions)

In [14]:
def preprocess_data(df):
    """Convert a passenger DataFrame into feature and label arrays.

    Steps, in order:

    1. Drop the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns.
    2. Fill missing ``Age`` and ``Fare`` with the column mean and missing
       ``Embarked`` with the column's most frequent value.
    3. One-hot encode ``Sex`` and ``Embarked``.
    4. Split off ``Survived`` as the label vector.

    Args:
        df: DataFrame containing at least the columns named above plus
            ``Survived``. It is not modified.

    Returns:
        ``(X, y)``: ``X`` is the array of all remaining columns except
        ``Survived``; ``y`` is the ``Survived`` column as an array.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Assign the filled columns back rather than calling
    # fillna(inplace=True) on a column selection: that form is chained
    # assignment, which is deprecated and does not update ``df`` when pandas
    # Copy-on-Write is active, leaving the NaNs in place.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Convert categorical variables to one-hot encoding
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'])

    # Convert DataFrame to numpy arrays
    X = df.drop('Survived', axis=1).values
    y = df['Survived'].values

    return X, y
