In [None]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install tqdm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "darkgrid" theme to the figures drawn after this point.
sns.set_style("darkgrid")

# Read the ECG summary CSV into `data` and preview its first ten rows.
ecg_csv_path = "ecg_data_summary.csv"
data = pd.read_csv(ecg_csv_path)
data.head(10)

In [None]:
# Histogram of the target column (cast to int) to show how many records fall
# under each Arrhythmia value. Title and axis labels let the figure stand alone.
ax = data["Arrhythmia"].astype(int).hist()
ax.set(title="Distribution of Arrhythmia", xlabel="Arrhythmia", ylabel="Count")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Store the target as integers so it counts as a numeric column below
# (this overwrites the column in `data`, which later cells also read).
data["Arrhythmia"] = data["Arrhythmia"].astype(int)

# "number" selects every numeric dtype, not only the platform-default
# float/int widths, so no numeric column is dropped from the matrix.
numeric_data = data.select_dtypes(include="number")
corr = numeric_data.corr(method="pearson")

cmap = sns.diverging_palette(250, 354, 80, 60, center='dark', as_cmap=True)

# Pearson's r lies in [-1, 1]. Use the full symmetric range and pin the
# palette's midpoint to 0 so that colour encodes sign and magnitude evenly;
# a floor of -0.5 would saturate stronger negative correlations.
sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=cmap, square=True, linewidths=0.2)
plt.title("Pearson correlation of numeric columns")
plt.show()


In [None]:
# Narrow `data` to the columns used from here on; "Arrhythmia" is kept as the
# last column. Then preview the first ten rows of the reduced table.
selected_columns = [
    "Age",
    "BPM",
    "Average_RR_Interval",
    "Min_RR_Interval",
    "Max_RR_Interval",
    "PT_Interval",
    "RR_Median",
    "Arrhythmia",
]
data = data[selected_columns]
data.head(10)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Selecting relevant features
data = data[["Age", "BPM", "Average_RR_Interval", "Min_RR_Interval",
             "Max_RR_Interval", "PT_Interval", "RR_Median", "Arrhythmia"]]

# One (column, colour) pair per histogram, listed in display order.
histogram_colors = [
    ("Age", 'r'),
    ("BPM", 'b'),
    ("Average_RR_Interval", 'g'),
    ("Min_RR_Interval", 'orange'),
    ("Max_RR_Interval", 'purple'),
    ("PT_Interval", 'brown'),
    ("RR_Median", 'pink'),
    ("Arrhythmia", 'cyan'),
]

# Draw every distribution on its own 8x6 figure with a KDE overlay.
# The title is the column name with underscores shown as spaces,
# e.g. "Distribution of Average RR Interval".
for column, color in histogram_colors:
    plt.figure(figsize=(8, 6))
    sns.histplot(data, x=column, kde=True, color=color)
    plt.title(f"Distribution of {column.replace('_', ' ')}")
    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from tqdm import tqdm

# Restrict `data` to the model's columns, with the target "Arrhythmia" last
# (the code below relies on the target being the final column).
model_columns = [
    "Age",
    "BPM",
    "Average_RR_Interval",
    "Min_RR_Interval",
    "Max_RR_Interval",
    "PT_Interval",
    "RR_Median",
    "Arrhythmia",
]
data = data[model_columns]

def calculate_prior(df, Y):
    """Return the relative frequency of each value in column ``Y`` of ``df``.

    The result is a dict mapping every distinct value of ``df[Y]`` to its
    share of the rows, as computed by ``value_counts(normalize=True)``.
    """
    class_frequencies = df[Y].value_counts(normalize=True)
    return class_frequencies.to_dict()

def calculate_likelihood_categorical(df, feat_name, feat_val, Y, label):
    """Estimate P(feat_name == feat_val | Y == label) by counting rows.

    Among the rows of ``df`` whose ``Y`` column equals ``label``, return the
    fraction whose ``feat_name`` column equals ``feat_val`` exactly.
    Returns 0 when no row carries ``label``.
    """
    rows_in_class = df[df[Y] == label]
    class_size = len(rows_in_class)

    # Guard against dividing by zero when the label never occurs.
    if class_size == 0:
        return 0

    matching_rows = rows_in_class[rows_in_class[feat_name] == feat_val]
    return len(matching_rows) / class_size

def naive_bayes_categorical(df, X, Y):
    """Predict a label for every sample in ``X`` with count-based Naive Bayes.

    Parameters
    ----------
    df : pandas.DataFrame
        Training table holding the feature columns and the target column ``Y``.
    X : iterable of indexable samples
        Samples to classify. Position ``i`` of a sample is compared with the
        ``i``-th non-target column of ``df``, so values must follow ``df``'s
        column order with the target column left out.
    Y : str
        Name of the target column in ``df``.

    Returns
    -------
    numpy.ndarray
        One predicted label per sample, in the order of ``X``.

    Notes
    -----
    Likelihoods are exact-match relative frequencies with no smoothing, so a
    feature value never seen with a label makes that label's score 0. When
    every label scores 0, ``np.argmax`` falls back to the first (smallest)
    label.
    NOTE(review): for continuous-valued features exact matches are presumably
    rare, which would make that fallback common - confirm against the data.
    """
    # Every column except the target is a feature; the target does not have
    # to be the last column of df.
    features = [column for column in df.columns if column != Y]

    # Class priors P(Y = label), estimated once from the training table.
    prior = calculate_prior(df, Y)

    # The label set does not depend on the sample, so build it once. Missing
    # target values are dropped because they have no entry in `prior`.
    labels = sorted(df[Y].dropna().unique())

    Y_pred = []

    # Loop over every test sample
    for x in tqdm(X, desc="Processing samples", unit="sample"):
        # Unnormalised posterior per label: prior times product of likelihoods.
        post_prob = []
        for label in labels:
            likelihood = 1
            for i, feat_name in enumerate(features):
                likelihood *= calculate_likelihood_categorical(df, feat_name, x[i], Y, label)
            post_prob.append(likelihood * prior[label])

        # Predict the label with the largest posterior score.
        Y_pred.append(labels[np.argmax(post_prob)])

    return np.array(Y_pred)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
train, test = train_test_split(data, test_size=0.2, random_state=41)

# Separate the held-out rows into features (all columns but the last) and
# true labels (the last column, 'Arrhythmia').
X_test = test.iloc[:, :-1].to_numpy()
Y_test = test.iloc[:, -1].to_numpy()

# Classify the held-out samples using the training rows as the reference table.
Y_pred = naive_bayes_categorical(train, X_test, "Arrhythmia")

# Report the confusion matrix and the F1 score for the held-out predictions.
print("Confusion Matrix:", confusion_matrix(Y_test, Y_pred), sep="\n")
print("\nF1 Score:", f1_score(Y_test, Y_pred), sep="\n")
