# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

# Naive Bayes (NB) Algorithm

## Importing libraries

In [19]:
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from collections import defaultdict
import os
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

## Loading the dataset

In [12]:
def _load_feature_folder(folder):
    """Read every data file in ``folder`` and join the frames column-wise.

    Parameters
    ----------
    folder : str
        Directory whose files are each parsed with ``pd.read_csv``.

    Returns
    -------
    (list of str, pandas.DataFrame)
        The file names that were read (sorted), and the frames concatenated
        along ``axis=1``. An empty folder yields an empty DataFrame.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the column
    # order of the combined frame reproducible between runs and machines.
    # Sub-directories and hidden entries (e.g. ``.ipynb_checkpoints``) are
    # skipped because they cannot be parsed as CSV.
    file_names = sorted(
        name for name in os.listdir(folder)
        if not name.startswith('.') and os.path.isfile(os.path.join(folder, name))
    )

    # Read everything first and concatenate once instead of growing a
    # DataFrame inside the loop (which copies the accumulated data each time).
    frames = [pd.read_csv(os.path.join(folder, name)) for name in file_names]

    # NOTE(review): files are joined column-wise (axis=1), i.e. each file is
    # assumed to hold different columns for the same rows - confirm against
    # the layout of the data folders.
    combined = pd.concat(frames, axis=1) if frames else pd.DataFrame()
    return file_names, combined


# Load test data
test_folder = '../data/test/'
test_files, test_df = _load_feature_folder(test_folder)

# Load train data
train_folder = '../data/train/'
train_files, train_df = _load_feature_folder(train_folder)


### NB Implementation

In [13]:
class ImprovedNaiveBayes(ClassifierMixin, BaseEstimator):
    """Naive Bayes classifier over discrete feature values with mutual-information feature selection.

    Every distinct value of a feature is treated as its own category: ``fit``
    stores, per class and per selected feature, the smoothed relative
    frequency of each value observed in the training rows of that class.

    Parameters
    ----------
    feature_selection_threshold : float, default=0.01
        A feature is kept only if its estimated mutual information with the
        target is strictly greater than this value.
    smoothing : float, default=1e-10
        Additive constant used in the per-class value frequencies, and the
        probability assigned at prediction time to a value that was not seen
        for a class during ``fit``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_classif`` so the feature selection can be
        made reproducible. ``None`` keeps the previous (unseeded) behaviour.
    """

    # Mixins are listed before BaseEstimator, as scikit-learn's estimator
    # guidelines require for a correct method resolution order.

    def __init__(self, feature_selection_threshold=0.01, smoothing=1e-10, random_state=None):
        # Learned state, populated by fit().
        self.classes_ = None                # sorted unique labels seen in fit
        self.class_probabilities = {}       # label -> prior P(class)
        self.feature_probabilities = {}     # label -> list (one dict per kept feature) of value -> probability
        self.feature_weights = None         # boolean mask of the features kept by feature_selection
        # Hyper-parameters.
        self.feature_selection_threshold = feature_selection_threshold
        self.smoothing = smoothing
        self.random_state = random_state

    def feature_selection(self, X, y):
        """Return a boolean mask of the features whose mutual information with ``y`` exceeds the threshold."""
        mi_scores = mutual_info_classif(X, y, random_state=self.random_state)
        return mi_scores > self.feature_selection_threshold

    def fit(self, X, y):
        """Select features and estimate class priors and per-class value frequencies.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        X = np.array(X)
        y = np.array(y)

        selected_features = self.feature_selection(X, y)
        X = X[:, selected_features]

        self.feature_weights = selected_features
        self.classes_ = np.unique(y)
        n_samples, n_features = X.shape

        # Rebuild both tables from scratch so that refitting on data with a
        # different label set does not leave stale entries behind.
        self.class_probabilities = {
            cls: np.sum(y == cls) / n_samples for cls in self.classes_
        }
        self.feature_probabilities = {}

        for cls in self.classes_:
            X_cls = X[y == cls]
            cls_feature_probs = []
            for feature_idx in range(n_features):
                feature_values = X_cls[:, feature_idx]
                unique_vals, counts = np.unique(feature_values, return_counts=True)

                # Additively smoothed relative frequency of each observed value.
                # NOTE(review): unseen values later receive `smoothing` itself
                # rather than smoothing / denominator; kept as-is so existing
                # results do not change.
                denominator = len(feature_values) + self.smoothing * len(unique_vals)
                prob_dict = {
                    val: (count + self.smoothing) / denominator
                    for val, count in zip(unique_vals, counts)
                }
                cls_feature_probs.append(prob_dict)

            self.feature_probabilities[cls] = cls_feature_probs

        return self

    def predict(self, X):
        """Predict the label with the highest log-posterior score for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Must have the same columns, in the same order, as the data passed to ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if self.classes_ is None:
            # Local import: the notebook's import cell does not provide this name.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This ImprovedNaiveBayes instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )

        X = np.array(X)

        if self.feature_weights is not None:
            X = X[:, self.feature_weights]

        # The log-priors do not depend on the sample; compute them once.
        log_priors = {cls: np.log(self.class_probabilities[cls]) for cls in self.classes_}

        predictions = []

        for sample in X:
            class_scores = {}

            for cls in self.classes_:
                score = log_priors[cls]
                value_tables = self.feature_probabilities[cls]

                for feature_idx, feature_val in enumerate(sample):
                    # A value never observed for this class falls back to `smoothing`.
                    feature_prob = value_tables[feature_idx].get(feature_val, self.smoothing)
                    score += np.log(feature_prob)

                class_scores[cls] = score

            # On ties, the first class in self.classes_ order wins.
            predictions.append(max(class_scores, key=class_scores.get))

        return np.array(predictions)

    def get_params(self, deep=True):
        """Return the constructor hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {
            "feature_selection_threshold": self.feature_selection_threshold,
            "smoothing": self.smoothing,
            "random_state": self.random_state,
        }

    def set_params(self, **params):
        """Set each given keyword as an attribute on the estimator and return ``self``."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [24]:
def preprocess_data(train_df, test_size=0.3, random_state=42):
    """Split a labelled frame into train / hold-out parts and preprocess them without leakage.

    The frame is split first; imputation values, scaling statistics and
    category codes are then learned from the training rows only and applied
    to both parts, so no information from the hold-out rows influences the
    preprocessing.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the target column ``'attack_cat'``; all other columns
        are used as features.
    test_size : float, default=0.3
        Passed to ``train_test_split``.
    random_state : int, default=42
        Passed to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Numeric columns are median-imputed and standardised; non-numeric
        columns are mode-imputed and replaced by integer codes. A hold-out
        category that never occurs in the training rows is coded ``-1``.
    y_train, y_test : pandas.Series
        The (unencoded) ``'attack_cat'`` values of the corresponding rows.
    """
    X = train_df.drop('attack_cat', axis=1)
    y = train_df['attack_cat']

    numeric_columns = X.select_dtypes(include=['number']).columns
    categorical_columns = X.select_dtypes(exclude=['number']).columns

    # Split before fitting anything so hold-out rows cannot leak into the
    # statistics learned below.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # Work on independent copies so the column assignments below are safe.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Numeric gaps: fill with the training median.
    for col in numeric_columns:
        train_median = X_train[col].median()
        X_train[col] = X_train[col].fillna(train_median)
        X_test[col] = X_test[col].fillna(train_median)

    # Categorical gaps: fill with the most frequent training value.
    for col in categorical_columns:
        train_modes = X_train[col].mode()
        if train_modes.empty:
            # No observed value in the training rows; nothing to fill with.
            continue
        most_frequent = train_modes.iloc[0]
        X_train[col] = X_train[col].fillna(most_frequent)
        X_test[col] = X_test[col].fillna(most_frequent)

    # Standardise numeric columns with training mean / variance.
    # StandardScaler rejects an input with zero columns, hence the guard.
    if len(numeric_columns) > 0:
        scaler = StandardScaler()
        X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
        X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    # Integer-encode categorical columns using the categories seen in training.
    for col in categorical_columns:
        encoder = LabelEncoder()
        X_train[col] = encoder.fit_transform(X_train[col])
        code_of = {category: code for code, category in enumerate(encoder.classes_)}
        # LabelEncoder.transform raises on unseen labels, so map manually and
        # send categories absent from the training rows to -1.
        X_test[col] = X_test[col].astype(object).map(code_of).fillna(-1).astype(int)

    return X_train, X_test, y_train, y_test


In [25]:
def evaluate_model(X_train, y_train, model, cv_folds=5):
    """Cross-validate ``model`` on the training data and report its accuracy.

    Uses a shuffled ``KFold`` splitter (fixed seed 42) with ``cv_folds``
    folds, prints the mean and standard deviation of the per-fold accuracy as
    percentages, and returns the mean accuracy as a fraction.
    """
    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    fold_accuracies = cross_val_score(
        model, X_train, y_train, cv=splitter, scoring='accuracy'
    )

    mean_accuracy = fold_accuracies.mean()
    accuracy_spread = fold_accuracies.std()

    print(f"\nCross-Validation Accuracy (Mean): {mean_accuracy * 100:.2f}%")
    print(f"Cross-Validation Accuracy (Standard Deviation): {accuracy_spread * 100:.2f}%")

    return mean_accuracy

# Hold out part of the labelled training frame for the final comparison.
X_train, X_test, y_train, y_test = preprocess_data(train_df)

# Hand-written classifier: fit on the training partition, predict the hold-out rows.
# (k-fold estimate, currently disabled: evaluate_model(X_train, y_train, nb, cv_folds=5))
nb = ImprovedNaiveBayes(feature_selection_threshold=0.01).fit(X_train, y_train)
predictions = nb.predict(X_test)

# scikit-learn reference model, trained and evaluated on the same partitions.
# (k-fold estimate, currently disabled: evaluate_model(X_train, y_train, nb_sklearn, cv_folds=5))
nb_sklearn = BernoulliNB().fit(X_train, y_train)
predictions_sklearn = nb_sklearn.predict(X_test)

# Hold-out report for the custom implementation.
accuracy_custom = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"\nNaive Bayes kustom classification accuracy: {accuracy_custom * 100:.2f}%")
print("\nDetailed Classification Report (Custom Naive Bayes):")
print(classification_report(y_true=y_test, y_pred=predictions))

# Hold-out report for the scikit-learn baseline.
accuracy_sklearn = accuracy_score(y_true=y_test, y_pred=predictions_sklearn)
print(f"\nNaive Bayes from sklearn classification accuracy: {accuracy_sklearn * 100:.2f}%")
print("\nDetailed Classification Report (sklearn Naive Bayes):")
print(classification_report(y_true=y_test, y_pred=predictions_sklearn))




Cross-Validation Accuracy (Mean): 79.93%
Cross-Validation Accuracy (Standard Deviation): 0.25%

Cross-Validation Accuracy (Mean): 70.77%
Cross-Validation Accuracy (Standard Deviation): 0.21%
