# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

# NB Algorithma

## Importing libraries

In [27]:
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets
import matplotlib.pyplot as plt
from collections import defaultdict
import os
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

## Loading the dataset

In [123]:
import os
import pandas as pd

def load_data_from_folder(folder_path):
    files = os.listdir(folder_path)
    
    df_list = []
    
    for file in files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        df_list.append(df)
    
    combined_df = df_list[0]
    
    for df in df_list[1:]:
        combined_df = pd.merge(combined_df, df, on='id', how='outer')
    
    return combined_df

test_folder = '../data/test/'
test_df = load_data_from_folder(test_folder)

train_folder = '../data/train/'
train_df = load_data_from_folder(train_folder)


### NB Implementation

In [148]:
class ImprovedNaiveBayes(BaseEstimator, ClassifierMixin):
    def __init__(self, feature_selection_threshold=0.01, smoothing=1e-10):
        self.classes_ = None 
        self.class_probabilities = {}
        self.feature_probabilities = {}
        self.feature_weights = None
        self.feature_selection_threshold = feature_selection_threshold
        self.smoothing = smoothing
    
    def feature_selection(self, X, y):
        mi_scores = mutual_info_classif(X, y)
        selected_features = mi_scores > self.feature_selection_threshold
        return selected_features
    
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        selected_features = self.feature_selection(X, y)
        X = X[:, selected_features]
        
        self.feature_weights = selected_features
        self.classes_ = np.unique(y)  
        n_samples, n_features = X.shape
        
        for cls in self.classes_:
            self.class_probabilities[cls] = np.sum(y == cls) / n_samples
        
        self.feature_probabilities = {}
        
        for cls in self.classes_:
            X_cls = X[y == cls]
            cls_feature_probs = []
            for feature_idx in range(n_features):
                feature_values = X_cls[:, feature_idx]
                unique_vals, counts = np.unique(feature_values, return_counts=True)
                
                prob_dict = {val: (count + self.smoothing) / (len(feature_values) + self.smoothing * len(unique_vals)) 
                             for val, count in zip(unique_vals, counts)}
                cls_feature_probs.append(prob_dict)
            
            self.feature_probabilities[cls] = cls_feature_probs
        
        return self
    
    def predict(self, X):
        X = np.array(X)
        
        if self.feature_weights is not None:
            X = X[:, self.feature_weights]
        
        predictions = []
        
        for sample in X:
            class_scores = {}
            
            for cls in self.classes_:
                score = np.log(self.class_probabilities[cls])
                
                for feature_idx, feature_val in enumerate(sample):
                    feature_prob = self.feature_probabilities[cls][feature_idx].get(feature_val, self.smoothing)
                    score += np.log(feature_prob)
                
                class_scores[cls] = score
            
            predictions.append(max(class_scores, key=class_scores.get))
        
        return np.array(predictions)
    
    def get_params(self, deep=True):
        return {"feature_selection_threshold": self.feature_selection_threshold, "smoothing": self.smoothing}

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self
    
    def save_model(self, filename):
        with open(filename, 'wb') as file:
            pickle.dump(self, file)
        print(f"Model saved in {filename}.")

    @staticmethod
    def load_model(filename):
        with open(filename, 'rb') as file:
            model = pickle.load(file)
        print(f"Model loaded from {filename}.")
        return model
    
    def submit(self, X, output_filename="predictions.csv"):
        predictions = self.predict(X)
        
        prediction_df = pd.DataFrame({
            'id': range(len(predictions)),
            'attack_cat': predictions
        })
        
        prediction_df.to_csv(output_filename, index=False)
        print(f"Predictions saved to {output_filename}.")
        
        return prediction_df

In [164]:
def preprocess_data(train_df, test_size=0.3, random_state=42):
    X = train_df.drop(['attack_cat', 'label'], axis=1)
    y = train_df['attack_cat']
    
    numeric_columns = X.select_dtypes(include=['number']).columns
    categorical_columns = X.select_dtypes(exclude=['number']).columns
    
    for col in numeric_columns:
        X[col] = X[col].fillna(X[col].mean())
    
    for col in categorical_columns:
        X[col] = X[col].fillna(X[col].mode()[0])
    
    scaler = StandardScaler()
    X[numeric_columns] = scaler.fit_transform(X[numeric_columns])
    
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le
    
    y_encoded = y
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, stratify=y_encoded, random_state=random_state
    )
    
    return X_train, X_test, y_train, y_test, scaler, label_encoders


In [170]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

def preprocess_test_data(test_df, scaler, label_encoders):
    X_test = test_df.copy()
    
    numeric_columns = X_test.select_dtypes(include=['number']).columns
    categorical_columns = X_test.select_dtypes(exclude=['number']).columns
    
    for col in numeric_columns:
        X_test[col] = X_test[col].fillna(X_test[col].mean())
    
    for col in categorical_columns:
        X_test[col] = X_test[col].fillna(X_test[col].mode()[0])

    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    for col in categorical_columns:
        if col in label_encoders:
            le = label_encoders[col]
            if 'unknown' not in le.classes_:
                le.classes_ = np.append(le.classes_, 'unknown')
            
            X_test[col] = X_test[col].apply(lambda x: x if x in le.classes_ else 'unknown')
            X_test[col] = le.transform(X_test[col])
            print(f"Encoded categorical column: {col}")
    
    return X_test


In [165]:
def evaluate_model(X_train, y_train, model, cv_folds=5):
    kfold = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cross_val_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    
    print(f"\nCross-Validation Accuracy (Mean): {cross_val_scores.mean() * 100:.2f}%")
    print(f"Cross-Validation Accuracy (Standard Deviation): {cross_val_scores.std() * 100:.2f}%")
    
    return cross_val_scores.mean()

X_train, X_test, y_train, y_test, scaler, label_encoders = preprocess_data(train_df)

nb = ImprovedNaiveBayes(feature_selection_threshold=0.01)
nb.fit(X_train, y_train)
nb.save_model('naive_bayes_model.pkl')
# prediction_df = nb.submit(X_test, output_filename="predictions.csv")

evaluate_model(X_train, y_train, nb, cv_folds=5)

predictions = nb.predict(X_test)

nb_sklearn = BernoulliNB()
nb_sklearn.fit(X_train, y_train)

evaluate_model(X_train, y_train, nb_sklearn, cv_folds=5)

predictions_sklearn = nb_sklearn.predict(X_test)

accuracy_custom = accuracy_score(y_test, predictions)
print(f"\nNaive Bayes kustom classification accuracy: {accuracy_custom * 100:.2f}%")
print("\nDetailed Classification Report (Custom Naive Bayes):")
print(classification_report(y_test, predictions))

accuracy_sklearn = accuracy_score(y_test, predictions_sklearn)
print(f"\nNaive Bayes from sklearn classification accuracy: {accuracy_sklearn * 100:.2f}%")
print("\nDetailed Classification Report (sklearn Naive Bayes):")
print(classification_report(y_test, predictions_sklearn))



Model saved in naive_bayes_model.pkl.

Cross-Validation Accuracy (Mean): 74.88%
Cross-Validation Accuracy (Standard Deviation): 0.32%

Cross-Validation Accuracy (Mean): 64.65%
Cross-Validation Accuracy (Standard Deviation): 0.19%

Naive Bayes kustom classification accuracy: 75.00%

Detailed Classification Report (Custom Naive Bayes):
                precision    recall  f1-score   support

      Analysis       0.57      0.18      0.27       600
      Backdoor       0.04      0.20      0.07       524
           DoS       0.31      0.61      0.41      3679
      Exploits       0.74      0.53      0.62     10018
       Fuzzers       0.59      0.68      0.63      5455
       Generic       0.98      0.98      0.98     12000
        Normal       0.98      0.82      0.89     16800
Reconnaissance       0.82      0.71      0.76      3148
     Shellcode       0.64      0.68      0.66       340
         Worms       0.35      0.64      0.45        39

      accuracy                           0.75 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       600
      Backdoor       0.00      0.00      0.00       524
           DoS       0.24      0.77      0.36      3679
      Exploits       0.67      0.39      0.49     10018
       Fuzzers       0.40      0.56      0.47      5455
       Generic       0.92      0.96      0.94     12000
        Normal       0.95      0.74      0.83     16800
Reconnaissance       0.55      0.01      0.01      3148
     Shellcode       0.00      0.00      0.00       340
         Worms       0.02      0.62      0.03        39

      accuracy                           0.64     52603
     macro avg       0.37      0.40      0.31     52603
  weighted avg       0.73      0.64      0.65     52603



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [169]:
loaded_model = ImprovedNaiveBayes.load_model('naive_bayes_model.pkl')
X_test_processed = preprocess_test_data(test_df, scaler, label_encoders)
prediction_df = loaded_model.submit(X_test_processed, output_filename="predictions.csv")

# Gunakan model yang dimuat untuk prediksi




# predictionss = loaded_model.predict(X_test)
# accuracy_custom = accuracy_score(y_test, predictionss)
# print(f"\nNaive Bayes kustom classification accuracy: {accuracy_custom * 100:.2f}%")
# print("\nDetailed Classification Report (Custom Naive Bayes):")
# print(classification_report(y_test, predictionss))

Model loaded from naive_bayes_model.pkl.
Encoded categorical column: state
Encoded categorical column: service
Encoded categorical column: proto
Predictions saved to predictions.csv.
