# Journal Classification Analysis

This notebook performs classification analysis on journal data using various machine learning models. It includes:
- Data loading and preprocessing
- Multiple classification model evaluation
- Performance metrics calculation
- Learning curve visualization

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import ast
from sklearn.model_selection import StratifiedKFold, cross_val_score, LearningCurveDisplay, ShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [2]:
def load_data(target: str, csv_path: str) -> tuple:
    """
    Load and preprocess data from a CSV file.

    The CSV must contain an integer ``label`` column. Every other cell is
    parsed with ``ast.literal_eval``; the parsed cells of a row are stacked
    and flattened into a single feature vector. An ``Unnamed: 0`` column
    (a saved pandas index), if present, is dropped.

    Labels are decoded per digit:
      * unit digit: 0 (not using PQC), 1 (Kyber), 2 (ML-KEM)
      * tens digit: browser, e.g. 10 (Firefox), 20 (Chrome)
      * hundreds and above: OS, e.g. 100 (Linux), 200 (Windows), 400 (MacOS)

    :param target: One of 'all', 'pqc', 'browser', 'os', or 'algo'.
        'browser', 'os' and 'algo' drop samples whose unit digit is 0.
    :param csv_path: Path to the CSV file to load.
    :return: Tuple (data, labels) of numpy arrays, or (None, None) when the
        target is unknown or the CSV cannot be read.
    """
    # Validate the target once, before any I/O, instead of once per row.
    if target not in ('all', 'pqc', 'browser', 'os', 'algo'):
        print(f"Unknown target: {target}")
        return None, None

    try:
        data = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error loading CSV at '{csv_path}': {e}")
        return None, None

    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    labels = np.asarray(data.pop('label').values)

    # Log label counts
    unique, counts = np.unique(labels, return_counts=True)
    print("Label distribution:", dict(zip(unique, counts)))

    # 'browser', 'os' and 'algo' only use samples with unit digit 1 or 2;
    # 'pqc' and 'all' keep every sample.
    if target in ('browser', 'os', 'algo'):
        keep = labels % 10 != 0
    else:
        keep = np.ones(len(labels), dtype=bool)

    rows = data.to_numpy()[keep]
    y = labels[keep]

    # Each cell is a literal (assumed to have the same shape across a row so
    # the cells can be stacked -- TODO confirm against the data format).
    X = np.array([
        np.array([ast.literal_eval(cell) for cell in row]).flatten()
        for row in rows
    ])

    print(f"Total samples: {len(y)}")

    # Process labels based on target
    if target == 'pqc':
        # Binary classification: 0 (Not Using PQC) vs 1 (Using PQC)
        y_proc = (y % 10 != 0).astype(int)
    elif target == 'browser':
        # Extract browser info from tens digit: 10 (Firefox), 20 (Chrome)
        y_proc = ((y // 10) % 10) * 10
    elif target == 'os':
        # Extract OS info from hundreds digit: 100 (Linux), 200 (Windows), 400 (MacOS)
        y_proc = (y // 100) * 100
    elif target == 'algo':
        # Extract algorithm info from unit digit: 1 (Kyber), 2 (ML-KEM)
        y_proc = y % 10
    else:
        # 'all': keep original labels
        y_proc = y

    return X, y_proc

In [3]:
def run_and_organize(data, idx, labels, model, model_names, results):
    """
    Evaluate one model with stratified 10-fold cross-validation and record it.

    All five metrics (accuracy, weighted precision/recall/F1 and one-vs-rest
    ROC AUC) are computed in a single cross-validation pass, so every metric
    is measured on the same fitted models and each fold is fitted only once.

    Args:
        data: Feature matrix passed to the estimator.
        idx: Position of ``model`` in ``model_names``.
        labels: Target labels aligned with ``data``.
        model: Estimator to evaluate.
        model_names: Sequence of display names; ``model_names[idx]`` is used.
        results: DataFrame with the columns Model, Accuracy, Precision,
            Recall, F1-score, AUC. A row is appended in place.

    Returns:
        None
    """
    # Local import: the file's import cell does not bring in cross_validate.
    from sklearn.model_selection import cross_validate

    # Order matches the result columns after 'Model'.
    metrics = [
        'accuracy',
        'precision_weighted',
        'recall_weighted',
        'f1_weighted',
        'roc_auc_ovr',
    ]

    skf = StratifiedKFold(n_splits=10)
    scores = cross_validate(model, data, labels, cv=skf, scoring=metrics)

    row = [model_names[idx]]
    for metric in metrics:
        # With a list of scorer names, results are keyed as 'test_<name>'.
        fold_scores = scores[f'test_{metric}']
        row.append(f"{fold_scores.mean():.2f} +/- {fold_scores.std():.2f}")

    results.loc[len(results)] = row

In [4]:
def run_models(target: str, csv_path: str) -> None:
    """
    Runs multiple classification models on a given dataset.

    This function loads the data for the specified target, label-encodes the
    labels, evaluates each classifier via ``run_and_organize`` (in
    alphabetical order of model name), and saves the collected results to
    ``docker-res-<target>.csv`` in the current working directory.

    Args:
        target (str): The label to classify; forwarded to ``load_data``.
        csv_path (str): Path to the CSV file to load.

    Returns:
        None
    """
    # Load and encode data
    data, labels = load_data(target, csv_path)
    if data is None or labels is None:
        # load_data already reported the reason (bad path or unknown target).
        print(f"Skipping target '{target}': data could not be loaded.")
        return
    labels = LabelEncoder().fit_transform(labels)

    # Estimators paired with their display names
    named_models = [
        ('Random Forest', RandomForestClassifier()),
        ('XGBoost', XGBClassifier()),
        ('Logistic Regression', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('MLP', MLPClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('AdaBoost', AdaBoostClassifier()),
        ('Gradient Boosting', GradientBoostingClassifier()),
    ]

    # Sort by name only, so estimators are never compared with each other.
    named_models.sort(key=lambda pair: pair[0])
    model_names = tuple(name for name, _ in named_models)
    models = tuple(model for _, model in named_models)

    # Prepare results storage
    results = pd.DataFrame(
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC']
    )

    # Run each model
    for idx, model in enumerate(models):
        run_and_organize(data, idx, labels, model, model_names, results)

    # Save to CSV
    results.to_csv(f'docker-res-{target}.csv')


## Run the Analysis

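Set your parameters below and run the analysis. The available targets are:
- 'pqc': Post-quantum cryptography classification (binary: not using PQC vs. using PQC)
- 'algo': Algorithm classification (Kyber vs. ML-KEM; samples not using PQC are excluded)
- 'tuple': Tuple classification (listed here, but `load_data` does not handle this target and reports "Unknown target")
- 'all': Classification over the original, combined labels
- 'browser': Browser classification (samples not using PQC are excluded)
- 'os': Operating system classification (samples not using PQC are excluded)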

In [5]:
# Set your parameters here
# Targets handled by load_data: 'pqc', 'algo', 'all', 'browser', 'os'
targets = ['pqc', 'algo', 'all', 'browser']
csv_path = 'C:\\Users\\Eylon\\PQC\\tdl\\pqc-paob-docker-20.csv'  # Your CSV file path

# Run the analysis. run_models writes each target's results to
# 'docker-res-<target>.csv' and returns None, so nothing is captured here.
for target in targets:
    run_models(target, csv_path)

#

Label distribution: {110: 100, 111: 100, 112: 100, 120: 100, 121: 100, 122: 98}
Total samples: 598
Label distribution: {110: 100, 111: 100, 112: 100, 120: 100, 121: 100, 122: 98}
Total samples: 398
Label distribution: {110: 100, 111: 100, 112: 100, 120: 100, 121: 100, 122: 98}
Total samples: 598
Label distribution: {110: 100, 111: 100, 112: 100, 120: 100, 121: 100, 122: 98}
Total samples: 398
