In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import os
import numpy as np

def load_and_sample_data(dataset_folder, total_length, normal_ratio):
    """
    Loads datasets, samples a specific ratio of normal data, and balances remaining among attack datasets.

    Args:
        dataset_folder (str): Path to the folder containing all datasets.
        total_length (int): Total length of the training dataset to be sampled.
        normal_ratio (float): Ratio of normal samples in the dataset (0.0 to 1.0).

    Returns:
        torch.Tensor: Sampled data features.
        torch.Tensor: Corresponding labels.
    """
    # Identify dataset files
    exclude_keywords = ["randomreplay", "masqueradefakenormal", "masqueradefakefault", "poisonedhighrate"]
    dataset_files = [f for f in os.listdir(dataset_folder) if f.endswith('.npy') and not any(keyword in f.lower() for keyword in exclude_keywords)]
    normal_file = [f for f in dataset_files if 'normal' in f.lower()]
    attack_files = [f for f in dataset_files if 'normal' not in f.lower()]

    if not normal_file:
        raise ValueError("No normal dataset file found in the folder.")

    # Load the normal dataset
    normal_data = np.load(os.path.join(dataset_folder, normal_file[0]))
    
    # Sample normal data
    normal_length = int(total_length * normal_ratio)
    if len(normal_data) < normal_length:
        raise ValueError(f"Not enough normal data: {len(normal_data)} samples available, {normal_length} required.")
    sampled_normal = normal_data[np.random.choice(len(normal_data), normal_length, replace=False)]
    normal_labels = np.zeros(normal_length)  # Label for normal data is 0

    # Load and sample attack data
    attack_length_per_file = int((total_length - normal_length) / len(attack_files))
    sampled_attack_data = []
    sampled_attack_labels = []

    for attack_file in attack_files:
        attack_data = np.load(os.path.join(dataset_folder, attack_file))
        if len(attack_data) < attack_length_per_file:
            raise ValueError(f"Not enough data in {attack_file}: {len(attack_data)} samples available, {attack_length_per_file} required.")
        sampled_attack = attack_data[np.random.choice(len(attack_data), attack_length_per_file, replace=False)]
        sampled_attack_data.append(sampled_attack)
        sampled_attack_labels.extend([1] * attack_length_per_file)  # Label for attack data is 1

    # Combine normal and attack data
    all_data = np.concatenate([sampled_normal] + sampled_attack_data, axis=0)
    all_labels = np.concatenate([normal_labels, sampled_attack_labels], axis=0)

    # Shuffle the combined dataset
    shuffle_indices = np.random.permutation(len(all_data))
    all_data = all_data[shuffle_indices]
    all_labels = all_labels[shuffle_indices]

    # Convert to PyTorch tensors
    return np.array(all_data, dtype=np.float32), np.array(all_labels, dtype=np.float32)



In [2]:

dataset_folder = "/home/mfaizan/programs/my_project/data/weak_classifiers/training_preprocessed"
total_length = 100000
normal_ratio = 0.5
# Create synthetic dataset
X, y = load_and_sample_data(dataset_folder, total_length, normal_ratio)
    # Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define models and parameter grids
models_params = {
    # 'Logistic Regression': (LogisticRegression(max_iter=100000), {'C': [0.1, 1, 10, 100]}),
    'Logistic Regression': (LogisticRegression(max_iter=100000), {'C': [0.1]}),
    # 'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}),
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [None], 'min_samples_split': [2]}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}),
    'SVM': (make_pipeline(StandardScaler(), SVC()), {'svc__C': [0.1], 'svc__gamma': [ 0.1]}),
    'Naive Bayes': (GaussianNB(), {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]})
}

best_models = {}

# Loop through models, perform GridSearchCV, and print out results
for name, (model, params) in models_params.items():
    grid = GridSearchCV(model, params, cv=5)
    grid.fit(X_train, y_train)
    
    # Storing the best model
    best_models[name] = grid.best_estimator_
    
    print(f"{name}:")
    print(f"Best parameters: {grid.best_params_}")
    print(f"Best training accuracy: {grid.best_score_:.3f}")
    print()  # Print a blank line for readability

# Now best_models dict contains the best model for each type
for name, model in best_models.items():
    print(f"The best model for {name} is: {model}\n")

Logistic Regression:
Best parameters: {'C': 0.1}
Best training accuracy: 1.000

Decision Tree:
Best parameters: {'max_depth': None, 'min_samples_split': 2}
Best training accuracy: 1.000

Random Forest:
Best parameters: {'max_depth': None, 'n_estimators': 10}
Best training accuracy: 1.000

SVM:
Best parameters: {'svc__C': 0.1, 'svc__gamma': 0.1}
Best training accuracy: 0.985

Naive Bayes:
Best parameters: {'var_smoothing': 1e-09}
Best training accuracy: 0.875

The best model for Logistic Regression is: LogisticRegression(C=0.1, max_iter=100000)

The best model for Decision Tree is: DecisionTreeClassifier()

The best model for Random Forest is: RandomForestClassifier(n_estimators=10)

The best model for SVM is: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=0.1, gamma=0.1))])

The best model for Naive Bayes is: GaussianNB()



In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score

def evaluate_models_on_tests(best_models, X_test_list, y_test_list, names):
    # Check if the lists are of the same length
    if not (len(X_test_list) == len(y_test_list) == len(names)):
        raise ValueError("All input lists must have the same length.")
    
    # Dictionary to hold accuracy scores
    accuracy_dict = {}

    # Evaluate each model on each test set
    for name, model in best_models.items():
        accuracies = []
        for X_test, y_test in zip(X_test_list, y_test_list):
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)
        accuracy_dict[name] = accuracies

    # Convert accuracy dictionary to DataFrame
    accuracy_df = pd.DataFrame(accuracy_dict, index=names)
    
    # Save to CSV
    # accuracy_df.to_csv('/home/mfaizan/programs/gpt2_powersystems/final_testings/base_code/basic_model_accuracies3.csv')
    
    return accuracy_df

def load_and_label_data(directory):
    data_list = []
    y_list = []
    headers = []

    for file_name in os.listdir(directory):
        if file_name.endswith('.npy'):
            # Load the numpy array
            data = np.load(os.path.join(directory, file_name))
            data_list.append(data)

            # Generate header
            headers.append(file_name.replace('_test.npy', '').replace('_', ' ').capitalize())

            # Assign label based on 'normal' presence in file name
            if 'normal' in file_name.lower():
                y_list.append(np.zeros((len(data),1)))
            else:
                y_list.append(np.ones((len(data),1)))
    
    return data_list, y_list, headers

# Example usage
directory = '/home/mfaizan/programs/my_project/data/weak_classifiers/testing_preprocessed'  # Replace with your actual directory path
Xtest_List, ytest_List, headers = load_and_label_data(directory)

# Evaluate all best models on all test sets
accuracy_results = evaluate_models_on_tests(best_models, Xtest_List, ytest_List, headers)

# Print the accuracy results
for model_name, accuracies in accuracy_results.items():
    print(f"Accuracies for {model_name}:")
    for test_name, accuracy in accuracies.items():
        print(f"{test_name}: {accuracy:.3f}")
    print()  # Print a blank line for readability


Accuracies for Logistic Regression:
Goose inversereplay: 1.000
Goose poisonedhighrate: 1.000
Goose masqueradefakenormal: 0.102
Goose randomreplay: 1.000
Goose injection: 1.000
Goose normal: 1.000
Sv highstnum: 1.000
Goose highstnum: 1.000
Sv injection: 1.000
Goose masqueradefakefault: 1.000

Accuracies for Decision Tree:
Goose inversereplay: 1.000
Goose poisonedhighrate: 1.000
Goose masqueradefakenormal: 0.000
Goose randomreplay: 1.000
Goose injection: 1.000
Goose normal: 1.000
Sv highstnum: 1.000
Goose highstnum: 1.000
Sv injection: 1.000
Goose masqueradefakefault: 1.000

Accuracies for Random Forest:
Goose inversereplay: 1.000
Goose poisonedhighrate: 1.000
Goose masqueradefakenormal: 0.008
Goose randomreplay: 0.993
Goose injection: 1.000
Goose normal: 1.000
Sv highstnum: 1.000
Goose highstnum: 1.000
Sv injection: 1.000
Goose masqueradefakefault: 0.859

Accuracies for SVM:
Goose inversereplay: 1.000
Goose poisonedhighrate: 1.000
Goose masqueradefakenormal: 0.091
Goose randomreplay: 1.