In [None]:
%%capture
%pip install qiskit torch scikit-learn pandas numpy matplotlib jupyter qiskit-machine-learning scipy qiskit-algorithms

In [None]:
from qiskit_algorithms.utils import algorithm_globals
import pandas as pd
import torch
import numpy as np
from torch import nn
from qiskit.circuit.library import EfficientSU2, RealAmplitudes, ZZFeatureMap
from qiskit.circuit import QuantumCircuit
from sklearn.preprocessing import MinMaxScaler
from qiskit.primitives import Sampler
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.neural_networks import SamplerQNN
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import entropy



In [None]:
# The column names are assigned by hand in a later cell, i.e. the file carries no header
# row. header=None stops pandas from consuming the first record as column labels.
dataset = pd.read_csv('./KDDTrain+.txt', header=None)

In [None]:
dataset.head()

In [None]:
# Column names for the loaded file, in file order (43 entries). The last two entries are
# the label ('outcome') and 'level'; the cells below assign these names to `dataset`.
columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot'
,'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations'
,'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate'
,'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count'
,'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate'
,'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','outcome','level'])

In [None]:
dataset.columns = columns

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
dataset.describe().style.background_gradient(cmap='Blues').set_properties(**{'font-family':'Segoe UI'})

In [None]:
# Binarise the label: every outcome other than 'normal' becomes 'attack'.
# Rows that already read 'normal' are left exactly as they are.
dataset.loc[dataset['outcome'].ne('normal'), 'outcome'] = 'attack'

In [None]:
def pie_plot(df, cols_list, rows, cols):
    """Draw one pie chart of value counts per column on a rows x cols grid.

    Args:
        df: DataFrame holding the columns to plot.
        cols_list: column names to plot; paired with the grid cells in row-major
            order. Surplus cells stay empty, surplus columns are ignored.
        rows: number of grid rows.
        cols: number of grid columns.
    """
    # squeeze=False always yields a 2-D array of Axes, so a 1x1 grid (where
    # plt.subplots would otherwise return a bare Axes) works as well.
    # The figure size is set once here instead of on every individual plot call.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 15), squeeze=False)
    for ax, col in zip(axes.ravel(), cols_list):
        df[col].value_counts().plot(ax=ax, kind='pie', fontsize=10, autopct='%1.0f%%')
        ax.set_title(str(col), fontsize=12)
    plt.show()

In [None]:
pie_plot(dataset, ['protocol_type', 'outcome'], 1, 2)

In [None]:
from sklearn.preprocessing import RobustScaler

def Scaling(df_num, cols):
    """Fit a RobustScaler on `df_num` and return the scaled values as a DataFrame.

    Args:
        df_num: numeric DataFrame (or 2-D array-like) to scale.
        cols: column labels for the returned frame.

    Returns:
        DataFrame of scaled values labelled with `cols`. When `df_num` has an index
        it is carried over, so index-aligned assignments back into the source frame
        match row for row even if that index is not 0..n-1.
    """
    robust_scaler = RobustScaler()
    scaled_values = robust_scaler.fit_transform(df_num)
    # Plain arrays have no index; pandas then falls back to the default RangeIndex.
    return pd.DataFrame(scaled_values, columns=cols, index=getattr(df_num, "index", None))

In [None]:
cat_cols = ['is_host_login','protocol_type','service','flag','land', 'logged_in','is_guest_login', 'level', 'outcome']
def preprocess(dataframe):
    """Scale the numeric columns and one-hot encode the nominal ones.

    Every column not listed in `cat_cols` is scaled with `Scaling`; the columns
    'protocol_type', 'service' and 'flag' are one-hot encoded. The remaining
    `cat_cols` entries (including 'level' and 'outcome') pass through unchanged.

    The input frame is left untouched, so re-running the calling cell on the raw
    data is safe.

    Args:
        dataframe: raw DataFrame containing all `cat_cols` plus numeric columns.

    Returns:
        A new DataFrame: pass-through columns first, then the scaled numeric
        columns, then the one-hot columns produced by `pd.get_dummies`.
    """
    df_num = dataframe.drop(cat_cols, axis=1)
    num_cols = df_num.columns
    scaled_df = Scaling(df_num, num_cols)
    # Match rows by position: the assignment below aligns on the index, which
    # must therefore be the same as the source frame's.
    scaled_df.index = dataframe.index

    # Non-inplace drop returns a fresh frame, so the caller's data is not mutated.
    result = dataframe.drop(labels=num_cols, axis="columns")
    result[num_cols] = scaled_df[num_cols]

    return pd.get_dummies(result, columns=['protocol_type', 'service', 'flag'])

In [None]:
dataset = preprocess(dataset)

In [None]:
# Feature matrix: every preprocessed column except the label.
# NOTE(review): 'level' is not dropped here, so it is part of X — confirm that it is
# meant to be a model input.
X = dataset.drop(columns=['outcome']).values

# Labels as a plain array of 'normal' / 'attack' strings.
y = dataset['outcome'].values
scaler = MinMaxScaler()

# Rescale each feature column with the MinMaxScaler (fitted on the full dataset).
X = scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

# Number of principal components kept; later cells use the PCA output width as the
# number of features (and qubits).
ncomp=4

# Apply PCA to the dataset
pca = PCA(n_components=ncomp)  # keep `ncomp` principal components
X_pca = pca.fit_transform(X)
# Re-fits the same MinMaxScaler instance on the PCA output; the fit on the raw
# features from the previous cell is overwritten.
X_pca = scaler.fit_transform(X_pca)

# Print the explained variance ratio to see how much variance is captured by each component
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Convert the PCA result to a DataFrame for better visualization
pca_df = pd.DataFrame(data=X_pca, columns=[f'Principal Component {i+1}' for i in range(ncomp)])
pca_df['Outcome'] = y

# Display the first few rows of the PCA result
pca_df.head()

In [None]:
X = X_pca

In [None]:
outlier_dataT = X[y == 'attack']
#benign_data_testT = X[y == 'normal']

In [None]:
benign_dataT = X[y == 'normal']

In [None]:
# Number of benign samples used for training (taken after the test slice, see next cell).
train_size=8000
# Number of samples taken from EACH class (attack and benign) for evaluation.
test_size=2000
# Number of federated devices, i.e. QGAN instances trained in parallel.
n_device=4


# Evaluation data: the first `test_size` attack rows and the first `test_size` benign rows.
outlier_data = outlier_dataT[:test_size]
#outlier_data2 = outlier_dataT[500:1000]
benign_data_test = benign_dataT[:test_size]
#benign_data_test2 = benign_dataT[500:1000]

# float32 tensors, the form in which the evaluation data is fed to the discriminators.
outlier_tensor = torch.tensor(outlier_data, dtype=torch.float32)
benign_tensor = torch.tensor(benign_data_test, dtype=torch.float32)

#outlier_tensor2 = torch.tensor(outlier_data2, dtype=torch.float32)
#benign_tensor2 = torch.tensor(benign_data_test2, dtype=torch.float32)

In [None]:
benign_data = benign_dataT[test_size:train_size+test_size]

In [None]:
# Set the random seeds for reproducibility.
# algorithm_globals only covers the Qiskit-side random numbers (e.g. the initial circuit
# weights); PyTorch draws the discriminator initialisation and the generator noise from
# its own generator, which has to be seeded separately.
algorithm_globals.random_seed = 123456
torch.manual_seed(algorithm_globals.random_seed);

In [None]:
# Define number of qubits based on the number of features in your data
num_features = benign_data.shape[1]
print(num_features)
num_qubits = num_features

In [None]:
# Base circuit shared by all generators: Hadamard layer -> ZZFeatureMap -> EfficientSU2.
qc = QuantumCircuit(num_qubits)
qc.h(qc.qubits)  # Apply Hadamard to create superposition
# NOTE(review): ZZFeatureMap presumably starts with its own Hadamard layer; if so, the
# layer above cancels against it — confirm this is intended.

feature_map = ZZFeatureMap(
        feature_dimension=num_features,
        entanglement='full'  # pattern of entanglement
    )

# Variational part of the circuit, with 6 repetitions.
ansatz = EfficientSU2(num_qubits, reps=6)
# Append both sub-circuits to `qc` in place: feature map first, then the ansatz.
qc.compose(feature_map, inplace=True)
qc.compose(ansatz, inplace=True)

In [None]:
qc.num_parameters

In [None]:
# Sampler to evaluate the quantum circuit
shots = 10000

# One Sampler per device, each with its own seed (base seed + device index);
# create_generator(i) picks up sampler[i].
sampler=[]
for i in range (n_device):
    sampler.append(Sampler(options={"shots": shots, "seed": algorithm_globals.random_seed + i}))


In [None]:
from qiskit.circuit import ParameterVector, QuantumCircuit  # Import ParameterVector

# Define the quantum generator using TorchConnector
def create_generator(i) -> TorchConnector:
    """Build the quantum generator for device `i` as a trainable PyTorch module.

    A copy of the shared circuit `qc` gets one extra RX rotation per qubit, appended
    after everything already in `qc` and driven by the `input_params` vector (the
    noise input). The circuit is wrapped in a SamplerQNN that uses `sampler[i]`, and
    that network is wrapped in a TorchConnector.

    Args:
        i: device index; selects the sampler from the module-level `sampler` list.

    Returns:
        TorchConnector around the SamplerQNN, initialised with random weights.
    """

    # Specify the input_params to match the noise dimension
    input_params = ParameterVector("input_params", num_features)

    # Modify the circuit to include input_params
    qc_with_input = qc.copy()
    for j in range(num_features):
        qc_with_input.rx(input_params[j], j)  # Example: Apply rotation based on input_params

    # NOTE(review): the slice below assumes the `input_params` entries come first in
    # `qc_with_input.parameters` — TODO confirm the ordering Qiskit guarantees.
    # NOTE(review): the feature map's own parameters are part of `qc`, so they presumably
    # end up in weight_params (trained, not fed with data) — confirm this is intended.
    # NOTE(review): no `interpret` function is passed; check in the SamplerQNN docs whether
    # `output_shape` is honoured in that case. train_qgan reads 2**num_features outputs.
    qnn = SamplerQNN(
        circuit=qc_with_input,  # Use the modified circuit
        sampler=sampler[i],
        input_params=input_params,
        weight_params=qc_with_input.parameters[num_features:], # weight params are all params except input_params
        sparse=False,
        output_shape=num_features
    )
    initial_weights = algorithm_globals.random.random(qc_with_input.num_parameters - num_features) # initial weights are for weight params only
    return TorchConnector(qnn, initial_weights)
   

#generator = create_generator()
# Create one quantum generator per federated device (n_device QGAN instances)
generators = [create_generator(i) for i in range(n_device)]

In [None]:
class Discriminator(nn.Module):
    """Small fully connected binary classifier used as the GAN discriminator.

    Architecture: Linear(input_size -> 32), LeakyReLU(0.2), Linear(32 -> 1), Sigmoid.
    The output is one score in (0, 1) per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_units = 32
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),
        ]
        # Kept under the attribute name `model` so the state_dict keys stay the same.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the discriminator score for each row of `x`, shape (batch, 1)."""
        scores = self.model(x)
        return scores

# Create a discriminator
#discriminator = Discriminator(input_size=num_features)
discriminators = [Discriminator(input_size=num_features) for _ in range(n_device)]

In [None]:

from torch.optim import Adam

lrd = 0.0006  # discriminator learning rate
lrg = 0.001  # generator learning rate
b1 = 0.7  # first Adam beta (passed as betas[0])
b2 = 0.999  # second Adam beta (passed as betas[1])

# One Adam optimiser per generator and per discriminator, all with weight_decay=0.005.
optimizers_g = [Adam(generator.parameters(), lr=lrg, betas=(b1, b2), weight_decay=0.005) for generator in generators]
optimizers_d = [Adam(discriminator.parameters(), lr=lrd, betas=(b1, b2), weight_decay=0.005) for discriminator in discriminators]

# Binary Cross-Entropy loss function
criterion = nn.BCELoss()

# Target labels for real and generated samples (hard 1 / 0 targets; no label smoothing
# is applied here).
real_label = 1
fake_label = 0

In [None]:
def compare_distributions_federated(real_data_federated, generated_data_federated, feature_names, instance_idx=None):
    """
    Plot per-feature histograms of real versus generated data for a federated setup.

    Args:
    - real_data_federated: per-instance real data (sequence of equally shaped 2-D
      arrays/tensors, or one tensor, which is converted to numpy first)
    - generated_data_federated: list of numpy arrays, one per QGAN instance
    - feature_names: labels used as subplot titles; the first `ncomp` entries are read
    - instance_idx: index of the instance to plot, or None to plot the element-wise
      average over all instances (for both real and generated data)

    One subplot is drawn for each of the first `ncomp` columns (module-level `ncomp`).
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    if torch.is_tensor(real_data_federated):
        real_data_federated = real_data_federated.numpy()

    #print(real_data_federated.shape)
    #print(generated_data_federated.shape)

    # Select the two data sets and their titles in a single branch.
    if instance_idx is None:
        generated_data = sum(generated_data_federated) / len(generated_data_federated)
        title = "Federated (Averaged) Generated Data"
        real_data = sum(real_data_federated) / len(real_data_federated)
        title1 = "Federated (Averaged) Real Data"
    else:
        generated_data = generated_data_federated[instance_idx]
        title = f"Generated Data: Instance {instance_idx + 1}"
        real_data = real_data_federated[instance_idx]
        title1 = f"Real Data: Instance {instance_idx + 1}"

    fig, axes = plt.subplots(1, ncomp, figsize=(20, 4))
    fig.suptitle(f'Data Distribution: {title1} vs {title}', fontsize=16)

    # (data, colour, legend label) for the two overlaid histograms, real data first.
    overlays = ((real_data, 'blue', 'Real Data'), (generated_data, 'red', 'Generated Data'))
    for component in range(ncomp):
        panel = axes[component]
        for values, colour, legend_label in overlays:
            sns.histplot(values[:, component], kde=True, color=colour, alpha=0.5, ax=panel, label=legend_label)
        panel.set_title(feature_names[component])
        panel.legend()

    plt.tight_layout()
    plt.show()


# One label per principal component actually kept (`ncomp` of them), so the list cannot
# drift out of sync with the PCA step or run short when `ncomp` is increased.
feature_names = [f'Principal Component {i + 1}' for i in range(ncomp)]



def compare_statistical_properties_federated(real_data, generated_data_federated):
    """
    Print summary statistics comparing real and generated data for a federated setup.

    Args:
    - real_data: either ONE 2-D array/tensor of real samples (rows = samples,
      columns = features), or a list/tuple (or stacked 3-D array/tensor) of equally
      shaped per-instance arrays/tensors, which are averaged element-wise
    - generated_data_federated: list of equally shaped 2-D arrays/tensors, one per
      QGAN instance; they are averaged element-wise for the "Averaged" figures

    Prints the per-feature mean, standard deviation and Wasserstein distance for the
    averaged generated data, then the same figures for every individual instance.
    Returns None.
    """
    from scipy.stats import wasserstein_distance

    def _as_array(data):
        # Work on plain ndarrays so the NumPy/SciPy calls below treat tensors and
        # arrays alike.
        if torch.is_tensor(data):
            return data.detach().cpu().numpy()
        return np.asarray(data)

    generated_arrays = [_as_array(generated) for generated in generated_data_federated]
    # Compute average generated data from all instances
    generated_data_avg = sum(generated_arrays) / len(generated_arrays)

    if isinstance(real_data, (list, tuple)):
        # Per-instance shards: element-wise average across instances.
        real_arrays = [_as_array(shard) for shard in real_data]
        real_data = sum(real_arrays) / len(real_arrays)
    else:
        real_data = _as_array(real_data)
        if real_data.ndim == 3:
            # Stacked shards (instance, sample, feature): average over the instances.
            real_data = real_data.mean(axis=0)
        # A 2-D array is already one data set and is used as is; averaging over its
        # rows would collapse it to 1-D and break the per-feature loops below.

    n_features = real_data.shape[1]

    print("\nStatistical Comparisons (Federated):")
    print("\nMean:")
    print("Real:      ", np.mean(real_data, axis=0))
    print("Generated (Averaged):   ", np.mean(generated_data_avg, axis=0))

    print("\nStandard Deviation:")
    print("Real:      ", np.std(real_data, axis=0))
    print("Generated (Averaged):   ", np.std(generated_data_avg, axis=0))

    print("\nWasserstein Distance for Each Feature:")
    for i in range(n_features):
        dist = wasserstein_distance(real_data[:, i], generated_data_avg[:, i])
        print(f"Feature {i}: {dist}")

    # Optionally compare individual instances
    for idx, generated_data in enumerate(generated_arrays):
        print(f"\n--- Instance {idx + 1} ---")
        print("Mean: ", np.mean(generated_data, axis=0))
        print("Std Dev: ", np.std(generated_data, axis=0))
        for i in range(n_features):
            dist = wasserstein_distance(real_data[:, i], generated_data[:, i])
            print(f"Feature {i}: Wasserstein Distance = {dist}")


In [None]:
# Lists to store losses for plotting (one per QGAN instance)
generator_loss_values_federated = [[] for _ in range(n_device)]
discriminator_loss_values_real_federated = [[] for _ in range(n_device)]
discriminator_loss_values_fake_federated = [[] for _ in range(n_device)]

title = "Federated QGAN Training Progress"

def plot_training_progress_federated():
    """Plot the device-averaged generator and discriminator losses per epoch.

    Reads the module-level loss histories (one list per device), averages them over
    the `n_device` devices for every recorded epoch, draws the three curves on one
    axis, saves the figure as `title + '.png'` and shows it. Prints "NO" and returns
    without plotting while any device has fewer than two recorded generator losses.
    """
    # Check if we have enough data for plotting
    for device_losses in generator_loss_values_federated:
        if len(device_losses) < 2:
            print("NO")
            return

    def _mean_over_devices(per_device_losses):
        # Epoch count is taken from device 0; each epoch is averaged over all devices.
        epochs_recorded = len(per_device_losses[0])
        return [
            sum(per_device_losses[device][epoch] for device in range(n_device)) / n_device
            for epoch in range(epochs_recorded)
        ]

    # Compute average losses across all instances for plotting
    avg_generator_loss = _mean_over_devices(generator_loss_values_federated)
    avg_discriminator_loss_real = _mean_over_devices(discriminator_loss_values_real_federated)
    avg_discriminator_loss_fake = _mean_over_devices(discriminator_loss_values_fake_federated)

    # Plot the average losses
    fig, ax1 = plt.subplots(1, 1, figsize=(6, 6))
    ax1.set_title(title)

    curves = (
        (avg_generator_loss, "Generator Loss (Avg)", "royalblue"),
        (avg_discriminator_loss_real, "Discriminator Loss Real (Avg)", "green"),
        (avg_discriminator_loss_fake, "Discriminator Loss Fake (Avg)", "red"),
    )
    for values, curve_label, curve_colour in curves:
        ax1.plot(list(values), label=curve_label, color=curve_colour)

    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend()

    # Save the plot
    plt.savefig(title + '.png')

    # Show the plot
    plt.show()



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define thresholds to test
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

def evaluate_with_threshold_federated(threshold, benign_tensor, outlier_tensor, local_discriminators):
    """
    Evaluate models at a given threshold for a federated setup.

    A sample is predicted benign (class 1) when a discriminator's score exceeds
    `threshold`; benign samples carry label 1 and outliers label 0.

    Args:
    - threshold: Threshold to classify predictions.
    - benign_tensor: Tensor of benign data samples.
    - outlier_tensor: Tensor of outlier data samples.
    - local_discriminators: List of local discriminator models.

    Returns:
    - (accuracy, precision, recall, f1, metrics_per_model). The first four values are
      computed over the concatenated predictions of ALL models (pooled, not an
      ensemble vote). metrics_per_model holds, per model, the tuple
      (overall accuracy, benign accuracy, outlier accuracy, precision, recall, f1).
      Metrics that are undefined for a given threshold are reported as 0.
    """
    # Ground truth is the same for every model, so it is built once.
    benign_labels = torch.ones(benign_tensor.shape[0])
    outlier_labels = torch.zeros(outlier_tensor.shape[0])
    model_labels = torch.cat((benign_labels, outlier_labels))

    # Store metrics for each local model
    metrics_per_model = []
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for discriminator in local_discriminators:
            # Scores are flattened to 1-D so that predictions and labels share a shape.
            benign_predictions_binary = (discriminator(benign_tensor).reshape(-1) > threshold).float()
            outlier_predictions_binary = (discriminator(outlier_tensor).reshape(-1) > threshold).float()
            model_predictions = torch.cat((benign_predictions_binary, outlier_predictions_binary))

            # Compute metrics for this model
            benign_accuracy = accuracy_score(benign_labels, benign_predictions_binary)
            outlier_accuracy = accuracy_score(outlier_labels, outlier_predictions_binary)
            overall_accuracy = accuracy_score(model_labels, model_predictions)
            # zero_division=0 on every ratio metric: extreme thresholds routinely produce
            # "no positive predictions", which is reported as 0 rather than as a warning.
            precision = precision_score(model_labels, model_predictions, zero_division=0)
            recall = recall_score(model_labels, model_predictions, zero_division=0)
            f1 = f1_score(model_labels, model_predictions, zero_division=0)

            metrics_per_model.append((overall_accuracy, benign_accuracy, outlier_accuracy, precision, recall, f1))

            # Append predictions and labels for federated aggregation
            all_predictions.append(model_predictions)
            all_labels.append(model_labels)

    # Federated evaluation: pool the predictions from all models
    combined_predictions = torch.cat(all_predictions)
    combined_labels = torch.cat(all_labels)

    federated_overall_accuracy = accuracy_score(combined_labels, combined_predictions)
    federated_precision = precision_score(combined_labels, combined_predictions, zero_division=0)
    federated_recall = recall_score(combined_labels, combined_predictions, zero_division=0)
    federated_f1 = f1_score(combined_labels, combined_predictions, zero_division=0)

    return federated_overall_accuracy, federated_precision, federated_recall, federated_f1, metrics_per_model





In [None]:
# Convert benign_data into a torch tensor
benign_data_tensor = torch.tensor(benign_data, dtype=torch.float32)


from multiprocessing import Process, Manager
from copy import deepcopy

# Training parameters
num_epochs = 80
batch_size = 64
# Number of basis states sampled from the circuit; train_qgan reads this many generator
# outputs per sample.
num_elements = 2 ** num_features

# Separate datasets for federated training: device i gets the i-th contiguous slice of
# the benign training data and of the outlier data.
datasets = [
    (
        benign_data_tensor[i * len(benign_data_tensor) // n_device : (i + 1) * len(benign_data_tensor) // n_device],
        outlier_tensor[i * len(outlier_tensor) // n_device : (i + 1) * len(outlier_tensor) // n_device]
    )
    for i in range(n_device)
]
# Per-device benign shards used for the distribution plots. They are taken from
# `datasets` so that they are exactly the partitions the devices train on (a separate
# np.array_split partitions differently when the sample count is not divisible by
# n_device).
split_data = [benign_shard for benign_shard, _ in datasets]

def _marginal_zero_probabilities(probabilities, num_features, num_elements):
    """Return, per sample, the probability that bit i of the basis-state index is 0.

    Column k of `probabilities` is read as the probability of basis-state index k
    (0 <= k < num_elements). Output column i is the sum of all columns whose index
    has bit i cleared — i.e. the slices [start, start + 2**i) for
    start in range(0, num_elements, 2**(i + 1)) — computed as one matrix product.
    Gradients flow through the result when `probabilities` requires grad.
    """
    bit_is_zero = torch.tensor(
        [
            [1.0 if (state >> bit) & 1 == 0 else 0.0 for bit in range(num_features)]
            for state in range(num_elements)
        ],
        dtype=probabilities.dtype,
        device=probabilities.device,
    )
    marginals = probabilities[:, :num_elements] @ bit_is_zero
    return marginals.to(torch.get_default_dtype())


def train_qgan(index, datasets, generators, discriminators, optimizers_g, optimizers_d, results, num_features, num_elements, batch_size, num_epochs, criterion, real_label, fake_label, num_generated=None):
    """Run one local training pass (one sweep over the device's data) for QGAN `index`.

    For every mini-batch the discriminator is updated on real data and on detached
    generator samples, then the generator is updated to make the discriminator label
    its samples as real. Generator outputs (one value per basis state) are reduced to
    `num_features` values per sample with `_marginal_zero_probabilities`.

    Args:
        index: device index; selects the entry from each of the list arguments.
        datasets: list of (benign_tensor, outlier_tensor) pairs; only the benign
            tensor is used for training.
        generators, discriminators: per-device models.
        optimizers_g, optimizers_d: per-device optimisers for those models.
        results: dict-like; `results[index]` is set to a dict with keys
            "generator" / "discriminator" (deep-copied state dicts), "generated"
            (numpy array of generated samples) and "lossg" / "lossr" / "lossf"
            (losses of the LAST mini-batch, as floats).
        num_features: width of the noise input and of the reduced samples.
        num_elements: number of generator outputs read per sample.
        batch_size: mini-batch size.
        num_epochs: unused here; kept for call compatibility.
        criterion: loss applied to discriminator outputs and label tensors.
        real_label, fake_label: target values for real and generated samples.
        num_generated: number of samples to generate for the "generated" entry;
            defaults to the notebook-level `test_size`.

    Raises:
        ValueError: if the device's benign dataset is empty.
    """
    benign_data, _ = datasets[index]
    generator = generators[index]
    discriminator = discriminators[index]
    optimizer_g = optimizers_g[index]
    optimizer_d = optimizers_d[index]

    if len(benign_data) == 0:
        # Without a single batch there would be no losses to report further down.
        raise ValueError(f"device {index} received no benign training data")
    if num_generated is None:
        num_generated = test_size

    for j in range(0, len(benign_data), batch_size):
        real_data = benign_data[j:j + batch_size]
        batch_size_real = real_data.size(0)
        label_real = torch.full((batch_size_real,), real_label, dtype=torch.float32)
        label_fake = torch.full((batch_size_real,), fake_label, dtype=torch.float32)

        # Discriminator training on real data
        optimizer_d.zero_grad()
        output_real = discriminator(real_data).view(-1)
        loss_real = criterion(output_real, label_real)
        loss_real.backward()

        # Discriminator training on generated data (detached: no generator gradients)
        noise = torch.randn(batch_size_real, num_features)
        fake_data = _marginal_zero_probabilities(generator(noise).detach(), num_features, num_elements)
        output_fake = discriminator(fake_data).view(-1)
        loss_fake = criterion(output_fake, label_fake)
        loss_fake.backward()
        optimizer_d.step()

        # Generator training: same noise, gradients kept, targets are the REAL label
        optimizer_g.zero_grad()
        fake_data = _marginal_zero_probabilities(generator(noise), num_features, num_elements)
        output_gen = discriminator(fake_data).view(-1)
        loss_gen = criterion(output_gen, label_real)
        loss_gen.backward()
        optimizer_g.step()

    # Only the last mini-batch's losses are reported.
    generator_loss_values = loss_gen.item()
    discriminator_loss_values_real = loss_real.item()
    discriminator_loss_values_fake = loss_fake.item()

    with torch.no_grad():
        # Generate samples with the trained generator of this instance
        noise = torch.randn(num_generated, num_features)
        generated_samples = generator(noise).detach()
        generated_data = _marginal_zero_probabilities(generated_samples, num_features, num_elements).numpy()

    results[index] = {
        "generator": deepcopy(generator.state_dict()),
        "discriminator": deepcopy(discriminator.state_dict()),
        "generated": generated_data,
        "lossg": generator_loss_values,
        "lossr": discriminator_loss_values_real,
        "lossf": discriminator_loss_values_fake,
    }

In [None]:
%%time

manager = Manager()
results = manager.dict()

for epoch in range(num_epochs):
    processes = []
    for i in range(len(datasets)): #n_device
        process = Process(
            target=train_qgan,
            args=(i, datasets, generators, discriminators, optimizers_g, optimizers_d, results, num_features, num_elements, batch_size, num_epochs, criterion, real_label, fake_label)
        )
        processes.append(process)
        process.start()

    for process in processes:
        process.join()

    generated_data_federated=[]
    for i in range(n_device):
        generated_data_federated.append(results[i]["generated"]) 
        generator_loss_values_federated[i].append(results[i]["lossg"])
        discriminator_loss_values_real_federated[i].append(results[i]["lossr"])
        discriminator_loss_values_fake_federated[i].append(results[i]["lossf"])

    compare_distributions_federated(split_data, generated_data_federated, feature_names)
    #compare_statistical_properties_federated(split_data, generated_data_federated)
    plot_training_progress_federated()

    # Execution loop for thresholds
    for threshold in thresholds:
        federated_overall_accuracy, federated_precision, federated_recall, federated_f1, metrics_per_model = evaluate_with_threshold_federated(
            threshold, benign_tensor, outlier_tensor, discriminators
        )

        print(f"Threshold: {threshold:.1f} | Federated Accuracy: {federated_overall_accuracy:.4f} | Precision: {federated_precision:.4f} | Recall: {federated_recall:.4f} | F1 Score: {federated_f1:.4f}")

        # Print per-model metrics
        for model_idx, (overall, benign_acc, outlier_acc, precision, recall, f1) in enumerate(metrics_per_model):
            print(f"  Model {model_idx + 1}: Overall Accuracy: {overall:.4f} | Benign Accuracy: {benign_acc:.4f} | Outlier Accuracy: {outlier_acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")
    

    avg_loss_gen = sum([generator_loss_values_federated[i][epoch] for i in range(n_device)]) / n_device
    avg_loss_real = sum([discriminator_loss_values_real_federated[i][epoch] for i in range(n_device)]) / n_device
    avg_loss_fake = sum([discriminator_loss_values_fake_federated[i][epoch] for i in range(n_device)]) / n_device

    # Print the averaged losses for the current epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}] - Avg Last Loss D real: {avg_loss_real:.4f}, Avg Last Loss D fake: {avg_loss_fake:.4f}, Avg Last Loss G: {avg_loss_gen:.4f}")

    with torch.no_grad():
        avg_generator_weights = {
            k: sum(results[i]["generator"][k] for i in range(len(datasets))) / len(datasets) for k in results[0]["generator"]
        }
        avg_discriminator_weights = {
            k: sum(results[i]["discriminator"][k] for i in range(len(datasets))) / len(datasets) for k in results[0]["discriminator"]
        }

        for generator, discriminator in zip(generators, discriminators):
            generator.load_state_dict(avg_generator_weights)
            discriminator.load_state_dict(avg_discriminator_weights)

    


In [None]:
'''
# TODO: repeat the whole experiment with different initial seeds and compute mean and std dev
score = evaluate_model(global_model)  # Valuta il modello federato
results.append((run, seed_client, score))  # Salva il risultato
mean_score = np.mean(df_results["Score"])
std_dev = np.std(df_results["Score"])
print(f"Media: {mean_score:.4f}, Deviazione standard: {std_dev:.4f}")

# TODO: use more data
# TODO: review the whole code
'''