In [8]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import random

def read_csv(file_path):
    """
    Loads a CSV file into a DataFrame and reports which columns were found.
    Args:
        file_path (str): Location of the CSV file to read.
    Returns:
        pd.DataFrame: Contents of the file as parsed by `pd.read_csv`.
    """
    frame = pd.read_csv(file_path)
    column_names = list(frame.columns)
    print(f"Data loaded successfully from {file_path}. Columns: {column_names}")
    return frame

def add_noise(data, columns, snr_db):
    """
    Adds zero-mean Gaussian noise to the specified numeric columns.

    For each column the noise power is derived from the column's mean squared
    value: ``noise_power = signal_power / 10 ** (snr_db / 10)``.
    Args:
        data (pd.DataFrame): Input dataset (left unmodified).
        columns (list): Columns to which noise is added. Names that are not in
            `data`, or whose dtype is not integer/floating, are skipped with a
            message.
        snr_db (float): Signal-to-noise ratio in decibels.
    Returns:
        pd.DataFrame: Copy of `data` with noise added; noised columns are float.
    """
    noisy_data = data.copy()
    for column in columns:
        is_noisable = column in noisy_data.columns and (
            # Accept every integer/float width (not only int64/float64);
            # booleans and non-numeric columns are deliberately excluded.
            pd.api.types.is_integer_dtype(noisy_data[column])
            or pd.api.types.is_float_dtype(noisy_data[column])
        )
        if not is_noisable:
            print(f"Skipping noise addition for {column} (non-numeric or not found).")
            continue

        # Work in float so that squaring large integers (e.g. epoch
        # timestamps) cannot overflow int64 and corrupt the signal power.
        values = data[column].astype(float)
        signal_power = np.mean(values ** 2)
        noise_power = signal_power / (10 ** (snr_db / 10))
        noise = np.random.normal(0, np.sqrt(noise_power), len(data))
        noisy_data[column] = values + noise
        print(f"Added noise to {column} with SNR: {snr_db} dB")
    return noisy_data

def introduce_anomalies(data, anomalies):
    """
    Injects the requested anomalies and records them in a `True_Label` column.

    Each specification names a column by substring; the first column of `data`
    whose name contains that substring is altered over the row labels
    `start`..`end` (both ends included, as with `.loc` slicing).
    Args:
        data (pd.DataFrame): Input dataset (left unmodified).
        anomalies (list of dict): Specifications with keys 'type' ('drop' or
            'spike'), 'column', 'start', 'end' and, for spikes, 'magnitude'.
    Returns:
        pd.DataFrame: Copy of `data` with anomalies applied and `True_Label`
        set to 1 on anomalous rows and 0 elsewhere.
    """
    labelled = data.copy()
    labelled['True_Label'] = 0  # every row starts out as normal
    for spec in anomalies:
        candidates = [name for name in data.columns if spec['column'] in name]
        if not candidates:
            print(f"No matching column found for anomaly specification: {spec['column']}. Skipping.")
            continue

        target = candidates[0]  # only the first matching column is altered
        first_row = spec['start']
        last_row = spec['end']
        kind = spec['type']

        if kind == 'drop':
            # A data drop is modelled as missing samples.
            labelled.loc[first_row:last_row, target] = np.nan
            labelled.loc[first_row:last_row, 'True_Label'] = 1
            print(f"Introduced data drop anomaly in {target} from {first_row} to {last_row}.")
        elif kind == 'spike':
            # A spike is a constant offset added over the affected rows.
            labelled.loc[first_row:last_row, target] += spec['magnitude']
            labelled.loc[first_row:last_row, 'True_Label'] = 1
            print(f"Introduced spike anomaly in {target} from {first_row} to {last_row}.")
    return labelled

def preprocess_data(data):
    """
    Preprocesses the data by imputing missing values (NaNs) with column means.

    Only numeric columns have a mean, so only they are imputed; non-numeric
    columns are passed through unchanged. A column that is entirely NaN has no
    mean and therefore stays NaN.
    Args:
        data (pd.DataFrame): Input dataset (left unmodified).
    Returns:
        pd.DataFrame: New dataset with numeric NaNs replaced by column means.
    """
    # numeric_only=True is required: without it pandas >= 2.0 raises TypeError
    # as soon as the frame contains a non-numeric column.
    column_means = data.mean(numeric_only=True)
    preprocessed_data = data.fillna(column_means)
    print("Missing values have been imputed.")
    return preprocessed_data

def balance_data(data):
    """
    Undersamples every class down to the size of the rarest class.

    Classes are visited in `value_counts()` order, each is sampled with a
    fixed seed, and the combined result is shuffled with the same seed.
    Args:
        data (pd.DataFrame): Input dataset containing a `True_Label` column.
    Returns:
        pd.DataFrame: Shuffled dataset with an equal number of rows per class
        and a fresh 0..n-1 index.
    """
    labels = data['True_Label']
    counts_per_class = labels.value_counts()
    rows_per_class = counts_per_class.min()

    sampled_parts = []
    for label in counts_per_class.index:
        class_rows = data[labels == label]
        sampled_parts.append(class_rows.sample(rows_per_class, random_state=42))

    combined = pd.concat(sampled_parts)
    shuffled = combined.sample(frac=1, random_state=42)  # Shuffle the data
    shuffled = shuffled.reset_index(drop=True)
    print("Balanced the dataset to have equal representation of classes.")
    return shuffled

def simulate_predictions(data, snr):
    """
    Simulates predictions (`Label`) by randomly flipping `True_Label`.

    Each label is flipped independently with probability ``1 / (snr / 10)``,
    clamped to the range [0, 1]. A zero or negative `snr` is treated as the
    limiting case of that formula (probability 1) instead of raising
    ZeroDivisionError or producing a negative probability.
    Args:
        data (pd.DataFrame): Input dataset with a binary (0/1) `True_Label`.
        snr (float): Signal-to-noise ratio in dB, used to determine mislabeling probability.
    Returns:
        pd.DataFrame: Copy of `data` with the simulated predictions in `Label`.
    """
    simulated_data = data.copy()

    scaled_snr = snr / 10
    if scaled_snr > 0:
        # Probability of mislabeling based on SNR, capped at certainty.
        mislabel_prob = min(1.0, 1 / scaled_snr)
    else:
        mislabel_prob = 1.0

    simulated_data['Label'] = simulated_data['True_Label'].apply(
        lambda x: x if random.random() > mislabel_prob else 1 - x
    )
    print(f"Simulated predictions with mislabel probability: {mislabel_prob:.2f}")
    return simulated_data

def evaluate_classification(data):
    """
    Scores the simulated predictions against the ground truth.
    Args:
        data (pd.DataFrame): Dataset holding the true labels in `True_Label`
            and the predictions in `Label`.
    Returns:
        dict: Scores keyed by 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true, y_pred = data['True_Label'], data['Label']

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    # zero_division=1 is passed to the scorers that accept it so an undefined
    # score is reported as 1.
    for name, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        scores[name] = scorer(y_true, y_pred, zero_division=1)
    return scores

def process_all_files(input_dir, output_dir, snr, anomalies):
    """
    Processes all CSV files in the input directory, adding noise, introducing anomalies,
    preprocessing data, balancing data, simulating predictions, and evaluating classification performance.

    Each input file ``<name>.csv`` produces ``<name>_processed.csv`` in
    `output_dir`. Files are handled in sorted name order so that repeated runs
    visit them in the same sequence. The per-file metrics are averaged and
    printed at the end; if no CSV file is found a message is printed instead.
    Args:
        input_dir (str): Path to the directory containing input CSV files.
        output_dir (str): Path to save processed files (created if missing).
        snr (float): Signal-to-noise ratio in dB.
        anomalies (list of dict): List of anomaly specifications.
    """
    # exist_ok avoids failing if the directory appears between check and creation.
    os.makedirs(output_dir, exist_ok=True)

    suffix = '.csv'
    aggregate_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith(suffix):
            continue

        file_path = os.path.join(input_dir, file_name)
        # Strip only the trailing extension; str.replace would also rewrite
        # any earlier '.csv' occurring inside the file name.
        output_name = file_name[:-len(suffix)] + '_processed.csv'
        output_path = os.path.join(output_dir, output_name)

        # Load data
        data = read_csv(file_path)

        # Identify numeric columns for processing
        numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()

        # Add noise
        noisy_data = add_noise(data, numeric_columns, snr)

        # Introduce anomalies and ground truth labels
        anomalous_data = introduce_anomalies(noisy_data, anomalies)

        # Preprocess data
        preprocessed_data = preprocess_data(anomalous_data)

        # Balance data
        balanced_data = balance_data(preprocessed_data)

        # Simulate predictions with uncertainty
        simulated_data = simulate_predictions(balanced_data, snr)

        # Evaluate classification
        metrics = evaluate_classification(simulated_data)
        for key in aggregate_metrics:
            aggregate_metrics[key].append(metrics[key])

        # Save processed data
        simulated_data.to_csv(output_path, index=False)
        print(f"Processed data saved to {output_path}")

    # Averaging empty lists would only yield NaN (and a numpy warning).
    if not aggregate_metrics['accuracy']:
        print(f"No CSV files found in {input_dir}; no metrics to aggregate.")
        return

    # Calculate and print final aggregated metrics
    final_metrics = {key: np.mean(values) for key, values in aggregate_metrics.items()}
    print("\nFinal Aggregated Metrics:")
    for key, value in final_metrics.items():
        print(f"{key.capitalize()}: {value:.4f}")

if __name__ == "__main__":
    # Paths: every file ending in ".csv" in the input directory is processed
    # and a "<name>_processed.csv" result is written to the output directory.
    input_directory = "PMU_Data_with_Anomalies and Events"  # Replace with your input directory
    output_directory = "Processed_PMU_Data"

    # Parameters
    # NOTE(review): add_noise divides the signal power by 10 ** (snr / 10),
    # i.e. by 10 ** 80 at 800 dB, so the injected noise is presumably
    # negligible at this setting -- confirm that this is intended. The same
    # value also sets the simulated mislabel probability, 1 / (800 / 10) = 0.0125.
    signal_to_noise_ratio = 800  # Signal-to-noise ratio in dB
    # Anomaly specifications: 'column' is matched as a substring of the
    # dataset's column names (first match is used); 'start'/'end' are row
    # labels and both ends are included; 'magnitude' is the offset added
    # for a 'spike'.
    anomaly_list = [
        {'type': 'drop', 'column': 'Freq', 'start': 100, 'end': 200},
        {'type': 'spike', 'column': 'VA_MAG', 'start': 300, 'end': 350, 'magnitude': 0.5}
    ]

    # Process all files and print the metrics averaged over them
    process_all_files(input_directory, output_directory, signal_to_noise_ratio, anomaly_list)


Data loaded successfully from PMU_Data_with_Anomalies and Events\Bus10_Competition_Data.csv. Columns: ['TIMESTAMP', 'BUS10_VA_ANG', 'BUS10_VA_MAG', 'BUS10_VB_ANG', 'BUS10_VB_MAG', 'BUS10_VC_ANG', 'BUS10_VC_MAG', 'BUS10_IA_ANG', 'BUS10_IA_MAG', 'BUS10_IB_ANG', 'BUS10_IB_MAG', 'BUS10_IC_ANG', 'BUS10_IC_MAG', 'BUS10_Freq', 'BUS10_ROCOF', 'Event']
Added noise to TIMESTAMP with SNR: 800 dB
Added noise to BUS10_VA_ANG with SNR: 800 dB
Added noise to BUS10_VA_MAG with SNR: 800 dB
Added noise to BUS10_VB_ANG with SNR: 800 dB
Added noise to BUS10_VB_MAG with SNR: 800 dB
Added noise to BUS10_VC_ANG with SNR: 800 dB
Added noise to BUS10_VC_MAG with SNR: 800 dB
Added noise to BUS10_IA_ANG with SNR: 800 dB
Added noise to BUS10_IA_MAG with SNR: 800 dB
Added noise to BUS10_IB_ANG with SNR: 800 dB
Added noise to BUS10_IB_MAG with SNR: 800 dB
Added noise to BUS10_IC_ANG with SNR: 800 dB
Added noise to BUS10_IC_MAG with SNR: 800 dB
Added noise to BUS10_Freq with SNR: 800 dB
Added noise to BUS10_ROCOF wi

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

def plot_time_series(original, noisy, anomalies, column):
    """
    Plots the original and noisy signals along with highlighted anomalies.

    An anomaly is highlighted when its 'column' entry occurs in the name of
    the plotted column. Anomaly specifications in this notebook name columns
    by substring (e.g. 'Freq' for 'BUS10_Freq'), so an exact comparison would
    never highlight them; exact names still match as before.
    Args:
        original (pd.DataFrame): Dataset before noise was added.
        noisy (pd.DataFrame): Dataset after noise was added.
        anomalies (list of dict): Specifications with 'column', 'start', 'end'.
        column: Name of the column to plot from both datasets.
    """
    plt.figure(figsize=(12, 6))
    plt.plot(original.index, original[column], label='Original', alpha=0.7)
    plt.plot(noisy.index, noisy[column], label='Noisy', alpha=0.7)

    column_name = str(column)
    legend_entry_added = False
    for anomaly in anomalies:
        if anomaly['column'] not in column_name:
            continue
        # Only the first span gets a legend entry; labels starting with an
        # underscore are left out of the legend by matplotlib.
        span_label = '_nolegend_' if legend_entry_added else 'Anomaly'
        plt.axvspan(anomaly['start'], anomaly['end'], color='red', alpha=0.3, label=span_label)
        legend_entry_added = True

    plt.xlabel("Time")
    plt.ylabel(column)
    plt.title(f"Time Series Plot with Noise and Anomalies ({column})")
    plt.legend()
    plt.show()

def plot_accuracy_vs_snr(snrs, accuracies):
    """
    Draws classification accuracy as a function of SNR and shows the figure.
    Args:
        snrs: SNR values in dB, used as the x coordinates.
        accuracies: Accuracy obtained at each SNR, used as the y coordinates.
    """
    _, axes = plt.subplots(figsize=(8, 5))
    axes.plot(snrs, accuracies, marker='o', linestyle='-', color='b')
    axes.set_xlabel("Signal-to-Noise Ratio (dB)")
    axes.set_ylabel("Accuracy")
    axes.set_title("Classification Accuracy vs. SNR")
    axes.grid(True)
    plt.show()

def process_file(file_path, snr, anomalies):
    """
    Processes a single file: Adds noise, introduces anomalies, evaluates classification.

    Steps: read the CSV, add Gaussian noise to every numeric column, apply the
    anomaly specifications while recording them in `True_Label`, impute NaNs
    with column means, simulate predictions by randomly flipping labels, and
    plot the first numeric column (if there is one).
    Args:
        file_path (str): Path to the CSV file.
        snr (float): Signal-to-noise ratio in dB.
        anomalies (list of dict): Specifications with keys 'type' ('drop' or
            'spike'), 'column', 'start', 'end' and, for spikes, 'magnitude'.
            'column' may be an exact column name or a substring of one.
    Returns:
        float: Accuracy of the simulated predictions against `True_Label`.
    """
    # Read data
    data = pd.read_csv(file_path)
    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()

    # Add noise
    noisy_data = data.copy()
    for column in numeric_columns:
        signal_power = np.mean(data[column] ** 2)
        noise_power = signal_power / (10 ** (snr / 10))
        noise = np.random.normal(0, np.sqrt(noise_power), len(data))
        noisy_data[column] += noise

    # Introduce anomalies
    anomalous_data = noisy_data.copy()
    anomalous_data['True_Label'] = 0
    for anomaly in anomalies:
        wanted = anomaly['column']
        if wanted in data.columns:
            target = wanted
        else:
            # Specifications such as 'Freq' name a column by substring
            # (e.g. 'BUS6_Freq'); an exact lookup alone would never match
            # and no anomaly would be injected.
            partial = [col for col in data.columns if wanted in str(col)]
            if not partial:
                print(f"No matching column found for anomaly specification: {wanted}. Skipping.")
                continue
            target = partial[0]

        start, end = anomaly['start'], anomaly['end']
        if anomaly['type'] == 'drop':
            anomalous_data.loc[start:end, target] = np.nan
            anomalous_data.loc[start:end, 'True_Label'] = 1
        elif anomaly['type'] == 'spike':
            anomalous_data.loc[start:end, target] += anomaly['magnitude']
            anomalous_data.loc[start:end, 'True_Label'] = 1

    # Fill NaN values; numeric_only=True keeps this working on pandas >= 2.0
    # when the file also contains non-numeric columns.
    preprocessed_data = anomalous_data.fillna(anomalous_data.mean(numeric_only=True))

    # Simulate predictions with uncertainty. The flip probability is clamped
    # to [0, 1]; a non-positive SNR is treated as probability 1 rather than
    # dividing by zero or going negative.
    scaled_snr = snr / 10
    mislabel_prob = min(1.0, 1 / scaled_snr) if scaled_snr > 0 else 1.0
    preprocessed_data['Label'] = preprocessed_data['True_Label'].apply(
        lambda x: x if random.random() > mislabel_prob else 1 - x
    )

    # Evaluate classification
    accuracy = accuracy_score(preprocessed_data['True_Label'], preprocessed_data['Label'])

    # Plot time series for one column (skipped when nothing is numeric)
    if numeric_columns:
        plot_time_series(data, noisy_data, anomalies, numeric_columns[0])

    return accuracy

if __name__ == "__main__":
    # Absolute path to a single PMU CSV file on the author's machine.
    input_file = r"C:\Users\cscpr\Desktop\PAPER\ANOMALY DETECTION CONFERENCE 4\SGSMA_Competiton 2024_PMU_DATA\SGSMA_Competition Day_Testdata\Competition_Testing Data Set 1\Bus6_Competition_Data.csv" # Replace with actual file
    # The file is re-read and re-processed once per SNR value; process_file
    # derives the mislabel probability as 1 / (snr / 10), i.e. 0.0125 at 800
    # and 0.1 at 100.
    snr_values = [800, 600, 400, 200, 100]  # Different SNR levels for testing
    accuracies = []  # accuracy returned for each entry of snr_values, in order
    
    # Anomaly specifications handed to process_file: 'start'/'end' are row
    # labels (both ends included) and 'magnitude' is the offset added for a
    # 'spike'.
    anomaly_list = [
        {'type': 'drop', 'column': 'Freq', 'start': 100, 'end': 200},
        {'type': 'spike', 'column': 'VA_MAG', 'start': 300, 'end': 350, 'magnitude': 0.5}
    ]
    
    # Each call also displays a time-series plot for that SNR level.
    for snr in snr_values:
        acc = process_file(input_file, snr, anomaly_list)
        accuracies.append(acc)
    
    # Summarise the sweep as a single accuracy-vs-SNR curve.
    plot_accuracy_vs_snr(snr_values, accuracies)
