In [2]:
import pandas as pd

import torch

from sklearn.preprocessing import MinMaxScaler

from torch.utils.data import DataLoader, TensorDataset



# # Load your data

# data_path = '/kaggle/input/gan3-training-data/train4_clean.csv'  # Update with the actual path

# data = pd.read_csv(data_path)

# data = data.drop(['Attack'], axis = 1)


# # Remove the timestamp if it's in your data

# if 'timestamp' in data.columns:

#     data = data.drop(columns=['timestamp'])



# # Normalize the data

# scaler = MinMaxScaler(feature_range=(-1, 1))

# normalized_data = scaler.fit_transform(data.values)



# # Convert to PyTorch tensor

# tensor_data = torch.tensor(normalized_data, dtype=torch.float32).unsqueeze(1)  # Add channel dimension

# train_loader = DataLoader(TensorDataset(tensor_data), batch_size=64, shuffle=True)


In [3]:
#------------------------------------------
# Processing Data for GAN training
#------------------------------------------
import pandas as pd
import torch

# Load the selected datasets
train_set_1 = pd.read_csv('/kaggle/input/gan3-training-data/train1_clean.csv')
train_set_2 = pd.read_csv('/kaggle/input/gan3-training-data/train4_clean.csv')

print(train_set_2.shape)

# Step 1: Find common columns
common_columns = train_set_1.columns.intersection(train_set_2.columns)
print(f"Common columns between datasets: {common_columns}")
print(f"There are {len(common_columns)}")

# Filter both datasets to keep only the common columns
train_set_1 = train_set_1[common_columns]
train_set_2 = train_set_2[common_columns]

# print(f"{train_set_1.head()}")
# print(f"{train_set_2.head()}")
train_set_2.shape

(86401, 43)
Common columns between datasets: Index(['P1_B2016', 'P1_FCV03D', 'P1_FCV03Z', 'P1_FT01', 'P1_FT01Z', 'P1_FT02',
       'P1_FT02Z', 'P1_FT03', 'P1_FT03Z', 'P1_LIT01', 'P1_PCV02D', 'P1_PCV02Z',
       'P1_PIT01', 'P1_PIT02', 'P1_TIT01', 'P1_TIT02', 'P2_24Vdc', 'P2_AutoGO',
       'P2_SIT01', 'P2_TripEx', 'P3_FIT01', 'P3_LCP01D', 'P3_LCV01D',
       'P3_LIT01', 'P3_PIT01', 'P4_HT_FD', 'P4_HT_PO', 'P4_ST_FD', 'P4_ST_GOV',
       'P4_ST_LD', 'P4_ST_PO', 'P4_ST_PT01', 'P4_ST_TT01'],
      dtype='object')
There are 33


(86401, 33)

In [4]:
# Normalize the data

scaler = MinMaxScaler(feature_range=(-1, 1))

normalized_data = scaler.fit_transform(train_set_2.values)



# Convert to PyTorch tensor

tensor_data = torch.tensor(normalized_data, dtype=torch.float32).unsqueeze(1)  # Add channel dimension

train_loader = DataLoader(TensorDataset(tensor_data), batch_size=64, shuffle=True)

In [5]:
import torch

import torch.nn as nn

import torch.optim as optim

import pandas as pd

from sklearn.preprocessing import MinMaxScaler



# Set device to GPU if available

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# Parameters

input_dim = 100               # Dimension of the input noise vector for the generator

feature_dim = 16              # Adjusted base feature size for convolutional layers

num_epochs = 50            # Number of training epochs for longer training

batch_size = 32               # Batch size

generator_lr = 0.0003         # Increased learning rate for the generator

discriminator_lr = 0.0001     # Decreased learning rate for the discriminator

noise_std_dev = 0.01          # Standard deviation for noise added to real data



# Discriminator Model

class Discriminator(nn.Module):

    def __init__(self):

        super(Discriminator, self).__init__()

        self.conv_layers = nn.Sequential(

            nn.Conv1d(1, feature_dim, kernel_size=4, stride=2, padding=1),

            nn.LeakyReLU(0.2),

            nn.Conv1d(feature_dim, feature_dim * 2, kernel_size=4, stride=2, padding=1),

            nn.LeakyReLU(0.2),

            nn.Dropout(0.5),

            nn.Conv1d(feature_dim * 2, feature_dim * 4, kernel_size=3, stride=2, padding=1),

            nn.LeakyReLU(0.2),

            nn.Dropout(0.5),

            nn.Conv1d(feature_dim * 4, feature_dim * 8, kernel_size=3, stride=2, padding=1),

            nn.LeakyReLU(0.2),

            nn.Conv1d(feature_dim * 8, feature_dim * 16, kernel_size=2, stride=2, padding=1),

            nn.LeakyReLU(0.2),

            nn.Conv1d(feature_dim * 16, feature_dim * 32, kernel_size=2, stride=2, padding=1),

            nn.LeakyReLU(0.2),

            nn.Conv1d(feature_dim * 32, 1, kernel_size=2, stride=1, padding=0)

        )

        self.fc_layers = nn.Sequential(

            nn.Linear(1, 1),

            nn.Sigmoid()

        )



    def forward(self, x):

        x = self.conv_layers(x)

        x = x.view(x.size(0), -1)

        return self.fc_layers(x)



# Generator Model

class Generator(nn.Module):

    def __init__(self):

        super(Generator, self).__init__()

        self.fc_layers = nn.Sequential(

            nn.Linear(input_dim, feature_dim * 16),

            nn.ReLU(),

            nn.Linear(feature_dim * 16, feature_dim * 8),

            nn.ReLU(),

            nn.Linear(feature_dim * 8, feature_dim * 4),

            nn.ReLU()

        )

        self.deconv_layers = nn.Sequential(

            nn.Upsample(scale_factor=2),

            nn.ConvTranspose1d(feature_dim * 4, feature_dim * 2, 4, stride=2, padding=1),

            nn.ReLU(),

            nn.Upsample(scale_factor=2),

            nn.ConvTranspose1d(feature_dim * 2, feature_dim, 4, stride=2, padding=1),

            nn.ReLU(),

            nn.ConvTranspose1d(feature_dim, 1, kernel_size=4, stride=2, padding=1),

            nn.Tanh()  # Output in range [-1, 1]

        )



    def forward(self, x):

        x = self.fc_layers(x)

        x = x.view(x.size(0), -1, 1)

        return self.deconv_layers(x)



# Initialize models

discriminator = Discriminator().to(device)

generator = Generator().to(device)



# Loss and Optimizer

criterion = nn.BCELoss()  # Binary Cross Entropy for GAN

optimizer_d = optim.Adam(discriminator.parameters(), lr=discriminator_lr, betas=(0.5, 0.999))

optimizer_g = optim.Adam(generator.parameters(), lr=generator_lr, betas=(0.5, 0.999))



# Training the GAN with added noise to real data for regularization

for epoch in range(num_epochs):

    for i, batch in enumerate(train_loader):

        real_data = batch[0].to(device)



        # Train Discriminator

        optimizer_d.zero_grad()



        # Add slight noise to real data to regularize the discriminator

        real_data_noisy = real_data + torch.normal(0, noise_std_dev, real_data.shape).to(device)



        # Real and fake labels

        real_labels = torch.full((real_data.size(0), 1), 0.9).to(device)  # Label smoothing for real data

        fake_labels = torch.full((real_data.size(0), 1), 0.1).to(device)  # Label smoothing for fake data



        # Discriminator on real data with noise

        outputs = discriminator(real_data_noisy)

        d_loss_real = criterion(outputs, real_labels)



        # Discriminator on fake data

        noise = torch.randn(real_data.size(0), input_dim).to(device)

        fake_data = generator(noise)

        outputs = discriminator(fake_data.detach())

        d_loss_fake = criterion(outputs, fake_labels)



        # Backprop and optimize discriminator

        d_loss = d_loss_real + d_loss_fake

        d_loss.backward()

        optimizer_d.step()



        # Train Generator

        optimizer_g.zero_grad()

        outputs = discriminator(fake_data)

        g_loss = criterion(outputs, real_labels)  # Fool discriminator into thinking fake data is real



        # Backprop and optimize generator

        g_loss.backward()

        optimizer_g.step()



        # Print progress every 10 batches
        if i % 1000 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}], d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}")

        # if (epoch + 1) % 10 == 0:
    
        #     print(f"Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}")



print("Training complete. Use generator to create synthetic data.")


Epoch [1/50], Batch [1], d_loss: 1.3991, g_loss: 0.7896
Epoch [1/50], Batch [1001], d_loss: 1.2667, g_loss: 0.9779
Epoch [2/50], Batch [1], d_loss: 1.3028, g_loss: 0.7847
Epoch [2/50], Batch [1001], d_loss: 1.3365, g_loss: 0.7805
Epoch [3/50], Batch [1], d_loss: 1.2634, g_loss: 0.7876
Epoch [3/50], Batch [1001], d_loss: 1.0178, g_loss: 1.3465
Epoch [4/50], Batch [1], d_loss: 0.9522, g_loss: 1.2348
Epoch [4/50], Batch [1001], d_loss: 0.9421, g_loss: 1.4525
Epoch [5/50], Batch [1], d_loss: 0.8441, g_loss: 1.9325
Epoch [5/50], Batch [1001], d_loss: 0.8354, g_loss: 1.6994
Epoch [6/50], Batch [1], d_loss: 0.8919, g_loss: 1.7740
Epoch [6/50], Batch [1001], d_loss: 0.7530, g_loss: 1.7965
Epoch [7/50], Batch [1], d_loss: 0.7779, g_loss: 1.9778
Epoch [7/50], Batch [1001], d_loss: 0.6791, g_loss: 1.8916
Epoch [8/50], Batch [1], d_loss: 0.7421, g_loss: 2.2674
Epoch [8/50], Batch [1001], d_loss: 0.7313, g_loss: 1.9498
Epoch [9/50], Batch [1], d_loss: 0.6855, g_loss: 2.0632
Epoch [9/50], Batch [100

In [5]:
# Generate synthetic data for three days

samples_per_day = 86201  # Update based on your data

days_to_generate = 3

total_samples = samples_per_day * days_to_generate



noise = torch.randn(total_samples, input_dim).to(device)

synthetic_data = generator(noise).detach().cpu().numpy()

synthetic_data_reshaped = synthetic_data.reshape(total_samples, -1)



# Save to CSV

synthetic_df = pd.DataFrame(synthetic_data_reshaped)

synthetic_df.to_csv("synthetic_data.csv", index=False)



print("Synthetic data saved as 'synthetic_data.csv'")


Synthetic data saved as 'synthetic_data.csv'


In [7]:
import pandas as pd



# Load the second train set (train_set_2) as a DataFrame

train_set_2_path = 'G:/stuProj/data/train4_clean.csv'  # Update with the actual path

train_set_2 = pd.read_csv(train_set_2_path)



# Ensure only relevant columns (assumes synthetic data has 32 columns)

train_set_2 = train_set_2[train_set_2.columns[:32]]  # Adjust column count as needed



# Reshape synthetic data and convert to DataFrame

synthetic_data_reshaped = synthetic_data.reshape(synthetic_data.shape[0], -1)

synthetic_data_df = pd.DataFrame(synthetic_data_reshaped, columns=train_set_2.columns)



# Concatenate synthetic data to train_set_2

combined_data = pd.concat([train_set_2, synthetic_data_df], ignore_index=True)

print("Combined dataset shape:", combined_data.shape)


Combined dataset shape: (86701, 32)


In [9]:
# Create two three-day subsets from the original train_set_2

samples_per_day = 100  # Adjust based on your data

subset_1 = train_set_2.iloc[:samples_per_day * 3]  # First three days

subset_2 = train_set_2.iloc[samples_per_day * 3:samples_per_day * 6]  # Second three days



# The third set is the synthetic three-day data

subset_3 = synthetic_data_df


In [11]:
from sklearn.preprocessing import MinMaxScaler



# Define a MinMaxScaler instance

scaler = MinMaxScaler()



# Normalize each subset

normalized_subset_1 = pd.DataFrame(scaler.fit_transform(subset_1), columns=subset_1.columns)

normalized_subset_2 = pd.DataFrame(scaler.fit_transform(subset_2), columns=subset_2.columns)

normalized_subset_3 = pd.DataFrame(scaler.fit_transform(subset_3), columns=subset_3.columns)


In [13]:
from scipy.stats import ks_2samp



# Function to calculate K-S statistics between each pair of datasets

def compute_ks_statistics(df1, df2):

    ks_results = []

    for column in df1.columns:

        ks_stat, p_value = ks_2samp(df1[column], df2[column])

        ks_results.append({

            'sensor': column,

            'ks_statistic': ks_stat,

            'p_value': p_value,

            'passes': ks_stat < 0.15 and p_value > 0.03  # Define pass criteria

        })

    return pd.DataFrame(ks_results)



# Compare normalized subsets

ks_results_real_real = compute_ks_statistics(normalized_subset_1, normalized_subset_2)

ks_results_real_synthetic = compute_ks_statistics(normalized_subset_1, normalized_subset_3)

ks_results_real_synthetic_2 = compute_ks_statistics(normalized_subset_2, normalized_subset_3)



# Count the number of sensors that pass for each comparison

passes_real_real = ks_results_real_real['passes'].sum()

passes_real_synthetic = ks_results_real_synthetic['passes'].sum()

passes_real_synthetic_2 = ks_results_real_synthetic_2['passes'].sum()



print("Number of sensors passing K-S test (real-real):", passes_real_real)

print("Number of sensors passing K-S test (real-synthetic 1):", passes_real_synthetic)

print("Number of sensors passing K-S test (real-synthetic 2):", passes_real_synthetic_2)


Number of sensors passing K-S test (real-real): 12
Number of sensors passing K-S test (real-synthetic 1): 5
Number of sensors passing K-S test (real-synthetic 2): 4


In [15]:
print("\nDetailed K-S Test Results (Real-Real):")

print(ks_results_real_real)



print("\nDetailed K-S Test Results (Real-Synthetic 1):")

print(ks_results_real_synthetic)



print("\nDetailed K-S Test Results (Real-Synthetic 2):")

print(ks_results_real_synthetic_2)



Detailed K-S Test Results (Real-Real):
        sensor  ks_statistic        p_value  passes
0     P1_B2016      0.106667   6.580850e-02    True
1    P1_FCV03D      0.136667   7.311227e-03   False
2    P1_FCV03Z      0.550000   4.911614e-42   False
3      P1_FT01      0.290000   1.598960e-11   False
4     P1_FT01Z      0.290000   1.598960e-11   False
5      P1_FT02      0.810000   5.295808e-99   False
6     P1_FT02Z      0.810000   5.295808e-99   False
7      P1_FT03      0.646667   2.030859e-59   False
8     P1_FT03Z      0.650000   4.348910e-60   False
9     P1_LIT01      0.213333   2.166454e-06   False
10   P1_PCV02D      0.000000   1.000000e+00    True
11   P1_PCV02Z      0.916667  1.634256e-135   False
12    P1_PIT01      0.243333   3.324740e-08   False
13    P1_PIT02      0.606667   9.173810e-52   False
14     P1_PP04      0.996667  8.881787e-177   False
15   P1_PP04SP      0.000000   1.000000e+00    True
16    P1_TIT01      0.383333   4.914991e-20   False
17    P1_TIT02      0.07