In [93]:
import pandas as pd
import os

# Define the file path for train1.csv
file_path = '/Users/darth/Dev/stuProj/data/GAN/train1_clean.csv'
# Define the specific number of rows to select
rows_to_select = 10000

# Load the CSV file
df = pd.read_csv(file_path)

# Select only the specified number of rows
df_selected = df.iloc[:rows_to_select]

# Save the filtered data to a new file with "_86402rows" appended to the filename
output_path = file_path.replace('.csv', '_onedayrows.csv')
df_selected.to_csv(output_path, index=False)

print(f"Processed {file_path} and saved the selected 86,402 rows to {output_path}")


Processed /Users/darth/Dev/stuProj/data/GAN/train1_clean.csv and saved the selected 86,402 rows to /Users/darth/Dev/stuProj/data/GAN/train1_clean_onedayrows.csv


In [94]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Load the one-day dataset (from previous steps) and select the columns with physical readings
file_path = '/Users/darth/Dev/stuProj/data/GAN/train1_clean_onedayrows.csv'  # Replace with actual file path
data = pd.read_csv(file_path)

# Assuming 'timestamp' is a column, drop it or keep only relevant columns for GAN training
if 'timestamp' in data.columns:
    data = data.drop(columns=['timestamp'])

# Normalize the data to the range [-1, 1] (matching Tanh output in the Generator)
scaler = MinMaxScaler(feature_range=(-1, 1))
normalized_data = scaler.fit_transform(data.values)  # Apply normalbization

# Convert the normalized data to a PyTorch tensor
tensor_data = torch.tensor(normalized_data, dtype=torch.float32).unsqueeze(1)  # Add channel dimension

# Create a TensorDataset and DataLoader
batch_size = 128  # Set your batch size here
dataset = TensorDataset(tensor_data)  # Dataset for DataLoader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Example of checking batch shapes
for batch in train_loader:
    print("Batch shape:", batch[0].shape)  # Expected: (batch_size, channels, features)
    break

Batch shape: torch.Size([128, 1, 49])


In [95]:
import torch
import torch.nn as nn
import torch.optim as optim

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Parameters
input_dim = 100               # Dimension of the input noise vector for the generator
feature_dim = 16              # Adjusted base feature size for convolutional layers
num_epochs = 3            # Number of training epochs
batch_size = 16              # Batch size
learning_rate = 0.0002        # Learning rate

# Discriminator Model
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv1d(1, feature_dim, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv1d(feature_dim, feature_dim * 2, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Conv1d(feature_dim * 2, feature_dim * 4, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Conv1d(feature_dim * 4, feature_dim * 8, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv1d(feature_dim * 8, feature_dim * 16, kernel_size=2, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv1d(feature_dim * 16, feature_dim * 32, kernel_size=2, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv1d(feature_dim * 32, 1, kernel_size=2, stride=1, padding=0)
        )

        # Initialize fc_layers with the correct input size once known
        self.fc_layers = nn.Sequential(
            nn.Linear(1, 1),  # Placeholder; will be updated based on output shape after conv layers
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_layers(x)
        #print("Shape after conv layers:", x.shape)  # Print shape to confirm
        x = x.view(x.size(0), -1)  # Flatten for the fully connected layer
        return self.fc_layers(x)

# Generator Model
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.fc_layers = nn.Sequential(
            nn.Linear(input_dim, feature_dim * 16),
            nn.ReLU(),
            nn.Linear(feature_dim * 16, feature_dim * 8),
            nn.ReLU(),
            nn.Linear(feature_dim * 8, feature_dim * 4),
            nn.ReLU()
        )
        self.deconv_layers = nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.ConvTranspose1d(feature_dim * 4, feature_dim * 2, 4, stride=2, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.ConvTranspose1d(feature_dim * 2, feature_dim, 4, stride=2, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(feature_dim, 1, kernel_size=4, stride=2, padding=1),
            nn.Tanh()  # Final layer with tanh for output in range [-1, 1]
        )

    def forward(self, x):
        x = self.fc_layers(x)
        # print("Shape after conv layers:", x.shape)  # Print shape to confirm
        x = x.view(x.size(0), -1, 1)
        return self.deconv_layers(x)

# Initialize models
discriminator = Discriminator().to(device)
generator = Generator().to(device)

# Loss and Optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy for GAN
optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizer_g = optim.Adam(generator.parameters(), lr=learning_rate, betas=(0.5, 0.999))

# Training the GAN
for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        real_data = batch[0].to(device)  # Assuming batch[0] is the actual data tensor

        # Train Discriminator
        optimizer_d.zero_grad()

        # Apply label smoothing
        real_labels = torch.full((real_data.size(0), 1), 0.9).to(device)  # Real labels as 0.9 instead of 1
        fake_labels = torch.full((real_data.size(0), 1), 0.1).to(device)  # Fake labels as 0.1 instead of 0

        # Discriminator on real data
        outputs = discriminator(real_data)
        d_loss_real = criterion(outputs, real_labels)

        # Discriminator on fake data
        noise = torch.randn(real_data.size(0), input_dim).to(device)
        fake_data = generator(noise)
        outputs = discriminator(fake_data.detach())
        d_loss_fake = criterion(outputs, fake_labels)

        # Backprop and optimize discriminator
        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_d.step()

        # Train Generator
        optimizer_g.zero_grad()
        outputs = discriminator(fake_data)
        g_loss = criterion(outputs, real_labels)  # Fool discriminator into thinking fake data is real

        # Backprop and optimize generator
        g_loss.backward()
        optimizer_g.step()

        # Print progress every 10 batches
        if i % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}], d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}")

    # Print progress
    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}")

print("Training complete. Use generator to create synthetic data.")

Epoch [1/3], Batch [1], d_loss: 1.3871, g_loss: 0.7130
Epoch [1/3], Batch [11], d_loss: 1.3815, g_loss: 0.7078
Epoch [1/3], Batch [21], d_loss: 1.3294, g_loss: 0.7065
Epoch [1/3], Batch [31], d_loss: 1.0133, g_loss: 0.8793
Epoch [1/3], Batch [41], d_loss: 0.6719, g_loss: 1.8373
Epoch [1/3], Batch [51], d_loss: 0.6620, g_loss: 2.0101
Epoch [1/3], Batch [61], d_loss: 0.6605, g_loss: 2.0532
Epoch [1/3], Batch [71], d_loss: 0.6601, g_loss: 2.0508
Epoch [2/3], Batch [1], d_loss: 0.6603, g_loss: 2.0131
Epoch [2/3], Batch [11], d_loss: 0.6615, g_loss: 2.0777
Epoch [2/3], Batch [21], d_loss: 0.6593, g_loss: 2.0896
Epoch [2/3], Batch [31], d_loss: 0.6596, g_loss: 2.0778
Epoch [2/3], Batch [41], d_loss: 0.6573, g_loss: 2.0578
Epoch [2/3], Batch [51], d_loss: 0.6583, g_loss: 2.0393
Epoch [2/3], Batch [61], d_loss: 0.6594, g_loss: 2.0784
Epoch [2/3], Batch [71], d_loss: 0.6578, g_loss: 2.0714
Epoch [3/3], Batch [1], d_loss: 0.6563, g_loss: 2.0494
Epoch [3/3], Batch [11], d_loss: 0.6567, g_loss: 2.