In [42]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sentence_transformers import SentenceTransformer

In [40]:
df = pd.read_json('../dat/egpaugmented_6x100.json')

In [41]:
sentences = [df['augmented_examples'].iloc[0][0]]
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
embeddings = model.encode(sentences)

In [56]:
sentences = []
labels = []

for _, row in df[5:6].iterrows():
    positive_examples = row['augmented_examples']
    negative_examples = row['augmented_negative_examples']

    for sentence in positive_examples:
        sentences.append(sentence)
        labels.append(1)  # Label for positive examples

    for sentence in negative_examples:
        sentences.append(sentence)
        labels.append(0)  # Label for negative examples

# Generate embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
embeddings = model.encode(sentences)


Create a data loader.

In [57]:
class SentenceDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Creating the dataset and dataloader
dataset = SentenceDataset(embeddings, labels)
# Total size of the dataset
total_size = len(dataset)

# Specify the size of the training and validation sets
train_size = int(0.8 * total_size)  # 80% for training
val_size = total_size - train_size  # Remaining 20% for validation

# Split the dataset
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)


Let's define our network.

In [58]:
class FeedforwardNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(FeedforwardNN, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        output = self.sigmoid(self.fc2(hidden))
        return output

# Initialize the network
input_dim = embeddings.shape[1]  # Size of the sentence embeddings
hidden_dim = 128
model = FeedforwardNN(input_dim, hidden_dim)

Now we can train the network with the sentence embeddings.

In [59]:
# Loss and Optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training Loop
for epoch in range(100):  # Number of epochs
    model.train()  # Set the model to training mode
    for inputs, targets in train_dataloader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets.float())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validation phase
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_steps = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets.float())
            val_loss += loss.item()
            val_steps += 1

            # Calculate accuracy
            predicted = outputs.round()  # Assuming a binary classification
            total += targets.size(0)
            correct += (predicted.squeeze() == targets).sum().item()

    avg_val_loss = val_loss / val_steps
    accuracy = correct / total
    
    print(f'Epoch [{epoch+1}/10], Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}')

Epoch [1/10], Loss: 0.6959, Validation Loss: 0.6941, Accuracy: 0.5000
Epoch [2/10], Loss: 0.6912, Validation Loss: 0.6916, Accuracy: 0.5250
Epoch [3/10], Loss: 0.6882, Validation Loss: 0.6899, Accuracy: 0.5000
Epoch [4/10], Loss: 0.6834, Validation Loss: 0.6868, Accuracy: 0.5000
Epoch [5/10], Loss: 0.6810, Validation Loss: 0.6847, Accuracy: 0.5250
Epoch [6/10], Loss: 0.6805, Validation Loss: 0.6830, Accuracy: 0.5250
Epoch [7/10], Loss: 0.6761, Validation Loss: 0.6795, Accuracy: 0.5250
Epoch [8/10], Loss: 0.6688, Validation Loss: 0.6745, Accuracy: 0.5000
Epoch [9/10], Loss: 0.6572, Validation Loss: 0.6680, Accuracy: 0.5250
Epoch [10/10], Loss: 0.6538, Validation Loss: 0.6644, Accuracy: 0.5000
Epoch [11/10], Loss: 0.6432, Validation Loss: 0.6580, Accuracy: 0.5500
Epoch [12/10], Loss: 0.6421, Validation Loss: 0.6546, Accuracy: 0.5250
Epoch [13/10], Loss: 0.6343, Validation Loss: 0.6487, Accuracy: 0.5250
Epoch [14/10], Loss: 0.6238, Validation Loss: 0.6421, Accuracy: 0.5500
Epoch [15/10], 