# 2. Baseline Metrics

In [3]:
!pip install torch transformers sentencepiece

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Running on: {device}")

Running on: cuda


In [4]:
# Define dataset path
# NOTE(review): the path lives under /content, so it presumably expects the
# repository to have been cloned there beforehand — no visible cell does that.
dataset_dir = "/content/meta-semantic-research/data/COGS"

# Fail early with an actionable message instead of a per-file error.
if not os.path.isdir(dataset_dir):
    raise FileNotFoundError(
        f"COGS data directory not found: {dataset_dir!r}. "
        "Download or clone the dataset before running this cell."
    )


def load_cogs_split(split_name):
    """Read ``<split_name>.tsv`` from ``dataset_dir`` as a two-column DataFrame.

    Args:
        split_name: File stem of the split, e.g. ``"train"`` or ``"gen"``.

    Returns:
        DataFrame with columns ``Sentence`` and ``LogicalForm`` taken from the
        first two tab-separated fields of every line.

    Raises:
        FileNotFoundError: If the split file does not exist.
    """
    return pd.read_csv(
        os.path.join(dataset_dir, f"{split_name}.tsv"),
        sep="\t",
        header=None,
        # Keep only the first two fields. Without this, a file with more
        # columns than `names` makes pandas use the leading column as the
        # index, silently shifting the sentence out of the "Sentence" column.
        # NOTE(review): upstream COGS TSVs carry a third column — verify
        # against this copy of the data.
        usecols=[0, 1],
        names=["Sentence", "LogicalForm"],
    )


# Load datasets
train_df = load_cogs_split("train")
dev_df = load_cogs_split("dev")
test_df = load_cogs_split("test")
gen_df = load_cogs_split("gen")

# Display dataset sample
train_df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/meta-semantic-research/data/COGS/train.tsv'

In [None]:
from transformers import AutoTokenizer

# Fetch the pretrained "t5-small" tokenizer through the Auto* factory.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Encode one sample sentence as PyTorch tensors to sanity-check the tokenizer.
example_sentence = "The dog chased the cat."
tokens = tokenizer(example_sentence, return_tensors="pt")

# Only the token-id tensor of the encoding is shown.
print("Tokenized Output:", tokens["input_ids"])

In [None]:
from collections import Counter

def build_vocab(sentences):
    """Build a token -> index vocabulary from whitespace-tokenised strings.

    Indices 0-3 are reserved for ``<unk>``, ``<pad>``, ``<sos>`` and ``<eos>``.
    Corpus tokens are numbered from 4 upward in descending frequency order
    (tokens with equal counts keep first-seen order).

    Args:
        sentences: Iterable of strings; each is split on whitespace.

    Returns:
        dict mapping every token to a unique index in ``range(len(vocab))``.
    """
    special_tokens = ("<unk>", "<pad>", "<sos>", "<eos>")
    vocab = {token: idx for idx, token in enumerate(special_tokens)}

    vocab_counter = Counter()
    for sentence in sentences:
        vocab_counter.update(sentence.split())

    for word, _ in vocab_counter.most_common():
        # A corpus token that spells a special token keeps the reserved index.
        # Numbering it separately would leave a gap, making the largest index
        # >= len(vocab) and therefore out of range for an embedding table
        # sized by len(vocab).
        if word not in vocab:
            vocab[word] = len(vocab)
    return vocab

# Example usage
# The same `vocab` is later used to index both the sentences and the logical
# forms, so it has to be built from both columns; otherwise logical-form-only
# tokens all collapse to <unk>.
sentences = train_df["Sentence"].tolist()
train_logical_forms = train_df["LogicalForm"].tolist()
vocab = build_vocab(sentences + train_logical_forms)

# Show the size and a small sample rather than dumping the whole mapping.
print("Vocabulary size:", len(vocab))
print("First entries:", dict(list(vocab.items())[:10]))



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class COGSDataset(Dataset):
    """Map-style dataset yielding ``(sentence_ids, logical_form_ids)`` pairs.

    Each row of ``df`` is read positionally: column 0 is the sentence and
    column 1 the logical form. Both strings are split on whitespace and mapped
    through ``vocab``; unknown tokens become ``vocab["<unk>"]``. No ``<sos>`` /
    ``<eos>`` markers are added and sequences are returned unpadded.

    Args:
        df: DataFrame whose first two columns hold the text pairs.
        tokenizer: Stored on the instance but not used by this class.
        vocab: dict mapping token -> integer id; must contain ``"<unk>"``.
    """

    def __init__(self, df, tokenizer, vocab):
        self.data = df
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def _encode(self, text):
        """Convert a whitespace-tokenised string to a 1-D int64 id tensor."""
        unk_id = self.vocab["<unk>"]
        ids = [self.vocab.get(token, unk_id) for token in text.split()]
        # Explicit dtype: torch.tensor([]) would otherwise be float32, which
        # an embedding lookup rejects.
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded (sentence, logical form) pair at row ``idx``."""
        sentence = self.data.iloc[idx, 0]
        logical_form = self.data.iloc[idx, 1]
        return self._encode(sentence), self._encode(logical_form)

# Create DataLoaders
batch_size = 8


def pad_collate(batch):
    """Collate variable-length (sentence, logical_form) pairs into two tensors.

    The dataset returns unpadded 1-D id tensors, which the default collate
    function cannot stack when lengths differ. Each side is right-padded with
    ``vocab["<pad>"]`` to the longest sequence in the batch.

    Args:
        batch: List of ``(sentence_ids, logical_form_ids)`` tensor pairs.

    Returns:
        Tuple of two ``(batch, max_len)`` tensors (batch-first, matching the
        ``batch_first=True`` LSTM used later in the notebook).
    """
    pad_id = vocab["<pad>"]
    sentence_ids, logical_form_ids = zip(*batch)
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    return (
        pad_sequence(list(sentence_ids), batch_first=True, padding_value=pad_id),
        pad_sequence(list(logical_form_ids), batch_first=True, padding_value=pad_id),
    )


train_dataset = COGSDataset(train_df, tokenizer, vocab)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pad_collate,
)


In [None]:
import torch.nn as nn
import torch.optim as optim

class Seq2SeqLSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> linear layer over the last time step.

    ``forward`` returns ONE log-probability vector per input sequence, of
    shape ``(batch, output_dim)``; it does not decode an output sequence.
    NOTE(review): the name suggests sequence-to-sequence decoding, but only
    ``output[:, -1, :]`` is projected. Targets with one id per token will not
    match this shape — confirm the intended architecture.

    Args:
        input_dim: Size of the input vocabulary (embedding rows).
        output_dim: Number of output classes.
        emb_dim: Embedding width.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers.
    """

    def __init__(self, input_dim, output_dim, emb_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        # Sub-modules are created on the default device; the caller places the
        # whole model with ``model.to(...)``, so no notebook-level global is
        # needed here.
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map ``(batch, seq_len)`` token ids to ``(batch, output_dim)`` log-probs."""
        # Follow the model's own placement instead of a global ``device``.
        x = x.to(self.embedding.weight.device)
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        # Only the final time step is used. With right-padded batches this is
        # the state after any padding tokens.
        last_step = output[:, -1, :]
        return self.softmax(self.fc(last_step))

# Initialize LSTM Model
# Inputs and outputs are indexed with the same vocabulary, hence equal sizes.
input_dim = len(vocab)
output_dim = len(vocab)
lstm_model = Seq2SeqLSTM(input_dim, output_dim, emb_dim=256, hidden_dim=512, num_layers=2, dropout=0.5).to(device)
# The model already ends in LogSoftmax, so the matching criterion is NLLLoss.
# CrossEntropyLoss expects raw logits and would apply log-softmax a second time.
criterion = nn.NLLLoss().to(device)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)



In [None]:
def train_lstm(model, dataloader, criterion, optimizer, num_epochs=5):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    Each batch is moved to the model's device, a forward/backward pass is run
    and the optimizer is stepped. The mean batch loss is printed per epoch.

    NOTE(review): ``criterion`` receives the model output and the batch
    targets unchanged, so their shapes must already be compatible. A model
    returning ``(batch, classes)`` needs ``(batch,)`` class-id targets.

    Args:
        model: Module called as ``model(inputs)``.
        dataloader: Iterable of ``(inputs, targets)`` tensor pairs.
        criterion: Loss callable ``criterion(output, targets)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Use the model's own placement rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for sentences, logical_forms in dataloader:
            sentences, logical_forms = sentences.to(run_device), logical_forms.to(run_device)
            optimizer.zero_grad()
            output = model(sentences)
            loss = criterion(output, logical_forms)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches while iterating so an empty loader reports NaN instead
        # of raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

# Fit the LSTM baseline on the training split for five epochs.
train_lstm(
    model=lstm_model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=5,
)



In [None]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_lstm(model, dataloader, dataset_name):
    """Print accuracy and weighted F1 of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking. The
    per-row argmax over dimension 1 of each output is collected as the
    prediction and compared against the batch targets.

    NOTE(review): predictions are one id per batch row, while targets are
    appended via ``tolist()`` as-is. The metric calls therefore need targets
    that are also one id per row — confirm against the dataset's target shape.

    Args:
        model: Module called as ``model(inputs)``.
        dataloader: Iterable of ``(inputs, targets)`` tensor pairs.
        dataset_name: Label used in the printed summary line.
    """
    model.eval()
    predicted_ids = []
    gold_ids = []

    with torch.no_grad():
        for batch_inputs, batch_targets in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            scores = model(batch_inputs)
            predicted_ids.extend(scores.argmax(dim=1).tolist())
            gold_ids.extend(batch_targets.tolist())

    accuracy = accuracy_score(gold_ids, predicted_ids)
    weighted_f1 = f1_score(gold_ids, predicted_ids, average="weighted")
    print(f"{dataset_name} Set - Accuracy: {accuracy:.4f}, F1 Score: {weighted_f1:.4f}")

evaluate_lstm(lstm_model, train_loader, "Train")

# `gen_loader` is not created in any visible earlier cell, so build it here
# from the generalization split. Batch size and collation mirror the training
# loader; order is kept fixed because this loader is only used for evaluation.
gen_loader = DataLoader(
    COGSDataset(gen_df, tokenizer, vocab),
    batch_size=train_loader.batch_size,
    shuffle=False,
    collate_fn=train_loader.collate_fn,
)
evaluate_lstm(lstm_model, gen_loader, "Gen")
