In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoTokenizer
import numpy as np
import json

In [3]:
import os
# Get current working directory
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /content


In [4]:
# List files and folders in the current directory
files_and_folders = os.listdir(current_directory)
print("Files and Folders:")
for item in files_and_folders:
    print(item)

Files and Folders:
.config
Project1_Data.json
test.json
drive
2xT.json
train.json
P1.json
sample_data


In [10]:
# Load PhoBERT model and tokenizer
phobert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

# Define your CNN-based classifier model
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        # Define the CNN layers
        self.conv1 = nn.Conv1d(in_channels=input_size, out_channels=hidden_size, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)
        # Calculate the output size of CNN
        conv_output_size = self._get_conv_output_size(input_size)
        # Define the Fully Connected layer for classification
        self.fc = nn.Linear(conv_output_size, num_classes)

    def forward(self, x):
        # Pass the input through CNN layers
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.pool(x)
        # Flatten the output of CNN
        x = torch.flatten(x, 1)
        # Pass through Fully Connected layer
        x = self.fc(x)
        return x

    def _get_conv_output_size(self, input_size):
        # Calculate the output size of CNN
        input_tensor = torch.randn(1, input_size, 256)  # Assuming input size of 256
        conv_output = self._forward_conv(input_tensor)
        conv_output_size = np.prod(conv_output.size())
        return conv_output_size

    def _forward_conv(self, x):
        # Helper function to get the output size of CNN
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.pool(x)
        return x

# Function to preprocess data and extract embeddings using PhoBERT
def preprocess_data(data):
    embeddings = []
    labels = []
    for item in data:
        question = item["question"]
        text = item["text"]
        # Combine question and text into one string
        combined_text = f"{question} {text}"
        # Tokenize the combined text
        tokenized_text = tokenizer(combined_text, return_tensors="pt", max_length=256, truncation=True, padding="max_length")
        input_ids = tokenized_text["input_ids"]
        # Extract embeddings using PhoBERT
        with torch.no_grad():
            features = phobert(input_ids)[0]
        embeddings.append(features.squeeze(0))
        # Add label
        labels.append(1 if item["label"] else 0)
    return torch.stack(embeddings), torch.tensor(labels)

# Load and preprocess data from test.json
with open("Project1_Data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

embeddings, labels = preprocess_data(data)

# Define your CNN-based classifier model
input_size = embeddings.size(2)  # Dimension of the embeddings
hidden_size = 128  # Size of the hidden layer in CNN
num_classes = 2  # Number of output classes (answerable or not answerable)
model = Classifier(input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define DataLoader for training
dataset = TensorDataset(embeddings, labels)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Training loop
epochs = 10
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs.permute(0, 2, 1))  # Permute the input to match CNN input shape
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:    # Print every 100 mini-batches
            print(f"[Epoch {epoch + 1}, Batch {i + 1}] Loss: {running_loss / 100:.3f}")
            running_loss = 0.0

# Evaluate the model
def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs.permute(0, 2, 1))
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += inputs.size(0)

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples

    return avg_loss, accuracy

def split_data(embeddings, labels, split_ratio=0.8):
    total_samples = len(embeddings)
    split_index = int(total_samples * split_ratio)

    train_embeddings = embeddings[:split_index]
    train_labels = labels[:split_index]

    val_embeddings = embeddings[split_index:]
    val_labels = labels[split_index:]

    return (train_embeddings, train_labels), (val_embeddings, val_labels)

# Split data into train and validation sets
train_data, val_data = split_data(embeddings, labels)

# Define DataLoader for validation
val_dataset = TensorDataset(val_data[0], val_data[1])
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Evaluate the model on validation set
val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)

print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2%}")

Validation Loss: 0.0012, Validation Accuracy: 100.00%
