In [1]:
import os
import json
import torch
import gensim
import numpy as np
import transformers
from typing import Callable
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

In [3]:
model = "Embeddings/GoogleNews-vectors-negative300.bin.gz"     # Word2Vec
embedding = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True)

In [39]:
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

target = get_tmpfile("temp.txt")
glove2word2vec("Embeddings/glove.42B.300d/glove.42B.300d.txt", target)

model = gensim.models.KeyedVectors.load_word2vec_format(target)

model.save_word2vec_format("Embeddings/glove.42B.300d.bin.gz", binary=True)

  glove2word2vec("Embeddings/glove.42B.300d/glove.42B.300d.txt", target)


In [5]:
model = "Embeddings/glove.42B.300d.bin.gz"                       # GloVe
embedding = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True)

In [7]:
model = "Embeddings/cc.en.300.bin.gz"                          # FastText
embedding = gensim.models.fasttext.load_facebook_model(model)

In [15]:
def load_dataset(dataset: int, embeddings: str):
    """
    Loads the given dataset and returns the train, test, and validation
    sets with the corresponding labels.
    :params:
        - dataset: The dataset to load. Must be 1 or 2.
        - embeddings: The word-embeddings to use. Must be "Word2Vec", "GloVe", or "FastText" (case-insensitive).
    """

    embeddings_err = "Invalid embeddings. Must be 'Word2Vec', 'GloVe', or 'FastText'."
    dataset_err = "Invalid dataset number. Must be 1 or 2."
    embeddings = embeddings.casefold()

    assert dataset in [1, 2], dataset_err
    assert embeddings in ["word2vec", "glove", "fasttext"], embeddings_err

    path = rf"Assignment-2/Datasets/processed/dataset_{dataset}"
    train_path, test_path, val_path = [os.path.join(path, f"{x}.json") for x in ["train", "test", "val"]]

    with open(train_path) as train, open(test_path) as test, open(val_path) as val:
        train_data = json.load(train)
        test_data = json.load(test)
        val_data = json.load(val)
    print("HERE-1")

    if embeddings == "word2vec":
        model = "Assignment-2/Embeddings/GoogleNews-vectors-negative300.bin.gz"
        embedding = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True)
    elif embeddings == "glove":
        model = "Assignment-2/Embeddings/glove.42B.300d.bin.gz"
        embedding = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True)
    else:
        model = "Assignment-2/Embeddings/cc.en.300.bin.gz"
        embedding = gensim.models.fasttext.load_facebook_model(model)
    print("HERE-2")

    LABELS = set()
    for key, data in train_data.items():
        LABELS.update(data["labels"])
    encoder = LabelEncoder()
    encoder.fit(sorted(LABELS))
    print("HERE-3")

    TRAIN_DATA, TRAIN_LABELS = [], []
    for _, data in train_data.items():
        TRAIN_DATA.append(data["text"].split())
        TRAIN_LABELS.append(data["labels"])
    print("HERE-4")

    TEST_DATA, TEST_LABELS = [], []
    for _, data in test_data.items():
        TEST_DATA.append(data["text"].split())
        TEST_LABELS.append(data["labels"])
    print("HERE-5")

    VAL_DATA, VAL_LABELS = [], []
    for _, data in val_data.items():
        VAL_DATA.append(data["text"].split())
        VAL_LABELS.append(data["labels"])
    print("HERE-6")

    # train_set = CustomDataset(TRAIN_DATA, TRAIN_LABELS, encoder, vectorizor)
    # test_set = CustomDataset(TEST_DATA, TEST_LABELS, encoder, vectorizor)
    # val_set = CustomDataset(VAL_DATA, VAL_LABELS, encoder, vectorizor)

    # return train_set, test_set, val_set

    return (TRAIN_DATA, TRAIN_LABELS), (TEST_DATA, TEST_LABELS), (VAL_DATA, VAL_LABELS), encoder, embedding

In [6]:
os.chdir("..")

In [16]:

((train_data, train_labels),
 (test_data, test_labels),
 (val_data, val_labels),
 encoder, embedding, vectorizor) = load_dataset(1, "WORD2VEC")

try:
    embedding_matrix = torch.FloatTensor(embedding.vectors)
    vectorizor = embedding.get_vector
except AttributeError:
    embedding_matrix = torch.FloatTensor(embedding.wv.vectors)
    vectorizor = lambda x: embedding.wv[x]

embedding_matrix = torch.cat([embedding_matrix, torch.zeros(1, embedding_matrix.shape[1])])
embedding.key_to_index["<UNK>"] = len(embedding.key_to_index)
embedding.index_to_key.append("<UNK>")

HERE-1
HERE-2
HERE-3
HERE-4
HERE-5
HERE-6


In [110]:
EMBEDING_LAYER = torch.nn.Embedding.from_pretrained(embedding_matrix)
word_embedding = EMBEDING_LAYER(torch.tensor([embedding.key_to_index.get(word) for word in "the quick brown fox".split()]))
word_embedding.shape

torch.Size([4, 300])

In [103]:
subset = train_data[:5]
for sentence in subset:
    print(sentence)

['He', 'was', 'also', 'asked', 'whether', 'Agya', '<span', 'class="hidden_text"', 'id="span_5">', 'CRA', 'No.326-DB', 'of', '1998', '6</span>', 'Kaur,', 'mother-in-law', 'of', 'the', 'deceased', 'lived', 'separately', 'from', 'Tarlochan', 'Singh.']
['5.2', 'CW3', 'Mr', 'Vijay', 'Mishra', ',', 'Deputy', 'Manager,', 'HDFC', 'Bank,', 'Noida,', 'UP', 'has', 'deposed', 'that', 'complainant', 'had', 'a', 'current', 'account', 'with', 'HDFC', 'Bank', 'in', 'the', 'year', '2004\xad2005.']
['You', 'are', 'hereby', 'asked', 'not', 'to', 'carry', 'out', 'any', 'construction', 'work', 'of', 'the', 'said', 'building', 'hereafter', 'since', 'the', 'agreement', 'has', 'been', 'terminated".']
['After', 'all', 'the', 'steps', 'at', 'the', 'stage', 'of', 'investigation', 'has', 'to', 'be', 'reported', 'before', 'the', 'Court', 'and', 'the', 'order', 'passed', 'thereon', 'is', 'obviously', 'judicial', 'order', 'and', 'this', 'takes', 'clear', 'note', 'of', 'the', 'agony', 'of', 'the', 'learned', 'Counsel

In [104]:
subset = [[embedding.key_to_index.get(word, len(embedding.key_to_index)-1) for word in sentence] for sentence in subset]
for sentence in subset:
    print(len(sentence), sentence)

24 [57, 10, 53, 438, 369, 817011, 3000000, 3000000, 3000000, 30909, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 11, 9895, 1958, 8019, 17, 1219623, 3000000]
27 [3000000, 863288, 602, 30124, 33742, 3000000, 3562, 3000000, 38263, 3000000, 3000000, 13860, 24, 20696, 3, 21322, 35, 3000000, 403, 1201, 8, 38263, 669, 1, 11, 36, 3000000]
22 [228, 19, 29224, 438, 13, 3000000, 1635, 49, 101, 984, 141, 3000000, 11, 9, 473, 65530, 140, 11, 729, 24, 42, 3000000]
37 [361, 52, 11, 1830, 12, 11, 943, 3000000, 915, 24, 3000000, 16, 343, 99, 11, 1557, 3000000, 11, 555, 1126, 68081, 4, 2633, 5481, 555, 3000000, 28, 920, 645, 1734, 3000000, 11, 20599, 3000000, 11, 1638, 3000000]
8 [3000000, 2165064, 4, 11, 2520, 1722, 3000000, 3000000]


In [105]:
padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(sentence) for sentence in subset], batch_first=True)
for row in padded:
    x = (list(i.item() for i in row))
    print(len(x), x)

37 [57, 10, 53, 438, 369, 817011, 3000000, 3000000, 3000000, 30909, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 11, 9895, 1958, 8019, 17, 1219623, 3000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
37 [3000000, 863288, 602, 30124, 33742, 3000000, 3562, 3000000, 38263, 3000000, 3000000, 13860, 24, 20696, 3, 21322, 35, 3000000, 403, 1201, 8, 38263, 669, 1, 11, 36, 3000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
37 [228, 19, 29224, 438, 13, 3000000, 1635, 49, 101, 984, 141, 3000000, 11, 9, 473, 65530, 140, 11, 729, 24, 42, 3000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
37 [361, 52, 11, 1830, 12, 11, 943, 3000000, 915, 24, 3000000, 16, 343, 99, 11, 1557, 3000000, 11, 555, 1126, 68081, 4, 2633, 5481, 555, 3000000, 28, 920, 645, 1734, 3000000, 11, 20599, 3000000, 11, 1638, 3000000]
37 [3000000, 2165064, 4, 11, 2520, 1722, 3000000, 3000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [114]:
EMBEDING_LAYER(padded).shape

torch.Size([5, 37, 300])

In [122]:
class CustomDataset(Dataset):
    """
    A custom dataset class for dataset_1 and dataset_2.
    :attrs:
        - data: The data of the dataset.
        - targets: The targets of the dataset.
        - encoder: The (already fitted) label-encoder for the targets.
    """

    data: list[list[str]]
    targets: list[list[str]]
    encoder: LabelEncoder
    embedding: "Word-Embedding"

    def __init__(self, data, targets, encoder, embedding):
        super(CustomDataset, self).__init__()
        self.data = data
        self.targets = targets
        self.encoder = encoder
        self.embedding = embedding
        self._convert_to_index()
        self._pad()

    def __len__(self) -> int:
        """
        Returns the length of the dataset.
        """
        return len(self.data)

    def __getitem__(self, index: int) -> tuple[torch.Tensor]:
        """
        Returns the data and target at the given index.
        Converts each sentence into a tensor of word indices using
        the chosen word-embeddings.
        """
        return self.data[index], self.targets[index]

    def _convert_to_index(self) -> None:
        """
        Converts all sentences to their corresponding word-indices.
        """
        unk = len(self.embedding.key_to_index)-1
        self.data = [
            [self.embedding.key_to_index.get(word, unk) for word in sentence]
            for sentence in self.data
        ]

    def _pad(self) -> None:
        """
        Pads all sentences to the maximum length.
        """
        self.targets = [torch.tensor(self.encoder.transform(label)) for label in self.targets]
        max_len = max(len(sentence) for sentence in self.data)
        self.data = torch.nn.utils.rnn.pad_sequence([torch.tensor(sentence) for sentence in self.data], batch_first=True)
        self.targets = torch.nn.utils.rnn.pad_sequence(self.targets, batch_first=True, padding_value=-1)


train_set = CustomDataset(train_data, train_labels, encoder, embedding)
test_set = CustomDataset(test_data, test_labels, encoder, embedding)
val_set = CustomDataset(val_data, val_labels, encoder, embedding)

In [123]:
train_set[0]

(tensor([     57,      10,      53,     438,     369,  817011, 3000000, 3000000,
         3000000,   30909, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000,
         3000000,      11,    9895,    1958,    8019,      17, 1219623, 3000000,
               0,       0,       0,       0,       0,       0,       0,       0,
               0,       0,       0,       0,       0,       0,       0,       0,
               0,       0,       0,       0,       0,       0,       0,       0,
               0,       0,       0,       0,       0,       0,       0,       0,
               0,       0,       0,       0,       0,       0,       0,       0,
               0,       0,       0,       0,       0,       0]),
 tensor([26, 26, 26, 26, 26,  6, 26, 26, 26, 26, 26, 26, 26, 26,  6, 26, 26, 26,
         26, 26, 26, 26,  6, 19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1,

In [124]:
# def collate_fn(batch):
#     # Sort batch based on the length of sequences
#     batch.sort(key=lambda x: len(x[0]), reverse=True)
#     # Extract sentences and labels from the batch
#     sentences, labels = zip(*batch)
#     # Pad sequences to the length of the longest sequence in the batch
#     padded_sentences = torch.nn.utils.rnn.pad_sequence(sentences, batch_first=True, padding_value=0)
#     # Create tensor for labels
#     label_pad = encoder.transform(["O"])[0]
#     padded_labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=label_pad)
#     # Create mask tensor to mask out padded values
#     mask = (padded_sentences != 0).float()
#     return padded_sentences, padded_labels, mask

train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=32, shuffle=True)

In [128]:
for data, labels in train_loader:
    print(data.shape, labels.shape)
    break

torch.Size([32, 70]) torch.Size([32, 70])


In [154]:
class RNN(torch.nn.Module):
    # input_size: batch x seq_len x input_size == 32 x seq_len x 300
    # hidden_size: hidden_size hyperparam
    # output_size: num_classes = 27
    # num_layers: num_layers hyperparam

    # ADD embedding layer and time distributed layer
    #Embedding layer,LSTM layer,Time stribution layer

    def __init__(self, input_size, hidden_size, num_layers, output_size, embedding_matrix):
        super(RNN, self).__init__()
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_matrix)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=2)

    def forward(self, x):
        x = self.embedding(x)
        hidden = torch.zeros(self.num_layers, x.shape[0], self.hidden_size)
        output, _ = self.rnn(x, hidden)
        output = self.fc(output)
        return self.softmax(output)


model = RNN(300, 128, 2, 27, embedding_matrix)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

for epoch in range(10):
    for data, labels in train_loader:
        output = model(data)
        print(output.shape, labels.shape)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} Loss: {loss.item()}")

torch.Size([32, 70, 27]) torch.Size([32, 70])


RuntimeError: Expected target size [32, 27], got [32, 70]

In [284]:
# Calculate true positives and negatives, and false positives and negatives

model.eval()
confusion_matrix = torch.zeros(27, 27)
with torch.no_grad():
    for p, (data, labels, mask) in enumerate(test_loader):
        output = model(data)
        pred = torch.argmax(output, dim=2)
        for i in range(labels.shape[0]):
            for j in range(labels.shape[1]):
                confusion_matrix[labels[i][j].long(), pred[i][j].long()] += 1
    print(p, "of", len(test_loader))

# Calculate accuracy, precision, recall, and F1-score

TP = torch.diag(confusion_matrix)
FP = confusion_matrix.sum(dim=0) - TP
FN = confusion_matrix.sum(dim=1) - TP
TN = confusion_matrix.sum() - (TP + FP + FN)

accuracy = (TP + TN) / (TP + FP + FN + TN)
precision = TP / (TP + FP + 1e-6)
recall = TP / (TP + FN + 1e-6)
f1 = 2 * (precision * recall) / (precision + recall + 1e-6)

print(TP.mean(), FP.mean(), FN.mean(), TN.mean())

print(f"Accuracy: {accuracy.mean().item():.4f}")
print(f"Precision: {precision.mean().item():.4f}")
print(f"Recall: {recall.mean().item():.4f}")
print(f"F1-Score: {f1.mean().item():.4f}")

29 of 30
tensor(2089.1111) tensor(218.0370) tensor(218.0370) tensor(59767.8164)
Accuracy: 0.9930
Precision: 0.0335
Recall: 0.0370
F1-Score: 0.0352


In [1]:
import torch
import torch.nn as nn
from torchtext.vocab import GloVe

# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=300)

# Define your vocabulary size and embedding dimension
vocab_size = len(glove.itos)
embedding_dim = glove.vectors.shape[1]

# Create the embedding layer and initialize with pre-trained embeddings
embedding = nn.Embedding.from_pretrained(glove.vectors)

# Optionally, freeze the embedding layer
embedding.weight.requires_grad = False

# Example: get the index of a word and look up its embedding
word_index = glove.stoi['word']
word_embedding = embedding(torch.LongTensor([word_index]))

print(word_embedding)


ModuleNotFoundError: No module named 'torchtext'