# Prerequisite

In [None]:
import torch
torch.cuda.is_available()

In [None]:
import urllib.request
import os

glove_file_path = 'glove.6B.300d.txt'

def download_glove_embeddings(url, filename):
    """Download ``url`` to ``filename``, printing progress messages.

    The payload is written to ``<filename>.part`` first and only moved to
    ``filename`` once the transfer has completed. An interrupted or failed
    download therefore never leaves a truncated file under the final name,
    which matters because callers decide whether to download based on the
    mere existence of ``filename``.

    Args:
        url: Location to fetch; anything ``urllib.request.urlretrieve`` accepts.
        filename: Destination path for the downloaded file.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the transfer
        fails; the partial file is removed before the error propagates.
    """
    partial_path = f"{filename}.part"
    print(f"Downloading GloVe embeddings from {url}...")
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Same-directory rename, so the final name appears all at once.
        os.replace(partial_path, filename)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded GloVe embeddings to {filename}")

def load_glove_embeddings(glove_file_path):
    """Make sure the GloVe vector file exists locally, fetching it if needed.

    If ``glove_file_path`` is missing, the GloVe 6B archive is downloaded to
    ``glove.6B.zip`` in the working directory (unless it is already there)
    and the requested file is unpacked into the directory that
    ``glove_file_path`` points into. Only that one member is extracted, so
    other files in the archive do not end up on disk. Nothing is returned;
    the function only guarantees the file is present.

    Args:
        glove_file_path: Path where the GloVe text file is expected. Its
            base name must match the member name inside the archive.

    Raises:
        FileNotFoundError: If the file is still missing after the archive
            has been processed (for example, the archive has no such member).
    """
    if os.path.isfile(glove_file_path):
        return

    import zipfile  # only needed when the vectors are not unpacked yet

    # HTTPS so the large archive is not fetched over an unauthenticated channel.
    url = "https://nlp.stanford.edu/data/glove.6B.zip"  # GloVe 6B archive
    zip_file_path = "glove.6B.zip"
    if not os.path.isfile(zip_file_path):
        download_glove_embeddings(url, zip_file_path)

    member = os.path.basename(glove_file_path)
    target_dir = os.path.dirname(glove_file_path) or "."
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Unpack just the requested file, next to where the caller expects it.
        if member in zip_ref.namelist():
            zip_ref.extract(member, target_dir)

    if not os.path.isfile(glove_file_path):
        raise FileNotFoundError(f"Expected file {glove_file_path} not found after extraction.")

load_glove_embeddings(glove_file_path)

# Exploring

In [None]:
from preprocess import load_data

# Read the training and development splits, in that order.
df, devdf = (
    load_data(path)
    for path in ("../data/traindata.csv", "../data/devdata.csv")
)

# Row counts first ...
for frame in (df, devdf):
    print(len(frame))

# ... then a preview of each split.
for frame in (df, devdf):
    print(frame.head())

In [None]:
df["aspect_category"].value_counts() / len(df)

In [None]:
df["polarity"].value_counts() / len(df)

# Preprocessing

In [None]:
from preprocess import remove_stopwords, load_data, simple_tokenize
from stopwords import STOPWORDS
import random

# Show how one randomly chosen training row is tokenized, column by column.
df = load_data("../data/traindata.csv")

# Random row on every run; uncomment one of the fixed indices to reproduce a case.
rand_i = random.randint(0, len(df)-1)
# rand_i = 0
# rand_i = 303
# rand_i = 1111

print("index", rand_i)

# Aspect category: lower-cased and tokenized only (no stopword filtering).
print(df["aspect_category"][rand_i])
target_tok = simple_tokenize(df["aspect_category"][rand_i].lower())
print(len(target_tok), target_tok)

# Aspect term: lower-cased, tokenized, then stopwords removed.
# `target_tok` is reused, so the category tokens above are overwritten.
print(df["aspect_term"][rand_i])
target_tok = remove_stopwords(simple_tokenize(df["aspect_term"][rand_i].lower()), STOPWORDS)
print(len(target_tok), target_tok)

# Full sentence: same pipeline as the aspect term.
print(df["sentence"][rand_i])
sentence_tok = remove_stopwords(simple_tokenize(df["sentence"][rand_i].lower()), STOPWORDS)
print(len(sentence_tok), sentence_tok)


### Padding

In [None]:
from preprocess import load_glove_embeddings

glove_file_path = 'glove.6B.300d.txt'
word_to_idx, embedding_matrix = load_glove_embeddings(glove_file_path)

In [None]:
from preprocess import remove_stopwords, load_data, simple_tokenize, pad_sequence
import random

# Convert one random row of `column` into vocabulary indices and pad the
# result via pad_sequence with max_len.
max_len = 50
# Rebinds STOPWORDS to an empty list, so no token is filtered out here.
# The name stays rebound for later cells until it is imported again.
STOPWORDS = []

df = load_data("../data/traindata.csv")
column = "aspect_term"

# Random row on every run; uncomment one of the fixed indices to reproduce a case.
rand_i = random.randint(0, len(df)-1)
# rand_i = 0
# rand_i = 303
# rand_i = 1111

tokens = remove_stopwords(simple_tokenize(df[column][rand_i].lower()), STOPWORDS)
# Tokens missing from the vocabulary fall back to the "<UNK>" index.
indices = [word_to_idx.get(token, word_to_idx["<UNK>"]) for token in tokens]
# NOTE(review): presumably pads/truncates to max_len using the "<PAD>" entry
# of word_to_idx — confirm in preprocess.pad_sequence.
indices = pad_sequence(indices, word_to_idx, max_len) 

print("index", rand_i)
print(df[column][rand_i])
print(tokens)
print(indices)

In [None]:
### Left and Right sentence

In [None]:
from preprocess import remove_stopwords, load_data, simple_tokenize
from stopwords import STOPWORDS
import random

# Split one random training sentence into a left and a right context using
# the row's "offset" value, then show how each part tokenizes.
df = load_data("../data/traindata.csv")

# Random row on every run; uncomment one of the fixed indices to reproduce a case.
rand_i = random.randint(0, len(df)-1)
# rand_i = 0
# rand_i = 303
# rand_i = 1111

print("index", rand_i)

# "offset" is a "start:end" string; both halves are parsed as integers and
# used as character positions into the sentence.
# NOTE(review): presumably they delimit the aspect term — confirm against the data.

start_index, end_index = df["offset"][rand_i].split(":")
start_index, end_index = int(start_index), int(end_index)

sentence = df["sentence"][rand_i]
# Left context: start of sentence up to end_index.
# Right context: start_index to end of sentence.
# The characters in [start_index, end_index) therefore appear in both parts.
left_sentence = sentence[:end_index]
right_sentence = sentence[start_index:]

print(df["aspect_term"][rand_i])
print(sentence)
print(left_sentence)
print(right_sentence)

# Tokenized views of the same three strings (no lower-casing or stopword removal).
print(simple_tokenize(sentence))
print(simple_tokenize(left_sentence))
print(simple_tokenize(right_sentence))

In [None]:
from preprocess import remove_stopwords, load_data, simple_tokenize
from stopwords import STOPWORDS
# STOPWORDS = []

# Split to scan; point this at "../data/traindata.csv" for the training set.
filepath = "../data/devdata.csv"
column = "sentence"

df = load_data(filepath)

# Running record of the row that keeps the most tokens after stopword removal.
max_tokens, max_len, max_i = [], 0, -1

for i, elem in enumerate(df[column]):
    tokens = remove_stopwords(simple_tokenize(elem.lower()), STOPWORDS)
    if len(tokens) <= max_len:
        continue  # not a new record; ties keep the earliest row
    max_tokens, max_len, max_i = tokens, len(tokens), i

# Raw text of the winning row, its tokens, and the token count.
for value in (df[column][max_i], max_tokens, max_len):
    print(value)

In [None]:
# Distinct aspect terms; "longest" is measured in characters.
l = list(set(df["aspect_term"]))
longest_str = max(l, key=len)
# First row whose aspect term equals the longest one.
index_of_longest = next(
    position
    for position, term in enumerate(df["aspect_term"])
    if term == longest_str
)

print(len(longest_str), longest_str, f"index {index_of_longest}", sep="\n")
# Number of distinct aspect terms longer than 50 characters.
print(sum(1 for s in l if len(s) > 50))

In [None]:
# Distinct sentences; "longest" is measured in characters.
l = list(set(df["sentence"]))
longest_str = max(l, key=len)
# First row whose sentence equals the longest one.
index_of_longest = next(
    position
    for position, text in enumerate(df["sentence"])
    if text == longest_str
)

print(len(longest_str), longest_str, f"index {index_of_longest}", sep="\n")
# Number of distinct sentences longer than 50 characters.
print(sum(1 for s in l if len(s) > 50))

# Dataset

### BiLSTM Attention Dataset

In [None]:
from bilstm_attention import BiLSTM_Attention_Dataset
from preprocess import load_glove_embeddings

glove_file_path = 'glove.6B.300d.txt'
word_to_idx, embedding_matrix = load_glove_embeddings(glove_file_path)
max_len = 50

train_dataset = BiLSTM_Attention_Dataset("../data/traindata.csv", word_to_idx, max_len)
dev_dataset = BiLSTM_Attention_Dataset("../data/devdata.csv", word_to_idx, max_len)

In [None]:
import random

rand_i = random.randint(0, len(train_dataset)-1)
# rand_i = 0

train_dataset[rand_i]

In [None]:
from torch.utils.data import DataLoader

train_data_loader = DataLoader(train_dataset, batch_size=64)
dev_data_loader = DataLoader(dev_dataset, batch_size=64)

# Print the shape of every training batch.
# The other datasets in this notebook yield dict samples, which the default
# collate function turns into a dict of batched fields. A dict has no
# `.size()`, so report each field's shape in that case.
for i, batch in enumerate(train_data_loader):
    if isinstance(batch, dict):
        print({
            name: tuple(field.shape) if hasattr(field, "shape") else type(field).__name__
            for name, field in batch.items()
        })
    else:
        print(batch.size())

In [None]:
    # def __getitem__(self, index):
        
    #     # Get the index of the aspect term in the sentence
    #     aspect_term_indices_in_sentence = [i for i, token in enumerate(sentence_tokens) if token in aspect_term_tokens]
    #     if not aspect_term_indices_in_sentence:
    #         aspect_term_indices_in_sentence = [len(sentence_tokens) // 2]
        
    #     aspect_term_position = aspect_term_indices_in_sentence[0]

    #     # Determine the start and end indices of the context window
    #     start_index = max(0, aspect_term_position - self.context_window)
    #     end_index = min(len(sentence_tokens), aspect_term_position + self.context_window + 1)

    #     # Extract the context window
    #     context_window_tokens = sentence_tokens[start_index:end_index]
    #     context_window_indices = [self.word_to_idx.get(token, self.word_to_idx["<UNK>"]) for token in context_window_tokens]

    #     # Pad or truncate the context window to the max_len
    #     context_window_indices = self.pad_sequence(context_window_indices, self.max_len)

    #     # Pad or truncate the aspect term indices
    #     aspect_term_indices = self.pad_sequence(aspect_term_indices, self.max_len)

    #     return {
    #         "aspect_term_indices": torch.tensor(aspect_term_indices, dtype=torch.long),
    #         "sentence_indices": torch.tensor(context_window_indices, dtype=torch.long),
    #         "labels": torch.tensor(polarity, dtype=torch.long),
    #     }

    # def simple_tokenize(self, text):
    #     return re.findall(r"\b\w+\b", text)

    # def pad_sequence(self, seq, max_len):
    #     if len(seq) < max_len:
    #         seq += [self.word_to_idx["<PAD>"]] * (max_len - len(seq))
    #     else:
    #         seq = seq[:max_len]
    #     return seq


### TD LSTM Dataset

In [None]:
from tdlstm import TD_LSTM_Dataset
from preprocess import load_data, load_glove_embeddings

train_filename = "../data/traindata.csv"
glove_file_path = 'glove.6B.300d.txt'
max_len = 50

# Vocabulary lookup and the embedding matrix that goes with it.
word_to_idx, embedding_matrix = load_glove_embeddings(glove_file_path)

# Raw frame kept next to the dataset so rows can be inspected by index.
df = load_data(train_filename)
dataset = TD_LSTM_Dataset(train_filename, word_to_idx, max_len)

In [None]:
isinstance(dataset, TD_LSTM_Dataset)

In [None]:
import random

rand_i = random.randint(0, len(dataset)-1)

print(df["aspect_term"][rand_i])
print(df["sentence"][rand_i])
dataset[rand_i]

### ATAE LSTM Dataset

In [None]:
from atae_lstm import ATAE_LSTM_Dataset
from preprocess import load_data, load_glove_embeddings
from torch.utils.data import DataLoader

dataset = ATAE_LSTM_Dataset

train_filename = "../data/traindata.csv"
dev_filename = "../data/devdata.csv"

glove_file_path = 'glove.6B.300d.txt'
word_to_idx, embedding_matrix = load_glove_embeddings(glove_file_path)
max_len = 50

batch_size = 32

df = load_data(train_filename)

train_dataset = dataset(train_filename, word_to_idx, max_len)
dev_dataset = dataset(dev_filename, word_to_idx, max_len)

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_data_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)

In [None]:
import random

# In this section `dataset` is bound to the ATAE_LSTM_Dataset class itself,
# so it has no length and cannot be indexed for a sample. The instance built
# from the training file (the same file `df` was loaded from) is `train_dataset`.
rand_i = random.randint(0, len(train_dataset)-1)

# Raw text of the row, followed by the encoded sample as the cell output.
print(df["aspect_term"][rand_i])
print(df["sentence"][rand_i])
train_dataset[rand_i]

In [None]:
import torch
import torch.nn as nn

# Step-by-step walkthrough of how one sample becomes the ATAE-LSTM input:
# sentence embeddings concatenated with the mean aspect embedding.
#
# `dataset` is the ATAE_LSTM_Dataset class in this section, so the sample is
# taken from the `train_dataset` instance (fetched once and reused).
sample = train_dataset[rand_i]
aspect_indices = sample["aspect_indices"]
sentence_indices = sample["sentence_indices"]
# Use the vocabulary's padding index rather than assuming it is 0.
pad_index = word_to_idx["<PAD>"]

embedding = nn.Embedding.from_pretrained(
    torch.tensor(embedding_matrix, dtype=torch.float32)
)

# unsqueeze(0) adds a leading batch dimension of size 1.
# NOTE(review): assumes the index tensors are 1-D (one sample) — confirm.
sentence_embeddings = embedding(sentence_indices).unsqueeze(0)
print("sentence_embeddings", tuple(sentence_embeddings.size()))

aspect_embeddings = embedding(aspect_indices).unsqueeze(0)
print("aspect_embeddings", tuple(aspect_embeddings.size()))

# 1.0 for real aspect tokens, 0.0 for padding: (batch_size, aspect_len)
pad_mask = (aspect_indices != pad_index).float().unsqueeze(0)
print("pad_mask", tuple(pad_mask.size()))

# Zero out the embeddings at padded positions before averaging.
masked_aspect_embeddings = aspect_embeddings * pad_mask.unsqueeze(-1)
print("masked_aspect_embeddings", tuple(masked_aspect_embeddings.size()))

sum_aspect_embeddings = masked_aspect_embeddings.sum(dim=1)
print("sum_aspect_embeddings", tuple(sum_aspect_embeddings.size()))

# Clamp to 1 so an aspect made only of padding yields zeros instead of NaN.
num_non_padding_tokens = pad_mask.sum(dim=1).clamp(min=1)
print("num_non_padding_tokens", tuple(num_non_padding_tokens.size()))

mean_aspect_embeddings = sum_aspect_embeddings / num_non_padding_tokens.unsqueeze(-1)
print("mean_aspect_embeddings", tuple(mean_aspect_embeddings.size()))

# Copy the mean aspect vector once per sentence position.
repeated_aspect_embeddings = mean_aspect_embeddings.unsqueeze(1).repeat(
    1, sentence_embeddings.size(1), 1
)
print("repeated_aspect_embeddings", tuple(repeated_aspect_embeddings.size()))

# Concatenate along the feature dimension: each position carries its word
# embedding followed by the aspect embedding.
x = torch.cat((sentence_embeddings, repeated_aspect_embeddings), 2)
print("x", tuple(x.size()))
x

# Model

In [None]:
import torch
import torch.nn as nn
from preprocess import load_glove_embeddings

glove_file_path = 'glove.6B.300d.txt'
word_to_idx, embedding_matrix = load_glove_embeddings(glove_file_path)
max_len = 50


embedding = nn.Embedding.from_pretrained(
    torch.tensor(embedding_matrix, dtype=torch.float32)
)

In [None]:
word_to_idx["<PAD>"]

In [None]:
embedding_matrix.shape[1]

In [None]:
embedding.embedding_dim

In [None]:
torch.zeros(1, 32, 128).size()

### ATAE LSTM

In [None]:
from atae_lstm import ATAE_LSTM_Dataset, ATAE_LSTM_Model
from preprocess import load_data, load_glove_embeddings
from torch.utils.data import DataLoader

dataset = ATAE_LSTM_Dataset

train_filename = "../data/traindata.csv"
dev_filename = "../data/devdata.csv"

glove_file_path = 'glove.6B.300d.txt'
word_to_idx, embedding_matrix = load_glove_embeddings(glove_file_path)
max_len = 50

batch_size = 32

df = load_data(train_filename)

train_dataset = dataset(train_filename, word_to_idx, max_len)
dev_dataset = dataset(dev_filename, word_to_idx, max_len)

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_data_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)

# Build the ATAE-LSTM model from the embedding matrix loaded above.
# NOTE(review): the remaining arguments are positional and their meaning is not
# visible in this notebook — check ATAE_LSTM_Model's signature before editing
# them, and consider passing them by keyword. The trailing 50 equals max_len
# above, presumably on purpose; TODO confirm.
model = ATAE_LSTM_Model(
    embedding_matrix, 
    128,
    3, 
    False,
    1,
    0.7, 
    50)

# Show the module structure.
print(model)

In [None]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.to(device)

def get_batch(data_loader):
    """Return the first batch produced by ``data_loader``.

    Returns ``None`` when the loader yields nothing.
    """
    return next(iter(data_loader), None)

batch = get_batch(train_data_loader)

pad_index = word_to_idx["<PAD>"]

sentence_indices = batch["sentence_indices"].to(device)
aspect_indices = batch["aspect_indices"].to(device)
labels = batch["labels"].to(device)

bs = sentence_indices.size(0)

h0, c0 = model.init_prev_hidden(bs)
h0 = h0.to(device)
c0 = c0.to(device)

In [None]:
labels.size()

In [None]:
output, H = model(sentence_indices, aspect_indices, pad_index, (h0, c0))
print(output.size())
print(output)
print(labels)
print(H.size())


In [None]:
import torch
import torch.nn as nn

# Two small CrossEntropyLoss demonstrations on random data.
# Names are chosen so the cell neither shadows the builtin `input` nor
# overwrites the model `output` produced by the previous cell.

# Example of target with class indices
loss = nn.CrossEntropyLoss()
logits = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
loss_value = loss(logits, target)
loss_value.backward()

print(logits)
print(target)
print(loss_value)

# Example of target with class probabilities
logits = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
loss_value = loss(logits, target)
loss_value.backward()

print("\n")
print(logits)
print(target)
print(loss_value)