In [1]:
import os
os.environ["KERAS_BACKEND"] = "torch"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import string

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import re
import keras


In [2]:
vocab = (
    list(string.ascii_letters)
    + list(string.digits)
    + list(string.punctuation)
    + [" ", "[UNK]", "\u2705", "\u274C"]
)

In [3]:
def tokenize(vocab: list[str], input: str) -> torch.Tensor:
    idxs = []
    for char in input:
        try:
            idx = vocab.index(char)
        except ValueError:
            idx = vocab.index("[UNK]")
        idxs.append(idx)
    return torch.as_tensor(idxs)


def detokenize(vocab: list[str], input: torch.Tensor) -> str:
    output = ""
    for idx in input:
        output += vocab[idx]
    return output


class MovieReviewDataset(Dataset):
    def __init__(self, review_dir: str, vocab: list[str]) -> None:
        self.dir = review_dir
        self.vocab = vocab
        self.files = [item for item in os.listdir(review_dir)]

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> str:
        with open(self.dir + self.files[index], "r") as file:
            string = file.read()
            # Remove extraneous HTML tags
            string = re.sub(r"<[^>]*>", "", string)
            # Add beginning and ending indicator characters
            # These are 2705: ✅, 274C: ❌
            string = "\u2705" + string + "\u274C"
        tokens = tokenize(self.vocab, string)
        return tokens[:-1], tokens[1:]

In [4]:
def collate_fn(batch):
    inputs = [x[0] for x in batch]
    outputs = [x[1] for x in batch]
    return nn.utils.rnn.pad_sequence(
        inputs, padding_value=999
    ), nn.utils.rnn.pad_sequence(outputs, padding_value=999)


dataset = MovieReviewDataset("./data/movie_reviews/train/unsup/", vocab=vocab)
dataloader = DataLoader(
    dataset, batch_size=64, shuffle=True, collate_fn=collate_fn
)

In [5]:
input = keras.Input(shape=(None,))

embed = keras.layers.Embedding(len(vocab), len(vocab))
mask = keras.layers.Masking(mask_value=999)
lstm = keras.layers.LSTM(256, return_sequences=True)
dense = keras.layers.Dense(len(vocab))

output = dense(lstm(mask(embed(input))))

model = keras.Model(inputs=input, outputs=output)

  return torch.linalg.qr(x, mode=mode)


In [6]:
model.summary()

In [7]:
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.AdamW(), metrics=["accuracy"])

In [8]:
history = model.fit(dataloader, epochs=10)

Epoch 1/10
[1m 10/782[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:25:36[0m 11s/step - accuracy: 0.0302 - loss: 1.1945