In [378]:
import re
import json
from collections import defaultdict, Counter
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence
from torchinfo import summary
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [250]:
df = pd.read_excel("./cleaned_zappos_men.xlsx")
df.head()

Unnamed: 0,brand,model,price,reviews,Ratings
0,Hey Dude,Wally Linen Natural,$64.99,Bought these for my son and he LOVES them!!! H...,5
1,Hey Dude,Wally Linen Natural,$64.99,I get lots of compliments for these .,5
2,Hey Dude,Wally Linen Natural,$64.99,I love my Dude's!,5
3,Hey Dude,Wally Linen Natural,$64.99,Love these!!,5
4,Hey Dude,Wally Linen Natural,$64.99,Probably my favorite,5


In [251]:
df["Ratings"].value_counts()

Ratings
5    2127
4     254
3     215
1     184
2     181
Name: count, dtype: int64

In [254]:
# Remove off by 1 error, ratings now start from 0 to 4
if not df["Ratings"].min() == 0:
    df["Ratings"] = df["Ratings"] - 1

In [255]:
df["Ratings"].value_counts()

Ratings
4    2127
3     254
2     215
0     184
1     181
Name: count, dtype: int64

In [256]:
REVIEWS = "reviews"
LABELS = "Ratings"

In [257]:
def hybrid_sample(df, target_column, strategy='median'):
    """
    Perform hybrid sampling (oversample minority and downsample majority classes)
    
    Parameters:
    @param: pd.DataFrame df: pandas DataFrame
    @param: str target_column: column with class labels
    @param: str or int strategy: 'median', 'mean', or integer for target size
                or 'downsample_only', 'oversample_only'
    @return: pd.DataFrame
    """
    # Calculate class counts
    class_counts = df[target_column].value_counts()
    
    # Determine target size based on strategy
    if isinstance(strategy, int):
        target_size = strategy
    elif strategy == 'median':
        target_size = int(class_counts.median())
    elif strategy == 'mean':
        target_size = int(class_counts.mean())
    elif strategy == 'downsample_only':
        target_size = class_counts.min()
    elif strategy == 'oversample_only':
        target_size = class_counts.max()
    else:
        raise ValueError("Invalid strategy. Use 'median', 'mean', integer, 'downsample_only', or 'oversample_only'")
    
    # Group and sample
    def sample_group(group):
        n = len(group)
        if n < target_size:  # Oversample minority classes
            return group.sample(target_size, replace=True, random_state=42)
        elif n > target_size:  # Downsample majority classes
            return group.sample(target_size, random_state=42)
        else:  # Keep as is
            return group
    
    # Return a randomized transformed dataframe
    return df.groupby(target_column, group_keys=False)[[col for col in df.columns]].apply(sample_group).sample(frac=1, random_state=42).reset_index(drop=True)

In [258]:
oversampled_df = hybrid_sample(df, target_column="Ratings", strategy="oversample_only")
oversampled_df.head()

Unnamed: 0,brand,model,price,reviews,Ratings
0,Clarks,Cotrell Edge,$69.99,These were uncomfortable right out of the box ...,1
1,Deer Stags,Status,$59.99,Doesn't fit,1
2,SKECHERS,Stamina AT Hands Free Slip-In Sneaker,$59.99,I like their look. The sides are stiffer than ...,2
3,Fila,Disruptor II Premium,$46.52,Order half size up.,4
4,etnies,Kingpin,$69.95,They Look Great but They Definitely didn't Fee...,1


In [366]:
# Building BPE (byte pair encoding) Tokenizer

class BPETokenizer:
    def __init__(self, vocab_size: int = 50, stop_word: str = "</w>"):
        self.vocab_size = vocab_size
        self.merges = {}  # Stores merge operations
        self.vocab = {"<PAD>": 0, "<UNK>": 1}  # Vocabulary with special tokens
        self.stop_word = stop_word
        
    def _get_pairs(self, word_list: list):
        """Gets the frequency pair from a wordlist"""
        pairs = Counter()
        for word in word_list:
            symbols = word.split()
            for i in range(len(symbols)-1):
                pairs[(symbols[i], symbols[i+1])] += 1
        return pairs
    
    def _merge_vocab(self, word_list: list, pair: tuple):
        """Merges the most frequent pair in the vocabulary"""
        new_word_list = []
        for word in word_list:
            i = 0
            new_word = []
            while i < len(word.split()):
                if (i < len(word.split())-1 and 
                    word.split()[i] == pair[0] and 
                    word.split()[i+1] == pair[1]):
                    new_word.append(pair[0]+pair[1])
                    i += 2
                else:
                    new_word.append(word.split()[i])
                    i += 1
            new_word_list.append(" ".join(new_word))
        return new_word_list
    
    def train(self, corpus: str):
        """Train BPE on a given corpus."""
        # Preprocess the corpus
        words = corpus.split()
        word_list = [" ".join(word) + " " + self.stop_word for word in words]

        # Initialize vocabulary with characters
        for word in word_list:
            for char in word.split():
                if char not in self.vocab:
                    self.vocab[char] = len(self.vocab)

        # Perform merges until we reach the desired vocab size
        while len(self.vocab) < self.vocab_size:
            pairs = self._get_pairs(word_list)
            if not pairs:
                break
            best_pair = max(pairs, key=pairs.get)
            self.merges[best_pair] = best_pair[0] + best_pair[1]
            
            # Add the new merged token to vocabulary
            if best_pair[0] + best_pair[1] not in self.vocab:
                self.vocab[best_pair[0] + best_pair[1]] = len(self.vocab)
            
            word_list = self._merge_vocab(word_list, best_pair)

        print("Final Vocabulary:", self.vocab)
        
    def encode(self, text: str, max_length: int = None):
        """Encodes text into subword tokens and returns the tensor indices"""
        # Split into words first
        words = text.split()
        all_tokens = []
        all_token_ids = []
        
        for word in words:
            # Initialize with characters for each word
            word_processed = " ".join(word) + " " + self.stop_word
            
            # Apply all possible merges
            while True:
                pairs = self._get_pairs([word_processed])
                if not pairs:
                    break
                # Find the merge that appears first in our merge list
                best_pair = None
                for pair in self.merges:
                    if pair in pairs:
                        best_pair = pair
                        break
                if not best_pair:
                    break
                word_processed = self._merge_vocab([word_processed], best_pair)[0]
            
            tokens = word_processed.split()
            token_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]
            
            all_tokens.extend(tokens)
            all_token_ids.extend(token_ids)
        
        # Add padding if max_length is specified
        if max_length is not None:
            if len(all_token_ids) < max_length:
                all_token_ids += [self.vocab["<PAD>"]] * (max_length - len(all_token_ids))
            elif len(all_token_ids) > max_length:
                all_token_ids = all_token_ids[:max_length]
                
        return all_tokens, all_token_ids
        
    def decode(self, token_ids):
        """Decode indices back to text."""
        tokens = []
        for idx in token_ids:
            for token, token_id in self.vocab.items():
                if token_id == idx:
                    tokens.append(token)
                    break
        
        # Reconstruct words by splitting at stop_word tokens
        decoded_text = []
        current_word = []
        
        for token in tokens:
            if token.endswith(self.stop_word):
                current_word.append(token.replace(self.stop_word, ""))
                decoded_text.append("".join(current_word))
                current_word = []
            else:
                current_word.append(token)
        
        # Handle any remaining tokens (if no stop_word at end)
        if current_word:
            decoded_text.append("".join(current_word))
        
        return " ".join(decoded_text)
        
    def __len__(self):
        return len(self.vocab)
    
    
    def save(self, path: str):
        """Save the tokenizer to a JSON file"""
        import json
        save_data = {
            'vocab_size': self.vocab_size,
            'stop_word': self.stop_word,
            'merges': {f"{k[0]},{k[1]}": v for k, v in self.merges.items()},
            'vocab': self.vocab
        }
        with open(path, 'w') as f:
            json.dump(save_data, f, indent=2)

        print("Succesfully saved tokenzer")

    @classmethod
    def load(cls, path: str):
        """Load a tokenizer from a JSON file"""
        import json
        with open(path, 'r') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'], stop_word=data['stop_word'])

        # Convert string keys back to tuples
        tokenizer.merges = {
            tuple(k.split(',')): v for k, v in data['merges'].items()
        }

        tokenizer.vocab = data['vocab']
        return tokenizer

In [260]:
# Test tokenizer usage
corpus = "Tokenization is the process of breaking down text into smaller units"
test_tokenizer = BPETokenizer(vocab_size=100)
test_tokenizer.train(corpus)

# Encoding example
tokens, token_ids = test_tokenizer.encode("breaking down the text")
print("Encoded Tokens:", tokens)
print("Token IDs:", token_ids)

# Decoding example
decoded_text = test_tokenizer.decode(token_ids)
print("Decoded Text:", decoded_text)

Final Vocabulary: {'<PAD>': 0, '<UNK>': 1, 'T': 2, 'o': 3, 'k': 4, 'e': 5, 'n': 6, 'i': 7, 'z': 8, 'a': 9, 't': 10, '</w>': 11, 's': 12, 'h': 13, 'p': 14, 'r': 15, 'c': 16, 'f': 17, 'b': 18, 'g': 19, 'd': 20, 'w': 21, 'x': 22, 'm': 23, 'l': 24, 'u': 25, 's</w>': 26, 'ni': 27, 'n</w>': 28, 'in': 29, 'To': 30, 'Tok': 31, 'Toke': 32, 'Tokeni': 33, 'Tokeniz': 34, 'Tokeniza': 35, 'Tokenizat': 36, 'Tokenizati': 37, 'Tokenizatio': 38, 'Tokenization</w>': 39, 'is</w>': 40, 'th': 41, 'the': 42, 'the</w>': 43, 'pr': 44, 'pro': 45, 'proc': 46, 'proce': 47, 'proces': 48, 'process</w>': 49, 'of': 50, 'of</w>': 51, 'br': 52, 'bre': 53, 'brea': 54, 'break': 55, 'breakin': 56, 'breaking': 57, 'breaking</w>': 58, 'do': 59, 'dow': 60, 'down</w>': 61, 'te': 62, 'tex': 63, 'text': 64, 'text</w>': 65, 'int': 66, 'into': 67, 'into</w>': 68, 'sm': 69, 'sma': 70, 'smal': 71, 'small': 72, 'smalle': 73, 'smaller': 74, 'smaller</w>': 75, 'uni': 76, 'unit': 77, 'units</w>': 78}
Encoded Tokens: ['breaking</w>', 'd

In [261]:
CORPUS = "\n".join(df[REVIEWS].to_list())
CORPUS[:100]

'Bought these for my son and he LOVES them!!! He wears them almost every day! I will definitely be bu'

In [336]:
# Train the tokenizer
tokenizer = BPETokenizer(vocab_size=1024)
tokenizer.train(CORPUS)
print(len(tokenizer.vocab))

Final Vocabulary: {'<PAD>': 0, '<UNK>': 1, 'B': 2, 'o': 3, 'u': 4, 'g': 5, 'h': 6, 't': 7, '</w>': 8, 'e': 9, 's': 10, 'f': 11, 'r': 12, 'm': 13, 'y': 14, 'n': 15, 'a': 16, 'd': 17, 'L': 18, 'O': 19, 'V': 20, 'E': 21, 'S': 22, '!': 23, 'H': 24, 'w': 25, 'l': 26, 'v': 27, 'I': 28, 'i': 29, 'b': 30, 'p': 31, 'C': 32, '.': 33, 'c': 34, 'D': 35, "'": 36, 'P': 37, 'A': 38, 'z': 39, 'T': 40, 'k': 41, 'W': 42, ',': 43, '1': 44, 'j': 45, 'q': 46, '…': 47, '-': 48, '&': 49, 'Z': 50, '’': 51, 'J': 52, 'x': 53, 'G': 54, '5': 55, '0': 56, '(': 57, ')': 58, ':': 59, 'F': 60, '8': 61, '+': 62, '"': 63, 'M': 64, 'N': 65, 'R': 66, '9': 67, '‘': 68, '/': 69, '2': 70, '?': 71, ';': 72, '“': 73, '”': 74, '3': 75, '4': 76, 'Y': 77, '6': 78, 'Â': 79, '½': 80, 'U': 81, 'K': 82, '=': 83, '\\': 84, '7': 85, 'á': 86, '$': 87, 'X': 88, '%': 89, '#': 90, '*': 91, 'Q': 92, '~': 93, ']': 94, '—': 95, '>': 96, '¹': 97, '[': 98, '`': 99, '–': 100, 'e</w>': 101, 'th': 102, 't</w>': 103, 's</w>': 104, 'd</w>': 105, '.

In [337]:
# Create a Dataset
class TextDataset(Dataset):
    def __init__(self, X: list, y: list, tokenizer: BPETokenizer, max_length: int=None):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        # Get current text and label
        text = self.X[idx]
        label = self.y[idx]
        
        tokens, token_ids = self.tokenizer.encode(text, self.max_length)
        
        if len(tokens) > self.max_length:
            print(f"WARNING: Token greater than max length, truncating")
        
        # Convert to tensor immediately
        token_ids = torch.tensor(token_ids, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)
        
        return token_ids, label
    
    @staticmethod
    def collate_fn(batch):
        # Separate inputs and labels
        input_ids = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        
        # Pad sequences
        padded_inputs = pad_sequence(input_ids, batch_first=True, padding_value=0)
        
        # Stack labels (now using list instead of generator)
        labels = torch.stack(labels)
        
        return padded_inputs, labels
    
# Variables setup
X = df[REVIEWS].to_list()
y = df[LABELS].to_list()
MAX_LENGTH = 500 # Maximum sequence length
test_size = int(len(y) * 0.2)
train_size = int(len(y) - test_size)

# Initialize the dataset
full_dataset = TextDataset(X=X, y=y, tokenizer=tokenizer, max_length=MAX_LENGTH)

# Generator for spliting dataset
generator = torch.Generator().manual_seed(42)
# Train test split
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size], generator=generator)
print(f"TRAIN SIZE: {len(train_dataset)}\nTEST SIZE: {len(test_dataset)}")

TRAIN SIZE: 2369
TEST SIZE: 592


In [338]:
train_dataset[1000]

(tensor([367, 641, 699, 115, 220, 224,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0

In [339]:
tokenizer.decode(train_dataset[1000][0].tolist())

'Great shoe, light and very comfortable <PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>

In [340]:
# Dataloaders for training
BATCH_SIZE = 32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=TextDataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=TextDataset.collate_fn)# Data

In [341]:
# Dataloader test
next(iter(test_dataloader))

(tensor([[ 561,  250,    0,  ...,    0,    0,    0],
         [ 359,  224,  115,  ...,    0,    0,    0],
         [ 116,  407,  220,  ...,    0,    0,    0],
         ...,
         [ 380,  408,  112,  ...,    0,    0,    0],
         [ 380, 1014,  187,  ...,    0,    0,    0],
         [1007,   30,  884,  ...,    0,    0,    0]]),
 tensor([4, 4, 4, 3, 4, 4, 4, 3, 4, 1, 0, 4, 4, 4, 4, 4, 4, 4, 4, 0, 2, 4, 4, 4,
         4, 4, 4, 4, 4, 1, 2, 4]))

In [342]:
# The model
torch.manual_seed(42)
torch.cuda.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Multiple convolutional layers with different filter sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, 
                      out_channels=num_filters, 
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x = [batch size, seq len]
        embedded = self.embedding(x)
        # embedded = [batch size, seq len, emb dim]
        
        # Conv1d expects [batch size, channels, seq len]
        embedded = embedded.permute(0, 2, 1)
        # embedded = [batch size, emb dim, seq len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # conved_n = [batch size, num_filters, seq len - filter_size + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled_n = [batch size, num_filters]
        
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat = [batch size, num_filters * len(filter_sizes)]
        
        return self.fc(cat)


In [343]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Hyperparameters
EMBEDDING_DIM = MAX_LENGTH
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = len(set(y))  # Number of classes
DROPOUT = 0.5
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
EPOCHS = 10

# Initialize model
model = TextCNN(
    vocab_size=len(tokenizer.vocab),
    embedding_dim=EMBEDDING_DIM,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT
)

In [344]:
model

TextCNN(
  (embedding): Embedding(1024, 500)
  (convs): ModuleList(
    (0): Conv1d(500, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(500, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(500, 100, kernel_size=(5,), stride=(1,))
  )
  (fc): Linear(in_features=300, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [345]:
summary(model)

Layer (type:depth-idx)                   Param #
TextCNN                                  --
├─Embedding: 1-1                         512,000
├─ModuleList: 1-2                        --
│    └─Conv1d: 2-1                       150,100
│    └─Conv1d: 2-2                       200,100
│    └─Conv1d: 2-3                       250,100
├─Linear: 1-3                            1,505
├─Dropout: 1-4                           --
Total params: 1,113,805
Trainable params: 1,113,805
Non-trainable params: 0

In [346]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [372]:
# Training and test functions
def train(model, dataloader, optimizer, criterion):
    model.to(device).train()
    epoch_loss = 0
    epoch_acc = 0
    
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        
        predictions = model(X)
        loss = criterion(predictions, y)
        
        acc = accuracy_score(y.cpu().numpy(), predictions.argmax(dim=1).cpu().numpy())
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
    
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.inference_mode():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            
            predictions = model(X)
            loss = criterion(predictions, y)
            
            acc = accuracy_score(y.cpu().numpy(), predictions.argmax(dim=1).cpu().numpy())
            
            epoch_loss += loss.item()
            epoch_acc += acc
    
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)


In [348]:
# Training loop
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_dataloader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

# Save the model
torch.save(model.state_dict(), 'textcnn_model.pt')

Epoch: 01
	Train Loss: 1.154 | Train Acc: 66.42%
	Test Loss: 1.073 | Test Acc: 71.22%
Epoch: 02
	Train Loss: 0.765 | Train Acc: 74.38%
	Test Loss: 1.026 | Test Acc: 71.55%
Epoch: 03
	Train Loss: 0.595 | Train Acc: 80.71%
	Test Loss: 0.855 | Test Acc: 73.19%
Epoch: 04
	Train Loss: 0.445 | Train Acc: 84.62%
	Test Loss: 0.958 | Test Acc: 72.86%
Epoch: 05
	Train Loss: 0.355 | Train Acc: 88.21%
	Test Loss: 0.938 | Test Acc: 73.36%
Epoch: 06
	Train Loss: 0.305 | Train Acc: 90.08%
	Test Loss: 0.857 | Test Acc: 73.19%
Epoch: 07
	Train Loss: 0.282 | Train Acc: 90.96%
	Test Loss: 0.904 | Test Acc: 73.03%
Epoch: 08
	Train Loss: 0.244 | Train Acc: 92.67%
	Test Loss: 1.018 | Test Acc: 73.52%
Epoch: 09
	Train Loss: 0.194 | Train Acc: 93.54%
	Test Loss: 0.944 | Test Acc: 73.03%
Epoch: 10
	Train Loss: 0.184 | Train Acc: 94.08%
	Test Loss: 0.976 | Test Acc: 73.19%


In [397]:
# Metrics
y_test = []
y_pred = []
with torch.inference_mode():
    for X, y in test_dataloader:
        X, y = X.to(device), y.to(device)
        pred = model(X)
        pred = pred.argmax(dim=1).unsqueeze(dim=1).cpu().tolist()
        y_pred.extend(pred)
        y_test.extend(y.unsqueeze(dim=1).cpu().tolist())
y_pred



[[4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [1],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [3],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [1],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [2],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [3],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [3],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [3],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [2],
 [4],
 [3],
 [4],
 [2],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [2],
 [4],
 [4],
 [3],
 [4],
 [4],
 [4],
 [1],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [0],
 [4],
 [3],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [1],
 [3],
 [4],
 [4]

In [398]:
len(y_test), len(y_pred)

(592, 592)

In [399]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7297297297297297
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.12      0.20        43
           1       0.31      0.12      0.17        34
           2       0.35      0.14      0.20        43
           3       0.33      0.18      0.23        51
           4       0.77      0.97      0.86       421

    accuracy                           0.73       592
   macro avg       0.48      0.30      0.33       592
weighted avg       0.67      0.73      0.67       592



In [364]:
tokenizer.save("tokenizer_config.json")

In [367]:
tokenizer2 = BPETokenizer.load("tokenizer_config.json")

In [368]:
tokenizer2.__dict__

{'vocab_size': 1024,
 'merges': {('e', '</w>'): 'e</w>',
  ('t', 'h'): 'th',
  ('t', '</w>'): 't</w>',
  ('s', '</w>'): 's</w>',
  ('d', '</w>'): 'd</w>',
  ('.', '</w>'): '.</w>',
  ('y', '</w>'): 'y</w>',
  ('e', 'r'): 'er',
  ('a', 'n'): 'an',
  ('i', 'n'): 'in',
  ('o', 'r'): 'or',
  ('a', 'r'): 'ar',
  ('o', '</w>'): 'o</w>',
  ('th', 'e</w>'): 'the</w>',
  ('an', 'd</w>'): 'and</w>',
  ('I', '</w>'): 'I</w>',
  ('th', 'e'): 'the',
  ('s', 'h'): 'sh',
  ('o', 'n'): 'on',
  ('a', 'l'): 'al',
  ('', '', '</w>'): ',</w>',
  ('o', 'u'): 'ou',
  ('r', 'e'): 're',
  ('f', 'or'): 'for',
  ('a', '</w>'): 'a</w>',
  ('sh', 'o'): 'sho',
  ('o', 'm'): 'om',
  ('in', 'g'): 'ing',
  ('e', 'n'): 'en',
  ('er', '</w>'): 'er</w>',
  ('t', 'o</w>'): 'to</w>',
  ('ing', '</w>'): 'ing</w>',
  ('e', 's</w>'): 'es</w>',
  ('i', 's</w>'): 'is</w>',
  ('e', 'd</w>'): 'ed</w>',
  ('f', '</w>'): 'f</w>',
  ('s', 'e</w>'): 'se</w>',
  ('T', 'h'): 'Th',
  ('h', 'a'): 'ha',
  ('o', 'o'): 'oo',
  ('v', 'e</w>