# Test Module

In [6]:
from byte_pair_encoding.bpe import BytePairEncoder
import nltk
import os
import json
from collections import Counter

## BPE Tokenization

In [7]:
# Load the trained BPE vocabulary
vocab_file = 'dataset/train/bpe_train_vocab.txt'
bpe = BytePairEncoder()
with open(vocab_file, 'r', encoding='utf-8') as f:
    vocab = {}
    for line in f:
        token, freq = line.strip().split(': ')
        vocab[token] = int(freq)
    bpe.vocab = vocab

In [8]:
# Directory containing the test books
test_dir = 'dataset/test'

### Default Tokenization

In [9]:
# Initialize NLTK's Punkt tokenizer
def_tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
# Store a reference to the test default tokens
ref_test_tokens = {}

### Evaluate on the test dataset

In [10]:
# Evaluate on test books
total_tokens = 0
correct_tokens = 0
total_words = 0

for file_name in os.listdir(test_dir):
    with open(os.path.join(test_dir, file_name), 'r', encoding='utf-8') as file:
        text = file.read()
        
        # Encode the text using the trained BPE vocabulary
        encoded_text = bpe.encode(text)

        # Decode the encoded text
        decoded_text = bpe.decode(encoded_text)

        # BPE Tokenization accuracy and coverage
        original_tokens = len(text.split())
        total_tokens += original_tokens
        decoded_tokens = len(decoded_text.split())
        correct_tokens += sum(1 for token in decoded_text.split() if token.isdigit())
        total_words += len(decoded_text.split())

        print(f"File: {file_name}")
        print(f"Original Text:\n{text[:100]}...")
        print(f"Decoded Text:\n{decoded_text[:100]}...")
        print("")


File: sherlock-holmes.txt
Original Text:
﻿The Project Gutenberg eBook of The Adventures of Sherlock Holmes
    
This ebook is for the use of ...
Decoded Text:
﻿T<31> Project Gu<4416><106>rg eBook <13> T<31> Adventures <13> <15600><31>rlock Holmes
    
This e<...

File: frankenstein.txt
Original Text:
﻿The Project Gutenberg eBook of Frankenstein; Or, The Modern Prometheus
    
This ebook is for the u...
Decoded Text:
﻿T<31> Project Gu<4416><106>rg eBook <13> <15345>rank<10387>ste<121>; Or<9431> T<31> <15792>o<15751>...

File: dracula.txt
Original Text:
﻿The Project Gutenberg eBook of Dracula
    
This ebook is for the use of anyone anywhere in the Uni...
Decoded Text:
﻿T<31> Project Gu<4416><106>rg eBook <13> <12748>acula
    
This e<1807> is for <18> <901>e <13> any...



### BPE Metrics

In [11]:
# Calculate metrics
original_tokens_set = set(text.split())
decoded_tokens_set = set(decoded_text.split())

TP = len(decoded_tokens_set.intersection(original_tokens_set))
FP = len(decoded_tokens_set - original_tokens_set)
FN = len(original_tokens_set - decoded_tokens_set)

# Calculate precision, recall, F1 score
precision = TP / (TP + FP) if TP + FP > 0 else 0
recall = TP / (TP + FN) if TP + FN > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
# Calculate Jaccard similarity
jaccard_similarity = len(decoded_tokens_set.intersection(original_tokens_set)) / len(decoded_tokens_set.union(original_tokens_set))
# Calculate tokenization accuracy
tokenization_accuracy = correct_tokens / total_tokens * 100
# Calculate tokenization coverage
coverage = (len(decoded_tokens_set) / len(original_tokens_set)) * 100

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")
print(f"Jaccard Similarity: {jaccard_similarity:.2f}")
print(f"Tokenization Accuracy: {tokenization_accuracy:.2f}%")
print(f"Coverage: {coverage:.2f}%")


Precision: 0.40
Recall: 0.40
F1 Score: 0.40
Jaccard Similarity: 0.25
Tokenization Accuracy: 0.04%
Coverage: 100.00%


## Reference Tokenization

In [12]:
for file_name in os.listdir(test_dir):
    with open(os.path.join(test_dir, file_name), 'r', encoding='utf-8') as file:
        text = file.read()
        
        # Tokenize with the default punkt tokenizer
        def_tokenized_text = def_tokenizer.tokenize(text)
        ref_test_tokens[file_name] = def_tokenized_text

### Save Reference Tokens

In [13]:
# Save the tokenized results in a structured format
output_file = 'dataset/reference_punkt_tokens.json'
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(ref_test_tokens, file, indent=4)

print(f"Reference tokenization saved to {output_file}")

Reference tokenization saved to dataset/reference_punkt_tokens.json


## Baseline Tokenization

In [14]:
# Load reference tokenization
with open('dataset/reference_punkt_tokens.json', 'r', encoding='utf-8') as file:
    reference_tokenization = json.load(file)

In [15]:
tokenizer = nltk.tokenize.word_tokenize

In [16]:
# Initialize variables for evaluation metrics
total_tokens_bpe = 0
correct_tokens_bpe = 0
total_tokens_default = 0
correct_tokens_default = 0

In [17]:
# Evaluate BPE algorithm and NLTK's default tokenizer
for file_name, tokenized_text in reference_tokenization.items():
    with open(os.path.join(test_dir, file_name), 'r', encoding='utf-8') as file:
        text = file.read()

        # Tokenize with BPE algorithm
        encoded_text = bpe.encode(text)
        decoded_text = bpe.decode(encoded_text)
        decoded_tokens_bpe = decoded_text.split()

        # Tokenize with NLTK's default method
        default_tokens = tokenizer(text)

        # Calculate metrics for BPE
        total_tokens_bpe += len(decoded_tokens_bpe)
        correct_tokens_bpe += sum(1 for token in decoded_tokens_bpe if token in tokenized_text)

        # Calculate metrics for NLTK's default method
        total_tokens_default += len(default_tokens)
        correct_tokens_default += sum(1 for token in default_tokens if token in tokenized_text)


In [18]:
# Calculate accuracy, coverage, precision, recall, F1 score, and Jaccard similarity for both methods
accuracy_bpe = (correct_tokens_bpe / total_tokens_bpe) * 100 if total_tokens_bpe > 0 else 0
accuracy_default = (correct_tokens_default / total_tokens_default) * 100 if total_tokens_default > 0 else 0

precision_bpe = (correct_tokens_bpe / total_tokens_bpe) if total_tokens_bpe > 0 else 0
precision_default = (correct_tokens_default / total_tokens_default) if total_tokens_default > 0 else 0

recall_bpe = (correct_tokens_bpe / sum(len(tokenized_text) for tokenized_text in reference_tokenization.values())) if total_tokens_bpe > 0 else 0
recall_default = (correct_tokens_default / sum(len(tokenized_text) for tokenized_text in reference_tokenization.values())) if total_tokens_default > 0 else 0

f1_score_bpe = 2 * (precision_bpe * recall_bpe) / (precision_bpe + recall_bpe) if (precision_bpe + recall_bpe) > 0 else 0
f1_score_default = 2 * (precision_default * recall_default) / (precision_default + recall_default) if (precision_default + recall_default) > 0 else 0

jaccard_similarity_bpe = correct_tokens_bpe / total_tokens_bpe if total_tokens_bpe > 0 else 0
jaccard_similarity_default = correct_tokens_default / total_tokens_default if total_tokens_default > 0 else 0


In [19]:
print("BPE Algorithm Metrics:")
print(f"Accuracy: {accuracy_bpe:.2f}%")
print(f"Precision: {precision_bpe:.2f}")
print(f"Recall: {recall_bpe:.2f}")
print(f"F1 Score: {f1_score_bpe:.2f}")
print(f"Jaccard Similarity: {jaccard_similarity_bpe:.2f}")

print("\nNLTK's Default Tokenizer Metrics:")
print(f"Accuracy: {accuracy_default:.2f}%")
print(f"Precision: {precision_default:.2f}")
print(f"Recall: {recall_default:.2f}")
print(f"F1 Score: {f1_score_default:.2f}")
print(f"Jaccard Similarity: {jaccard_similarity_default:.2f}")

BPE Algorithm Metrics:
Accuracy: 0.24%
Precision: 0.00
Recall: 0.05
F1 Score: 0.00
Jaccard Similarity: 0.00

NLTK's Default Tokenizer Metrics:
Accuracy: 0.20%
Precision: 0.00
Recall: 0.05
F1 Score: 0.00
Jaccard Similarity: 0.00
