In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import re
import numpy as np
import time

from torch.utils.data import Dataset, DataLoader, TensorDataset

from tqdm import tqdm
import matplotlib.pyplot as plt
import time
from collections import deque

from sklearn.model_selection import GridSearchCV
import random


In [2]:
import random

def read_file_in_chunks(file_path, chunk_size=10000):
    """Read a text file as a list of lower-cased, fixed-size character chunks.

    The file is opened as UTF-8 (undecodable bytes are ignored) and read
    ``chunk_size`` characters at a time until a read returns nothing.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        list[str]: The lower-cased chunks in file order; the final chunk may
        be shorter than ``chunk_size``.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        # iter() with a sentinel keeps calling read() until it returns ''.
        pieces = iter(lambda: handle.read(chunk_size), '')
        return [piece.lower() for piece in pieces]

def split_and_save_chunks(chunks, test_ratio=0.2, train_file_path='train_file.txt', test_file_path='test_file.txt', seed=None):
    """Randomly split text chunks into a train file and a test file.

    The chunks are shuffled, the first ``int(len(chunks) * test_ratio)`` of
    them are written (concatenated, no separator) to ``test_file_path`` and
    the remainder to ``train_file_path``. Both files are overwritten.

    Args:
        chunks: Sequence of text chunks. It is NOT modified; a shuffled copy
            is used internally.
        test_ratio: Fraction of chunks assigned to the test file.
        train_file_path: Destination path for the training text.
        test_file_path: Destination path for the test text.
        seed: Optional seed for a private random generator so the split can
            be reproduced. When ``None`` the module-level ``random`` state is
            used, so a prior ``random.seed(...)`` call still takes effect.

    Returns:
        tuple[int, int, int, int]: ``(number of train chunks, number of test
        chunks, train character count, test character count)``.
    """
    rng = random.Random(seed) if seed is not None else random

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(chunks)
    rng.shuffle(shuffled)

    split_index = int(len(shuffled) * test_ratio)
    test_chunks = shuffled[:split_index]
    train_chunks = shuffled[split_index:]

    with open(train_file_path, 'w', encoding='utf-8', errors='ignore') as train_file:
        train_file.writelines(train_chunks)

    with open(test_file_path, 'w', encoding='utf-8', errors='ignore') as test_file:
        test_file.writelines(test_chunks)

    train_length = sum(len(chunk) for chunk in train_chunks)
    test_length = sum(len(chunk) for chunk in test_chunks)

    return len(train_chunks), len(test_chunks), train_length, test_length

# Example usage: read the corpus in fixed-size chunks and split it into
# train/test files on chunk boundaries.
file_path = 'HarryPotter.txt'
chunk_size = 10000  # Number of characters per chunk
chunks = read_file_in_chunks(file_path, chunk_size)

# Corpus statistics before splitting.
total_text_length = sum(len(chunk) for chunk in chunks)
num_chunks = len(chunks)

# Split the chunks and save them to separate files.
# Note: split_and_save_chunks returns chunk COUNTS, so train_chunks and
# test_chunks below are integers, not lists of chunks.
train_chunks, test_chunks, train_length, test_length = split_and_save_chunks(chunks, test_ratio=0.05, train_file_path='train_file1.txt', test_file_path='test_file1.txt')

print("Total text length:", total_text_length)
print("Total train text length:", train_length)
print("Total test text length:", test_length)
print('\n')

print("Number of chunks:", num_chunks)
print("Number of chunks in training file:", train_chunks)
print("Number of chunks in test file:", test_chunks)



FileNotFoundError: [Errno 2] No such file or directory: 'HarryPotter.txt'

In [3]:

class TextProcessor:
    """Tokenise a text file and build next-word-prediction training data.

    On construction the file is read and lower-cased, cleaned, tokenised into
    words and punctuation marks, and a vocabulary is built in order of first
    appearance. Index 0 is reserved for the unknown-word token ``'<UNK>'``.

    Attributes:
        vocab: List of known tokens; position equals the token's index.
        text: The tokenised corpus (list of strings).
        sequence_length: Length of each sliding window produced by
            ``generate_sequences`` (inputs plus the one target token).
    """

    def __init__(self, file_path, sequence_length):
        """Load ``file_path``, tokenise it and build the vocabulary.

        Args:
            file_path: Path of the UTF-8 text file to process.
            sequence_length: Number of tokens per training window.
        """
        self.vocab = []
        self._i2w = {}
        self._w2i = {}

        self.file_path = file_path
        self.unprocessed_text = self._load_file(self.file_path)
        self.text = self.process_text(self.unprocessed_text)
        self.create_vocab(self.text)

        self.sequence_length = sequence_length
        self._vocablength = len(self.vocab)
        print(self._vocablength)

    def _load_file(self, file_path):
        """Return the whole file as one lower-cased string (bad bytes ignored)."""
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            return file.read().lower()

    def _clean_text(self, text):
        """Replace every character other than letters and ``' , . !`` with a space.

        NOTE(review): '?' is replaced here even though ``_tokenize`` accepts it,
        so question marks never reach the vocabulary. Left unchanged because
        altering it would change the vocabulary of already-trained models.
        """
        text = re.sub(r"[^a-zA-Z',.!]", ' ', text)
        return text

    def _tokenize(self, text):
        """Split text into word tokens (with optional apostrophe suffix) and punctuation."""
        tokens = re.findall(r"\b\w+(?:'\w+)?\b|[.!,?]", text)
        return tokens

    def process_text(self, text):
        """Clean ``text`` and return it as a list of tokens."""
        return self._tokenize(self._clean_text(text))

    def _add_word(self, word):
        """Append ``word`` to the vocabulary and register both index mappings."""
        idx = len(self.vocab)
        self.vocab.append(word)
        self._i2w[idx] = word
        self._w2i[word] = idx

    def create_vocab(self, text):
        """Add every unseen token of ``text`` to the vocabulary.

        ``'<UNK>'`` is registered first (index 0 on a fresh instance). New
        tokens get the next free index in order of first appearance. Calling
        the method again only appends tokens that are not yet known.

        Args:
            text: Iterable of tokens.
        """
        # Add an unknown word
        if '<UNK>' not in self._w2i:
            self._add_word('<UNK>')

        for word in text:
            # Dict lookup instead of scanning the vocab list keeps this linear.
            if word not in self._w2i:
                self._add_word(word)

    def text_to_indices(self, text):
        """Map tokens to vocabulary indices; unknown tokens map to 0 (``'<UNK>'``)."""
        return [self._w2i.get(word, 0) for word in text]

    def generate_sequences(self):
        """Return all sliding windows of ``sequence_length`` token indices.

        Returns:
            numpy.ndarray: int64 array of shape
            ``(len(text) - sequence_length + 1, sequence_length)``. When the
            text is shorter than one window the array has shape
            ``(0, sequence_length)`` so that 2-D slicing still works.
        """
        window = self.sequence_length
        # Convert the corpus once instead of re-mapping every window.
        indices = self.text_to_indices(self.text)

        # +1 so that the final full window (ending on the last token) is kept.
        n_windows = len(indices) - window + 1
        if n_windows <= 0:
            return np.empty((0, window), dtype=np.int64)

        return np.array([indices[i:i + window] for i in range(n_windows)], dtype=np.int64)

    def generate_input_target_sequences(self):
        """Split each window into its context and the token to predict.

        Returns:
            tuple: ``(inputs, targets)`` where ``inputs`` holds the first
            ``sequence_length - 1`` indices of every window and ``targets``
            holds the last index of every window.
        """
        sequences = self.generate_sequences()
        inputs = sequences[:, :-1]  # All but the last token of each window
        targets = sequences[:, -1]   # The last token of each window is the target
        return inputs, targets

    def generate_dataloader(self, batch_size):
        """Wrap the input/target windows in a shuffling ``DataLoader``.

        Args:
            batch_size: Number of windows per batch.

        Returns:
            torch.utils.data.DataLoader over ``(inputs, targets)`` pairs.
        """
        inputs, targets = self.generate_input_target_sequences()

        # Convert numpy arrays to PyTorch tensors
        inputs = torch.LongTensor(inputs)
        targets = torch.LongTensor(targets)

        # Create DataLoader for batching and shuffling data
        dataset = TensorDataset(inputs, targets)
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        return train_loader




In [4]:
# Tokenise 'test_file.txt' and prepare 8-token windows.
# NOTE(review): the split cell above writes 'test_file1.txt' - confirm that
# 'test_file.txt' is the intended input here.
tp = TextProcessor(file_path='test_file.txt', sequence_length=8)

6012


In [19]:
# Number of tokens produced by TextProcessor.process_text for the loaded file.
len(tp.text)

62983

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import re
import numpy as np
import time

from torch.utils.data import Dataset, DataLoader, TensorDataset

from tqdm import tqdm
import matplotlib.pyplot as plt
import time
from collections import deque

from sklearn.model_selection import GridSearchCV
import random


In [2]:

class DeepNetwork(nn.Module):
    """Embedding -> multi-layer GRU -> linear next-token classifier.

    Args:
        vocab_size: Number of tokens; size of the embedding table and of the
            output logits.
        embedding_dim: Dimension of each token embedding.
        hidden_size: Number of hidden units per GRU layer.
        num_layers: Number of stacked GRU layers.
        drop_out: Dropout probability passed to ``nn.GRU``.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_size=256, num_layers=2, drop_out=0.0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, dropout=drop_out)
        self.final = nn.Linear(hidden_size, vocab_size)

    def forward(self, t, hidden=None):
        """Predict logits for the token following each input sequence.

        Args:
            t: Integer tensor of token indices, shape ``(batch, seq_len)``
                (the GRU is built with ``batch_first=True``).
            hidden: Initial GRU state of shape
                ``(num_layers, batch, hidden_size)``. ``None`` (the default)
                lets ``nn.GRU`` start from an all-zero state, so the model can
                be called with the input tensor alone.

        Returns:
            tuple: ``(logits, hidden)`` where ``logits`` has shape
            ``(batch, vocab_size)`` and is computed from the GRU output at the
            last time step, and ``hidden`` is the final GRU state.
        """
        embeddings = self.embedding(t)
        out, hidden = self.rnn(embeddings, hidden)
        # Only the last time step is used to predict the next token.
        out = self.final(out[:, -1, :])
        return out, hidden



In [11]:
class Evaluate:
    """Load a trained next-word model from a checkpoint and evaluate it.

    The checkpoint supplies the vocabulary mappings, the model
    hyper-parameters and weights, the recorded training metrics and the
    index-encoded evaluation text.
    """

    def __init__(self, file_path, device='cpu'):
        """Load the checkpoint at ``file_path`` onto ``device``."""
        self.device = device
        self.load_model(file_path)

    def load_model(self, file_path):
        """Restore vocabulary, model and metadata from a checkpoint file.

        Args:
            file_path: Path of a checkpoint written with ``torch.save``. It
                must contain the keys read below.
        """
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        checkpoint = torch.load(file_path, map_location=self.device)

        # Reconstruct the text processor
        self._w2i = checkpoint['w2i']
        self._i2w = checkpoint['i2w']

        # Retrieve model parameters
        embedding_dim = checkpoint['embedding_dim']
        hidden_size = checkpoint['hidden_size']
        num_layers = checkpoint['num_layers']
        vocab_size = checkpoint['vocab_size']
        drop_out = checkpoint['drop_out']

        # Reconstruct the model with the same hyper-parameters it was trained
        # with (including drop_out, which was previously read but not used).
        self.model = DeepNetwork(
            vocab_size, embedding_dim, hidden_size,
            num_layers=num_layers, drop_out=drop_out,
        ).to(self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        # Load losses and accuracies
        self.losses = checkpoint['losses']
        self.accuracies = checkpoint['accuracies']
        self.accuracies_words = checkpoint['accuracies_words']
        self.epoch_acc = checkpoint['epoch_acc']

        self.sequence_length = checkpoint['sequence_length']
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.eval_text = checkpoint['eval_text']

        print(f"Model loaded from {file_path}")

    def _context_length(self):
        """Number of preceding tokens fed to the model (at least 1).

        Training windows hold ``sequence_length`` tokens of which the last is
        the target, so the context is one token shorter.
        """
        return max(self.sequence_length - 1, 1)

    def generate_text(self, seed_text, next_words=50, top_k=5):
        """Extend ``seed_text`` by sampling ``next_words`` tokens from the model.

        At every step the most recent context tokens are fed to the model from
        a fresh hidden state and the next token is sampled from the ``top_k``
        most probable candidates, weighted by their renormalised probability.

        Args:
            seed_text: Starting text; it is lower-cased and split on
                whitespace. Unknown words map to index 0 (``'<UNK>'``); an
                empty seed is treated as a single unknown token.
            next_words: Number of tokens to generate.
            top_k: Number of candidates to sample from (clamped to
                ``[1, vocabulary size]``).

        Returns:
            str: The seed words followed by the generated words, space-joined.
        """
        self.model.eval()
        words = seed_text.lower().split()
        context_len = self._context_length()

        for _ in range(next_words):
            context = words[-context_len:]
            # Fall back to <UNK> when there is no context at all.
            input_indices = [self._w2i.get(word, 0) for word in context] or [0]
            input_tensor = torch.LongTensor([input_indices]).to(self.device)

            with torch.no_grad():
                # The whole context is re-fed each step, so no state is carried over.
                logits, _state = self.model(input_tensor, None)

            # The model already returns last-step logits of shape (1, vocab).
            probabilities = F.softmax(logits.squeeze(0), dim=0).cpu().numpy()

            k = max(1, min(top_k, len(probabilities)))
            top_indices = np.argsort(probabilities)[::-1][:k]
            top_probs = probabilities[top_indices]

            predicted_index = np.random.choice(top_indices, p=top_probs / top_probs.sum())
            words.append(self._i2w[int(predicted_index)])

        return ' '.join(words)

    def _generate_words(self, input_indices, neighbours, hidden):
        """Return the ``neighbours`` most probable next tokens for a context.

        Args:
            input_indices: Iterable of context token indices.
            neighbours: Number of candidates to return; values <= 0 yield
                empty results.
            hidden: Initial hidden state passed to the model.

        Returns:
            tuple: ``(top_indices, top_words, hidden)`` with candidates
            ordered from most to least probable and the model's final state.
        """
        input_tensor = torch.LongTensor([list(input_indices)]).to(self.device)
        with torch.no_grad():
            output, hidden = self.model(input_tensor, hidden)

        probabilities = F.softmax(output.squeeze(0), dim=0).cpu().numpy()

        # Descending order first, then cut, so that neighbours == 0 gives
        # nothing (a "[-0:]" slice would return the whole vocabulary).
        top_indices = np.argsort(probabilities)[::-1][:max(neighbours, 0)]
        top_words = [self._i2w[int(idx)] for idx in top_indices]

        return top_indices, top_words, hidden

    def evaluate_text(self, neighbours=100):
        """Measure suggestion quality over the checkpoint's evaluation text.

        For every token after the first, the model predicts ``neighbours``
        candidates from the preceding context and ``evaluate_options`` scores
        whether (and after how many typed characters) the true word would be
        suggested. Both percentages are printed.

        Args:
            neighbours: Number of model candidates considered per position.

        Returns:
            tuple[float, float]: ``(saved keystrokes / total keystrokes,
            found words / total words)``; ``(0.0, 0.0)`` when the evaluation
            text has fewer than two tokens.
        """
        saved_keystrokes = 0
        total_keystrokes = 0

        found_words = 0
        total_words = 0

        if len(self.eval_text) > 0:
            # Rolling context window; old tokens fall out automatically.
            input_indices = deque([self.eval_text[0]], maxlen=self._context_length())

            for index in self.eval_text[1:]:
                # Fresh state each step: the full context window is re-fed.
                hidden = torch.zeros(self.num_layers, 1, self.hidden_size).to(self.device)
                _, top_words, _ = self._generate_words(input_indices, neighbours, hidden)

                target_word = self._i2w[index]
                fw, sk = self.evaluate_options(target_word, top_words)

                saved_keystrokes += sk
                found_words += fw

                total_words += 1
                total_keystrokes += len(target_word)
                input_indices.append(index)  # The true token becomes context

        keystroke_ratio = saved_keystrokes / total_keystrokes if total_keystrokes else 0.0
        found_ratio = found_words / total_words if total_words else 0.0

        print(f'Saved keystroke percentage: {100*keystroke_ratio:.2f}%')
        print(f'Found words percentage: {100*found_ratio:.2f}%')
        return keystroke_ratio, found_ratio

    def evaluate_options(self, target_word, options, nr_of_suggestions=5):
        """Score how many keystrokes a ranked suggestion list would save.

        Simulates typing ``target_word`` one character at a time. After each
        typed prefix the candidates are narrowed to those sharing the prefix
        (keeping their rank order), and the word counts as suggested once it
        appears among the first ``nr_of_suggestions`` remaining candidates.

        Args:
            target_word: The word the user wants to type.
            options: Candidate words ordered from most to least probable.
            nr_of_suggestions: How many candidates are shown at once.

        Returns:
            tuple[int, int]: ``(found_word, saved_keystrokes)``.
            ``found_word`` is 1 when the target is suggested at some point,
            and ``saved_keystrokes`` is the number of characters that did not
            have to be typed. ``(0, 0)`` is returned when the target is not
            in ``options`` or never reaches the visible suggestions.
        """
        if target_word not in options:
            return 0, 0

        for typed in range(len(target_word) + 1):
            prefix = target_word[:typed]
            options = [word for word in options if word[:typed] == prefix]
            if target_word in options[:nr_of_suggestions]:
                return 1, len(target_word) - typed

        return 0, 0




In [12]:
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [13]:
# Restore the trained model and its evaluation data from the checkpoint.
evale = Evaluate(file_path='model.pth', device=device)

Model loaded from model.pth


In [14]:
# Sweep the number of model candidates considered per position. Each call
# prints its two percentages; the returned ratios are kept per setting.
NEIGHBOUR_COUNTS = (5, 20, 50, 100, 200, 500, 1000, 2000)

results = {
    neighbours: evale.evaluate_text(neighbours=neighbours)
    for neighbours in NEIGHBOUR_COUNTS
}

# Show the ratios for the largest setting as the cell's value.
results[NEIGHBOUR_COUNTS[-1]]

Saved keystroke percentage: 28.81%
Found words percentage: 42.47%
Saved keystroke percentage: 40.83%
Found words percentage: 59.67%
Saved keystroke percentage: 49.32%
Found words percentage: 70.12%
Saved keystroke percentage: 55.16%
Found words percentage: 76.95%
Saved keystroke percentage: 60.04%
Found words percentage: 82.50%
Saved keystroke percentage: 65.22%
Found words percentage: 88.32%
Saved keystroke percentage: 68.10%
Found words percentage: 91.58%
Saved keystroke percentage: 70.27%
Found words percentage: 94.16%


(0.7027321874469684, 0.9415547299228351)

In [None]:
# Reload the checkpoint and evaluate once with 100 candidates per position.
evale = Evaluate(file_path='model.pth', device=device)
evale.evaluate_text(neighbours=100)