# Model and Plaintext Recovery
The first segment of this notebook involves the definition and training of the character level language model. The second segment contains the implementation of the plaintext recovery of the test set.

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [2]:
!pip install -U skorch
import skorch
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

Collecting skorch
[?25l  Downloading https://files.pythonhosted.org/packages/42/21/4936b881b33de285faa0b36209afe4f9724a0875b2225abdc63b23d384a3/skorch-0.8.0-py3-none-any.whl (113kB)
[K     |██▉                             | 10kB 27.4MB/s eta 0:00:01[K     |█████▊                          | 20kB 6.1MB/s eta 0:00:01[K     |████████▋                       | 30kB 7.5MB/s eta 0:00:01[K     |███████████▌                    | 40kB 8.3MB/s eta 0:00:01[K     |██████████████▍                 | 51kB 7.1MB/s eta 0:00:01[K     |█████████████████▎              | 61kB 8.1MB/s eta 0:00:01[K     |████████████████████▏           | 71kB 8.3MB/s eta 0:00:01[K     |███████████████████████         | 81kB 8.7MB/s eta 0:00:01[K     |██████████████████████████      | 92kB 8.1MB/s eta 0:00:01[K     |████████████████████████████▉   | 102kB 8.3MB/s eta 0:00:01[K     |███████████████████████████████▊| 112kB 8.3MB/s eta 0:00:01[K     |████████████████████████████████| 122kB 8.3MB/s 
Install

In [3]:
from google.colab import drive
import tarfile
import io
import torch
import torch.nn as nn
import string
import unicodedata
import time
import math
import random
import os
import numpy as np
import unicodedata
from torch.utils.data import Dataset, DataLoader
from skorch import helper, callbacks

In [4]:
drive.mount('/content/gdrive')



Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
!cp '/content/gdrive/My Drive/smaller_subset-cleaned.zip' 'smaller_subset-cleaned.zip'
!ls -al

total 4288
drwxr-xr-x 1 root root    4096 Aug  6 03:43 .
drwxr-xr-x 1 root root    4096 Aug  6 03:34 ..
drwxr-xr-x 1 root root    4096 Aug  3 16:17 .config
drwx------ 4 root root    4096 Aug  6 03:43 gdrive
drwxr-xr-x 1 root root    4096 Jul 30 16:30 sample_data
-rw------- 1 root root 4368427 Aug  6 03:43 smaller_subset-cleaned.zip


In [6]:
!unzip smaller_subset-cleaned.zip

Archive:  smaller_subset-cleaned.zip
   creating: smaller_subset/
   creating: smaller_subset/may-l/
   creating: smaller_subset/may-l/deleted_items/
  inflating: smaller_subset/may-l/deleted_items/6  
  inflating: smaller_subset/may-l/deleted_items/9  
  inflating: smaller_subset/may-l/deleted_items/2  
  inflating: smaller_subset/may-l/deleted_items/3  
  inflating: smaller_subset/may-l/deleted_items/8  
  inflating: smaller_subset/may-l/deleted_items/4  
  inflating: smaller_subset/may-l/deleted_items/1  
  inflating: smaller_subset/may-l/deleted_items/10  
  inflating: smaller_subset/may-l/deleted_items/11  
  inflating: smaller_subset/may-l/deleted_items/5  
   creating: smaller_subset/may-l/sent_items/
  inflating: smaller_subset/may-l/sent_items/33  
  inflating: smaller_subset/may-l/sent_items/6  
  inflating: smaller_subset/may-l/sent_items/24  
  inflating: smaller_subset/may-l/sent_items/36  
  inflating: smaller_subset/may-l/sent_items/48  
  inflating: smaller_subset/may-l

# Character Level LSTM Language Model

In [7]:
# define Model
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, num_of_layers, seq_len, dropout, padding_idx=0):
        super(Model, self).__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim
        self.num_of_layers = num_of_layers
        self.embedding = nn.Embedding(input_size, input_size, padding_idx=padding_idx)
        self.input_dropout = nn.Dropout(dropout, inplace=False)
        # Hidden LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_dim, num_of_layers, batch_first = True)
        # Fully connected output layer
        self.fc2 = nn.Linear(hidden_dim, output_size)
        self.norm = nn.LayerNorm(output_size)

    def forward(self, inp, hidden, seq_lengths):
        inp = self.embedding(inp)
        inp = self.input_dropout(inp)
        max_seq_length = max(seq_lengths)
        inp = nn.utils.rnn.pack_padded_sequence(inp, seq_lengths, enforce_sorted=False, batch_first=True) # pad sequences
        packed_output, new_hidden = self.lstm(inp, hidden)
        output, inp_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, padding_value=0, batch_first=True, total_length=self.seq_len) # unpack sequences back to tensors
        output = output.to(device)
        output_fc = self.fc2(output) # output with correct dimensions
        output_norm = self.norm(output_fc)
        return (output_norm, new_hidden)
    
    def initState(self, batch_size):
        return (torch.zeros(self.num_of_layers, batch_size, self.hidden_dim),
                torch.zeros(self.num_of_layers, batch_size, self.hidden_dim))

class LSTMNet(skorch.net.NeuralNet):
    def on_epoch_begin(self, *args, **kwargs):
        super().on_epoch_begin(*args, **kwargs)
      
        self.hidden = self.module_.initState(self.batch_size)

    def train_step(self, X, y):
        self.module_.train()
        self.optimizer_.zero_grad()
        inp = torch.stack(X[0]).transpose(0,1)
        inp_seq_len = X[1]
        target = torch.stack(y[0]).transpose(0,1)
        target_seq_len = y[1]
        inp = inp.to(device)
        target = nn.utils.rnn.pad_sequence(target, batch_first=True)
        target = target.to(device)
        

        hidden = tuple([each.data.to(device) for each in self.hidden])

        output, self.hidden = self.module_(inp, hidden, inp_seq_len)
        criterion_input = output.transpose(1,2)
        criterion_input.to(device)
        loss = self.get_loss(criterion_input, target)
        loss.to(device)
        loss.backward(retain_graph=True)
        self.optimizer_.step()

        return { 'loss': loss, 'y_pred': output }
    
    def validation_step(self, X, y):
        self.module_.eval()
        with torch.no_grad():
            inp = torch.stack(X[0]).transpose(0,1)
            inp_seq_len = X[1]
            target = torch.stack(y[0]).transpose(0,1)
            target_seq_len = y[1]
            inp = inp.to(device)
            target = target.to(device)

            hidden = self.module_.initState(self.batch_size)
            hidden = tuple([each.data.to(device) for each in hidden])
            output, _ = self.module_(inp, hidden, inp_seq_len)
            criterion_input = output.transpose(1, 2)
            target = nn.utils.rnn.pad_sequence(target, batch_first=True)
            loss = self.get_loss(criterion_input, target)
            
            return {'loss': loss, 'y_pred': output }
        
    def evaluation_step(self, X, **kwargs):
        self.module_.eval()
        with torch.no_grad():
            inp = torch.stack(X[0]).transpose(0,1)
            inp_seq_len = X[1]
            inp = inp.to(device)

            hidden = self.module_.initState(self.batch_size)
            hidden = tuple([each.data.to(device) for each in hidden])
            output, _ = self.module_(inp, hidden, inp_seq_len)
            return output

In [8]:
# num_of_letters = len(letters) + 3
# padding index is 0
separator = '{sep}'
start_of_file = '{start}' # index is num_of_letters - 2
end_of_file = '{end}' # index is num_of_letters - 1

# read data
def readFromFile(input_filename, output_filename, max_length):
    # total length of line will be max_length + 1 (last letter is for creation of target tensor)
    try:
        text = open(input_filename, encoding='ascii').read().strip()
    except:
        print("Could not open in ascii: " + input_filename)
        return 0, ''
    with open(output_filename, 'a') as output:
        line = start_of_file + text[0: max_length] + separator
        letters = ''.join(set(line))
        counter = 1
        output.write(line)
        for i in range(max_length - 1, len(text), max_length):
            line = text[i: i + max_length + 1]
            letters = ''.join(set(letters + line))
            if i >= len(text) - max_length:
                line += end_of_file
            line += separator
            counter += 1
            output.write(line)
        output.close()
    return counter, letters

def writeFilenameToFile(test_file, filename):
    try:
        text = open(filename, encoding='ascii').read().strip() # ensure it is readable in ascii
    except:
        return
    with open(test_file, 'a') as output:
        output.write(filename + "\n")
    output.close()

def readLinesFromData(directory, output_filename, max_length, test_file, for_every):
    files = readFilesFromData(directory)
    letters = ''
    num_of_lines = 0
    random.shuffle(files) # shuffle files
    for i in range(len(files)):
        filename = files[i]
        if i % for_every == 0:
            writeFilenameToFile(test_file, filename)
        else:
            num_from_file, letters_from_file = readFromFile(filename, output_filename, max_length)
            letters = ''.join(set(letters + letters_from_file))
            num_of_lines += num_from_file
    return num_of_lines, letters

def readFilesFromData(directory_path):
    all_files = []
    for root, directories, filenames in os.walk(directory_path):
        for directory in directories:
            all_files.extend(readFilesFromData(directory))
        for filename in filenames:
            all_files.append(os.path.join(root, filename))
    return all_files


class CharacterDataset(Dataset):
    def __init__(self, input_dir, max_length, for_every=0):
        self.input_dir = input_dir
        self.max_length = max_length

        self.processed_file = os.path.join(os.getcwd(), 'enron_processed.txt')
        self.test_file = os.path.join(os.getcwd(), 'enron_test_files.txt')
        
        if os.path.exists(self.processed_file):
            os.remove(self.processed_file)    
        if os.path.exists(self.test_file):
            os.remove(self.test_file)
        
        self.num_of_lines, self.letters = readLinesFromData(input_dir, self.processed_file, max_length, self.test_file, for_every)
        print("number of lines: " + str(self.num_of_lines))
        self.num_of_letters = len(self.letters) + 3 # plus three for padding character, start_of_file marker and end_of_file marker
        self.split_text = open(self.processed_file, encoding='ascii').read().split(separator)

    def getLetterIndices(self, line):
        letter_indices = []
        start_index = line.find(start_of_file)
        end_index = line.find(end_of_file)
        letter_index = 0
        while letter_index < len(line):
            if letter_index == start_index:
                letter_indices.append(self.num_of_letters - 2)
                letter_index += len(start_of_file)
            elif letter_index == end_index:
                letter_indices.append(self.num_of_letters - 1)
                letter_index += len(end_of_file)
            else:
                letter = line[letter_index]
                letter_indices.append(self.letters.find(letter) + 1) #shifted by 1 because of padding marker
                letter_index += 1
        original_length = len(letter_indices)
        return letter_indices, original_length

    # Input tensor contains indices of letters excluding the last letter in the line
    def createInputTensor(self, line, max_length):
        letter_indices, length = self.getLetterIndices(line)
        inp_indices = letter_indices[:-1]
        inp_length = length - 1
        while len(inp_indices) < max_length:
            inp_indices.append(0)
        return (inp_indices, inp_length)

    # Target tensor contains indices of letters in the input tensor 
    # excluding the first letter
    def createTargetTensor(self, line, max_length):
        letter_indices, length = self.getLetterIndices(line)
        target_indices = letter_indices[1:]
        target_length = length - 1
        while len(target_indices) < max_length:
            target_indices.append(0)
        return (target_indices, target_length)

    def __len__(self):
        return len(self.split_text)

    def __getitem__(self, index):
        line = self.split_text[index]
        input = self.createInputTensor(line, self.max_length)
        target = self.createTargetTensor(line, self.max_length)
        return input, target
    
    def getLetters(self):
        return self.letters, self.num_of_letters


def getMatchesAndTotal(output, target_seq):
    prob = nn.functional.softmax(torch.from_numpy(output), dim=2) 
    char_indices = torch.max(prob, dim=2)[1]
    target_seq = torch.tensor(target_seq)
    target_seq = target_seq[:char_indices.size(0)]
    matches = torch.eq(char_indices, target_seq).sum().item()
    total = torch.numel(char_indices)
    return matches, total

def ds_accuracy(net, ds, y=None):
    y_true = [y[0] for _, y in ds]
    y_pred = net.predict(ds)
    matches, total = getMatchesAndTotal(y_pred, y_true)
    return (matches / total) * 100

In [9]:
seq_len = 150
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

def initState(batch_size, num_of_layers, hidden_dim):
    return (torch.zeros(num_of_layers, batch_size, hidden_dim),
            torch.zeros(num_of_layers, batch_size, hidden_dim))
dataset = CharacterDataset('smaller_subset', seq_len, for_every=30)
letters, num_of_letters = dataset.getLetters()

print("letters: " + letters)

batch_size = 256
num_of_epochs = 50
learning_rate = 0.001
hidden_dim = int((2/3) * num_of_letters + num_of_letters)
num_of_layers = 2
dropout=0.2
init_state = initState(batch_size, num_of_layers, hidden_dim)
net = LSTMNet(
    module=Model,
    module__input_size=num_of_letters,
    module__output_size=num_of_letters,
    module__hidden_dim = hidden_dim,
    module__num_of_layers=num_of_layers,
    module__seq_len=seq_len,
    module__dropout=dropout,
    module__padding_idx=0,
    device=device,
    criterion=torch.nn.CrossEntropyLoss,
    criterion__ignore_index=0,
    optimizer=torch.optim.Adam,
    optimizer__lr=learning_rate,
    batch_size=batch_size,
    max_epochs=num_of_epochs,
    train_split=skorch.dataset.CVSplit(10),
    callbacks=[callbacks.EpochScoring(ds_accuracy, use_caching=False), 
        callbacks.EarlyStopping(patience=30, threshold_mode='abs'), 
        callbacks.Checkpoint(dirname='checkpoints'),
        callbacks.TrainEndCheckpoint(dirname='checkpoints'),
        callbacks.LoadInitState(callbacks.Checkpoint(dirname='checkpoints'))],
    iterator_train__drop_last=True,
    iterator_valid__drop_last=True
    )

GPU is available
number of lines: 67463
letters: g`+U'?"1fM{;)RvA5:Hj2
K30\G*Yw=D#9XQ$a4kFPT>, (n/	6rlzV-y&cZes!S@Bi]8_}Ed7xm~hObqJ|I<.CWp[%uotNL


Model Training is done in the next three cells.

Do not run the following three cells if just doing prediction.

In [10]:
def main():
    with open("letters.txt", 'w') as letters_file:
        letters_file.write(letters)

    print("number_of_letters: " + str(num_of_letters))
 
    print("Finished preprocessing....")

    # hyperparameters
    with torch.autograd.set_detect_anomaly(True):
        net.set_params(device=device)
        net.fit(dataset)
    
if __name__ == "__main__":
    main()

number_of_letters: 99
Finished preprocessing....
  epoch    ds_accuracy    train_loss    valid_loss    cp      dur
-------  -------------  ------------  ------------  ----  -------
      1        [36m60.3869[0m        [32m2.1114[0m        [35m1.5262[0m     +  26.6623


In [11]:
!zip -r checkpoints.zip checkpoints/
from google.colab import files
files.download("checkpoints.zip")
files.download("enron_processed.txt")
files.download("enron_test_files.txt")

  adding: checkpoints/ (stored 0%)
  adding: checkpoints/history.json (deflated 81%)
  adding: checkpoints/train_end_optimizer.pt (deflated 9%)
  adding: checkpoints/params.pt (deflated 8%)
  adding: checkpoints/train_end_history.json (deflated 82%)
  adding: checkpoints/train_end_params.pt (deflated 8%)
  adding: checkpoints/optimizer.pt (deflated 8%)


KeyboardInterrupt: ignored

In [None]:
files.download("letters.txt")

# Plaintext recovery


In [12]:
# Upload checkpoints
!cp '/content/gdrive/My Drive/checkpoints.zip' 'checkpoints.zip'
!cp '/content/gdrive/My Drive/enron_processed.txt' 'enron_processed.txt'
!cp '/content/gdrive/My Drive/enron_test_files.txt' 'enron_test_files.txt'
!cp '/content/gdrive/My Drive/letters.txt' 'letters.txt'
!ls -al
!unzip checkpoints.zip

total 23704
drwxr-xr-x 1 root root     4096 Aug  6 03:44 .
drwxr-xr-x 1 root root     4096 Aug  6 03:34 ..
drwxr-xr-x 2 root root     4096 Aug  6 03:44 checkpoints
-rw-r--r-- 1 root root  9523230 Aug  6 03:44 checkpoints.zip
drwxr-xr-x 1 root root     4096 Aug  3 16:17 .config
-rw-r--r-- 1 root root 10333672 Aug  6 03:44 enron_processed.txt
-rw-r--r-- 1 root root     4188 Aug  6 03:44 enron_test_files.txt
drwx------ 4 root root     4096 Aug  6 03:43 gdrive
-rw-r--r-- 1 root root       96 Aug  6 03:44 letters.txt
drwxr-xr-x 1 root root     4096 Jul 30 16:30 sample_data
drwxr-xr-x 5 root root     4096 Jul 30 13:00 smaller_subset
-rw------- 1 root root  4368427 Aug  6 03:43 smaller_subset-cleaned.zip
Archive:  checkpoints.zip
replace checkpoints/train_end_params.pt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: checkpoints/train_end_params.pt  
  inflating: checkpoints/optimizer.pt  
  inflating: checkpoints/train_end_history.json  
  inflating: checkpoints/params.pt   
  inflating

In [13]:
import torch
from torch.nn import Softmax
import os
import skorch
from skorch import callbacks
from torch.utils.data import Dataset, DataLoader
from skorch.callbacks import Checkpoint
import string
import numpy as np
from skorch.helper import SliceDataset
import math
import time
import cProfile

letters = open('letters.txt', 'r').read()
num_of_letters = len(letters) + 3
cp = Checkpoint(dirname='checkpoints', fn_prefix='train_end_')
net.initialize()
net.load_params(checkpoint=cp)
model1, model2 = net, net

def getLongerString(string1, string2):
    if len(string1) > len(string2):
        return string1
    else:
        return string2

def xor_string(string1, string2):
    xored = []
    longer_string = getLongerString(string1, string2)
    for i in range(min(len(string1), len(string2))):
        xored_value = chr(ord(string1[i]) ^ ord(string2[i]))
        xored.append(xored_value)
    for j in range(min(len(string1), len(string2)), len(longer_string)):
        xored.append(longer_string[j])
    return ''.join(xored).encode('ascii')

# Ciphers will be generated up to the minimum of the length of the shorter file and the variable length (if provided)
def createCipherTexts(test_files_path, length=None):
    ciphers = [] # (original_plaintext1, original_plaintext2, cipher)
    with open(test_files_path, 'r') as f:
        files = f.readlines()
        f.close()
        for i in range(0, len(files), 2):
            if i + 1 >= len(files):
                # take note of outstanding files
                with open("outstanding.txt", 'a+') as outstanding_file:
                    outstanding_file.write(files[i])
                break
            first_filename = files[i][:-1]
            second_filename = files[i + 1][:-1]
            first_plaintext = open(first_filename, encoding='ascii').read().strip()
            second_plaintext = open(second_filename, encoding='ascii').read().strip()
            length_of_shorter_string = min(len(first_plaintext), len(second_plaintext))
            if length is None:
                length = length_of_shorter_string
            first_plaintext = first_plaintext[:min(length, length_of_shorter_string)]
            second_plaintext = second_plaintext[:min(length, length_of_shorter_string)]
            cipher = xor_string(first_plaintext, second_plaintext)
            ciphers.append((first_plaintext, second_plaintext, cipher))
    return ciphers

ciphers = createCipherTexts("enron_test_files.txt", 1000)

class SampleDataset(Dataset):
    def __init__(self, input_characters, max_length, letters, num_of_letters):
        self.input_characters = input_characters
        self.max_length = max_length
        self.letters = letters
        self.num_of_letters = num_of_letters
        self.lines = self.getLines()

    def __len__(self):
        if len(self.lines) == 0:
            return 1
        else:
            return len(self.lines)

    def getLines(self):
        indices = self.getLetterIndices(self.input_characters)
        lines = [indices[i: i + self.max_length] for i in range(0, len(indices), self.max_length)]
        return lines

    def getLetterIndices(self, line):
        letter_indices = []
        start_index = line.find(start_of_file)
        end_index = line.find(end_of_file)
        letter_index = 0
        while letter_index < len(line):
            if letter_index == start_index:
                letter_indices.append(self.num_of_letters - 2)
                letter_index += len(start_of_file)
            elif letter_index == end_index:
                letter_indices.append(self.num_of_letters - 1)
                letter_index += len(end_of_file)
            else:
                letter = line[letter_index]
                letter_indices.append(self.letters.find(letter) + 1) #shifted by 1 because of padding marker
                letter_index += 1
        return letter_indices

    # Input tensor contains indices of letters excluding the last letter in the line
    def createInputTensor(self, idx, max_length):
        inp_indices = self.lines[idx]
        inp_length = len(inp_indices)
        while len(inp_indices) < max_length:
            inp_indices.append(0)
        return (inp_indices, inp_length)

    def __getitem__(self, idx):
        return self.createInputTensor(idx, self.max_length), self.createInputTensor(idx, self.max_length)
    
    def numOfLines(self):
        if len(self.lines) == 0:
            return 1
        else:
            return len(self.lines)

def convert_to_hex(character):
    return character.encode('ascii')

softmax = nn.Softmax(dim=2)
prev_batch_size = 1
net.set_params(batch_size=prev_batch_size)
# If length of input_characters increase
# beyond sequence length, batch size is changed to speed up prediction
def getProbabilities(input_characters, model):
    sample_dataset = SampleDataset(input_characters, seq_len, letters, num_of_letters)
    num_of_lines = sample_dataset.numOfLines()
    global prev_batch_size
    if num_of_lines != prev_batch_size:
        prev_batch_size = num_of_lines
        net.set_params(batch_size=prev_batch_size)
    output = net.predict(sample_dataset)
    output = softmax(torch.from_numpy(output))
    character_index = (len(input_characters) - 1) % 150
    if input_characters.find(start_of_file) >= 0:
        character_index -= (len(start_of_file) - 1) 
    if input_characters.find(end_of_file) >= 0:
        character_index -= (len(end_of_file) - 1)
    char_indices = torch.max(output, dim=2)[1][-1]
    probability_next_char = output[-1][character_index]
    return probability_next_char


cipher_to_pairs_dict = {}
is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")
printable_chars = bytes(string.printable, 'ascii')


# this function returns all possible pairs of characters that when xored,
# will return cipher_hex
def getPossiblePairs(cipher_hex):
    cipher_hex = cipher_hex.decode('ascii')
    if (cipher_hex in cipher_to_pairs_dict.keys()):
        return cipher_to_pairs_dict[cipher_hex]
    else:
        pairs = []
        visited_letters = set()
        for i in range(len(letters)):
            letter = letters[i]
            xor_byte = xor_string(letter, cipher_hex)
            if xor_byte not in printable_chars:
                continue
            xor = xor_byte.decode('ascii')
            if xor in visited_letters or xor == letter:
                if xor != letter:
                    pairs.append((xor, letter))
                pairs.append((letter, xor))
            visited_letters.add(letter)
        cipher_to_pairs_dict[cipher_hex] = pairs
    return pairs

def getProbabilitiesOfNextCharacters(current_string_pairs, model1, model2):
    probabilities = []
    for pair in current_string_pairs:
        first_string_probability = getProbabilities(pair[0], model1)
        second_string_probability = getProbabilities(pair[1], model2)
        probabilities.append((first_string_probability, second_string_probability))
    return probabilities

def getCost(path, current_cost, probability_pair, char1_index, char2_index):
    prob = probability_pair[0][char1_index] * probability_pair[1][char2_index]
    negative_log = -math.log(prob)
    total_cost = current_cost + negative_log
    return total_cost

def getLowestCostPath(current_paths, probabilities_of_next_letters, char1, char2):
    char1_index, char2_index = letters.find(char1) + 1, letters.find(char2) + 1 # shifted by one for padding marker
    first_path = current_paths[0]
    cost_of_first_path = first_path[1]
    probability_pair = probabilities_of_next_letters[0]
    # arbitrarily set to first path
    minimum = getCost(first_path, cost_of_first_path, probability_pair, char1_index, char2_index) 
    cheapest_path = current_paths[0]
    for index in range(1, len(current_paths)):
        path = current_paths[index]
        cost = path[1]
        probability_pair = probabilities_of_next_letters[index]
        total_cost = getCost(path, cost, probability_pair, char1_index, char2_index)
        if total_cost < minimum:
            cheapest_path = path
            minimum = total_cost
    return createNewPath(cheapest_path, char1, char2, minimum)
            
def createNewPath(current_path, char1, char2, new_cost):
    new_string1 = current_path[0][0] + char1
    new_string2 = current_path[0][1] + char2
    return ((new_string1, new_string2), new_cost)

def getPathsForNextStep(possible_pairs, current_paths, probabilities_of_next_letters, best_k):
    next_step_paths = []
    for pair in possible_pairs:
        char1, char2 = pair[0], pair[1]
        new_path = getLowestCostPath(current_paths, probabilities_of_next_letters, char1, char2)
        next_step_paths.append(new_path)
    next_step_paths = sorted(next_step_paths, key=lambda path: path[1])[:best_k]
    return next_step_paths

# assume ciphertext is in hexadecimal characters
def getPlainText(ciphertext, model1, model2, best_k):
    first_start = start_of_file
    second_start = start_of_file
    current_paths = [((first_start, second_start), 0)] # [((string1, string2), negative log cost]
    print("length of cipher text: " + str(len(ciphertext)))
    for hex_character_index in range(len(ciphertext)):
        start = time.time()
        print("currently at character: " + str(hex_character_index))
        character_hex = ciphertext[hex_character_index: hex_character_index + 1]
        current_string_pairs = list(map(lambda path: path[0], current_paths))
        probabilities_of_next_letters = getProbabilitiesOfNextCharacters(current_string_pairs, model1, model2)
        possible_pairs = getPossiblePairs(character_hex)
        current_paths = getPathsForNextStep(possible_pairs, current_paths, probabilities_of_next_letters, best_k)
        end = time.time()
        time_taken = end - start
        print("time taken: " + str(time_taken))
    return current_paths[0][0][0], current_paths[0][0][1]  

def calculateAccuracy(first_plaintext, second_plaintext, first_original, second_original, stream_switch_match=True):
    num_of_matches = 0
    for i in range(len(first_plaintext)):
        recovered_first = first_plaintext[i]
        recovered_second = second_plaintext[i]
        original_first = first_original[i]
        original_second = second_original[i]
        if recovered_first == original_first and recovered_second == original_second:
            num_of_matches += 2
         # consider switching streams as accurately recovered
        elif recovered_first == original_second and recovered_second == original_first and stream_switch_match:
            num_of_matches += 2
        elif recovered_first == original_first or recovered_first == original_second:
            num_of_matches += 1
        elif recovered_second == original_first or recovered_second == original_second:
            num_of_matches += 1
        else:
            continue
    return (num_of_matches) / (len(first_original) + len(second_original)) * 100

best_k = 50 # prune to best k at each time step
for cipher in ciphers:
    with open("results.txt", 'a+') as f:
        first_original = cipher[0]
        second_original = cipher[1]
        ciphertext = cipher[2]
        first_plaintext, second_plaintext = getPlainText(ciphertext, model1, model2, best_k)
        first_plaintext = first_plaintext.replace(start_of_file, '')
        second_plaintext = second_plaintext.replace(start_of_file, '')
        print(first_plaintext)
        print(second_plaintext)
        f.write("FIRST ORIGINAL:\n" + first_original + "\n")
        f.write("FIRST RECOVERED:\n" + first_plaintext + "\n")
        f.write("SECOND ORIGINAL:\n" + second_original + "\n")
        f.write("SECOND RECOVERED:\n" + second_plaintext + "\n")
        f.write("ACCURACY:\n" + str(calculateAccuracy(first_plaintext, second_plaintext, first_original, second_original)) + "\n")
    f.close()
      


Re-initializing module because the following parameters were re-set: dropout, hidden_dim, input_size, num_of_layers, output_size, padding_idx, seq_len.
Re-initializing optimizer because the following parameters were re-set: lr.
GPU is available
length of cipher text: 1000
currently at character: 0
time taken: 0.010801076889038086
currently at character: 1
time taken: 0.2899327278137207
currently at character: 2
time taken: 0.2963123321533203
currently at character: 3
time taken: 0.2982804775238037
currently at character: 4
time taken: 0.29346203804016113
currently at character: 5
time taken: 0.2854595184326172
currently at character: 6
time taken: 0.2967662811279297
currently at character: 7
time taken: 0.3113548755645752
currently at character: 8
time taken: 0.30075693130493164
currently at character: 9
time taken: 0.2972588539123535
currently at character: 10
time taken: 0.30184483528137207
currently at character: 11
time taken: 0.3139488697052002
currently at character: 12
time take

KeyboardInterrupt: ignored

In [None]:
files.download("/content/results.txt")
files.download("/content/outstanding.txt")