# Some examples of early encryption
First, we need some reference data.

In [1]:
import copy
import os

import nltk
import numpy as np
from nltk.corpus import gutenberg

from ngram import NGramModel

# Fetch the Gutenberg corpus into the local NLTK data directory; when a cached
# copy exists NLTK just reports the package as already up-to-date.
nltk.download('gutenberg')

print("Available books:", gutenberg.fileids())

Available books: ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/fredrik/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# Restrict the corpus to its first three file ids (the three Austen novels in
# the listing printed above).
fileids = gutenberg.fileids()[:3]

def make_unigram_models(fileid):
    """Build a word-level 1-gram model for one Gutenberg corpus file.

    ``fileid`` is a Gutenberg file id such as ``'austen-emma.txt'``; the word
    sequence of that file is handed to ``NGramModel`` with order 1.
    """
    # Kept as a function-local import, as in the original (presumably so the
    # name is available when the function runs in a pool worker).
    from nltk.corpus import gutenberg
    corpus_words = gutenberg.words(fileid)
    unigram_order = 1
    return NGramModel(corpus_words, unigram_order)

from multiprocessing import Pool
# Build one model per file in parallel; Pool() without arguments sizes the
# pool from the number of CPUs available.
with Pool() as process_pool:
    models = process_pool.map(make_unigram_models, fileids)

print("Created %i models" % len(models))

# Show the start of each word sequence next to the summary of its model.
for fid, m in zip(fileids, models):
    print(gutenberg.words(fid), "\t", m)

Created 3 models
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...] 	 1-gram model with 7811 unique keys
['[', 'Persuasion', 'by', 'Jane', 'Austen', '1818', ...] 	 1-gram model with 6132 unique keys
['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', ...] 	 1-gram model with 6833 unique keys


The set of all English words in the corpus will also come in handy.

This merges the words from the models. A word is defined as a key that, in lower case, consists only of alphabetic characters.

In [3]:
def all_isalpha(word):
    """Return True when every character of ``word`` is alphabetic.

    Unlike ``str.isalpha``, an empty ``word`` yields True because it contains
    no character that fails the check.
    """
    return all(character.isalpha() for character in word)

# Collect the lexicon: the first element of every model key, lower-cased, is
# kept when it consists of alphabetic characters only.
english_words = set()
for m in models:
    lowered_keys = (k[0].lower() for k in m.keys())
    english_words |= {w for w in lowered_keys if all_isalpha(w)}

print("We have %i english words in lexicon" % len(english_words))

We have 10294 english words in lexicon


Examples from the lexicon

In [4]:
from random import choices
# choices() samples with replacement, so a word can show up more than once.
print(choices(list(english_words), k=20))

['period', 'shocking', 'knoll', 'exit', 'religious', 'entitle', 'speedily', 'immortality', 'passionately', 'lectured', 'merchandise', 'began', 'unsuitableness', 'endeavoring', 'atoned', 'admiring', 'prettiness', 'encouraged', 'merits', 'select']


Let's clean up some Austen books as training data for the character models

In [5]:
# Raw text of the three Austen novels, one string per book; preview the start
# of the first one.
austen_text = [gutenberg.raw(fid) for fid in ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt']]
print(austen_text[0][:600])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had b


In [6]:
def generate_alphabet(alpha, omega):
    """Return the set of characters from ``alpha`` through ``omega`` inclusive.

    Both arguments are single characters and the range follows their code
    points, so ``generate_alphabet('a', 'z')`` gives the lower-case English
    alphabet. The set is empty when ``omega`` sorts before ``alpha``.
    """
    first_code = ord(alpha)
    last_code = ord(omega)
    return {chr(code) for code in range(first_code, last_code + 1)}

def clean_text(text, allowed):
    """Lower-case ``text`` and drop every character that is not in ``allowed``.

    When the space is among the allowed characters, newlines and tabs are
    turned into spaces instead of being removed, so words on neighbouring
    lines stay separated. Returns the cleaned string.
    """
    lowered = text.lower()
    unwanted = set(lowered).difference(allowed)
    keep_word_breaks = " " in allowed
    # In a translation table a value of None deletes the character.
    table = {
        ord(ch): " " if keep_word_breaks and ch in ('\n', '\t') else None
        for ch in unwanted
    }
    return lowered.translate(table)

# Characters kept by the cleaning step: the letters a-z plus the space.
alphabet = generate_alphabet('a', 'z')
allowed_characters = {' '} | alphabet
# Replace the contents of the existing list with the cleaned texts.
austen_text[:] = [clean_text(raw, allowed_characters) for raw in austen_text]

print("---")
print(austen_text[0][:600])
total_characters = np.sum([len(cleaned) for cleaned in austen_text])
print("%i characters in training data" % total_characters)

---
emma by jane austen   volume i  chapter i   emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her  she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period  her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an ex
1948591 characters in training data


With the data cleaned, we are ready to create some character ngram models. 

In [7]:
from sklearn.model_selection import ParameterGrid
# Every combination of a cleaned text and an n-gram order becomes one grid
# point, i.e. len(austen_text) * 4 points in total.
grid = {'data': austen_text, 'order': [1, 2, 3, 4]}

def make_model(grid_point):
    """Build a character n-gram model for one grid point.

    ``grid_point`` is a mapping holding the training text under ``'data'`` and
    the n-gram order under ``'order'``; the text is split into a list of
    single characters before it is handed to ``NGramModel``.
    """
    characters = [ch for ch in grid_point['data']]
    return NGramModel(characters, grid_point['order'])

# Fit all grid points in parallel. This rebinds `models`, replacing the
# word-level models created earlier in the notebook.
with Pool(processes=os.cpu_count()) as process_pool:
    models = process_pool.map(make_model, ParameterGrid(grid))

print("Created %i models" % len(models))

Created 12 models


In [8]:
# Slot i holds the merged model of order i; slot 0 stays unused because the
# orders in the grid run from 1 to 4.
model_list = [None]*5
for model in models:
    order = model.order_
    merged = model_list[order]
    model_list[order] = model if merged is None else merged.union(model)

unigram_model, bigram_model, trigram_model, quadgram_model = model_list[1:]
for char_model in (unigram_model, bigram_model, trigram_model, quadgram_model):
    print(char_model)

1-gram model with 27 unique keys
2-gram model with 560 unique keys
3-gram model with 5365 unique keys
4-gram model with 24737 unique keys


In [9]:
# Generate 90 characters from each model, lowest order first, with a blank
# line between consecutive samples.
labelled_models = [
    ("unigram:", unigram_model),
    ("bigram:", bigram_model),
    ("trigram:", trigram_model),
    ("quadgram:", quadgram_model),
]
for position, (label, char_model) in enumerate(labelled_models):
    if position > 0:
        print()
    print(label, "".join(char_model.predict_sequence(90)))

unigram: a   dwvaoisgemhvdehctgnralsih fcsmeiedhtuev h ehu e dh  au iroe desatf ngwid aoaiowehteb r

bigram: sy ki nd cobe witonghanoffuto o s habed sh winganderorrusand othess wngly he thed of score

trigram: all then of thad afte a ch king muse shosto his expeas but by knight ady in elf em no hein

quadgram: far own unner not eight from hers welliot purprishe green she rely more here excity  emma 


We can now find the most common characters in this English text and their probabilities (from relative frequencies).

In [10]:
from ngram import ordered_ngrams
# Print the first ten entries that ordered_ngrams yields for the character
# unigram model, each with its probability.
for unigram, prob in list(ordered_ngrams(unigram_model))[:10]:
    print("%s - %.5f" % (unigram, prob))

(' ',) - 0.19206
('e',) - 0.10242
('t',) - 0.06999
('a',) - 0.06374
('o',) - 0.06304
('n',) - 0.05771
('i',) - 0.05516
('h',) - 0.05035
('s',) - 0.05025
('r',) - 0.04912


## Caesar substitution crypto
Maybe the most basic substitution crypto, based on character rotations.

In [11]:
def caesar_encryption(word, offset=3):
    """Encrypt ``word`` with a Caesar shift and return it in upper case.

    The input is lower-cased, every character outside ``a``-``z`` (spaces,
    digits, punctuation) is dropped, and each remaining letter is rotated
    ``offset`` positions forward. The shift is taken modulo the alphabet
    size, so negative offsets and offsets of 26 or more wrap around instead of
    producing non-letter characters.
    """
    base = ord('a')
    size = ord('z') - base + 1
    shifted = [
        chr(base + (ord(char) - base + offset) % size)
        for char in word.lower()
        if 'a' <= char <= 'z'
    ]
    return "".join(shifted).upper()

import random
# randint is inclusive at both ends, so the shift lies between 1 and
# len(alphabet)-1 and is never the identity rotation.
offset = random.randint(1, len(alphabet)-1)

print(caesar_encryption("Et tu brute", offset))

XMMNUKNMX


Now for finding the key to an unknown cryptogram (assuming it's a Caesar crypto).

In [12]:
message_to_alesia = "YHUFLQJHWRULABRXUPRWKHUZDVDKDPVWHUDQGBRXUIDWKHUVPHOOVRIHOGHUEHUULHV"

In [13]:
# Character frequencies of the cryptogram itself, to compare against the
# reference frequencies computed from the Austen text.
caesar_model = NGramModel(list(message_to_alesia), 1)
for unigram, prob in list(ordered_ngrams(caesar_model)):
    print("%s - %.5f" % (unigram, prob))

('H',) - 0.14925
('U',) - 0.14925
('V',) - 0.07463
('D',) - 0.07463
('R',) - 0.07463
('W',) - 0.05970
('L',) - 0.04478
('K',) - 0.04478
('P',) - 0.04478
('O',) - 0.04478
('B',) - 0.02985
('I',) - 0.02985
('G',) - 0.02985
('X',) - 0.02985
('Q',) - 0.02985
('F',) - 0.01493
('E',) - 0.01493
('J',) - 0.01493
('A',) - 0.01493
('Z',) - 0.01493
('Y',) - 0.01493


In [14]:
def caesar_decode(text, offset):
    """Decrypt an upper-case Caesar cryptogram by shifting its letters back.

    Each letter ``A``-``Z`` in ``text`` is rotated ``offset`` positions
    backwards. The shift is taken modulo the alphabet size, so any integer
    offset (negative, or 26 and above) wraps correctly. Characters outside
    ``A``-``Z`` are copied through unchanged rather than being turned into
    unrelated symbols. Returns the decoded string, still in upper case.
    """
    base = ord('A')
    size = ord('Z') - base + 1
    decoded = []
    for char in text:
        if 'A' <= char <= 'Z':
            decoded.append(chr(base + (ord(char) - base - offset) % size))
        else:
            # Not part of the cipher alphabet: keep as is.
            decoded.append(char)
    return "".join(decoded)

In [15]:
# A shift of 3 maps the most frequent cipher letter 'H' back to 'E'.
n_key = 3
message_from_caesar = caesar_decode(message_to_alesia, n_key)
print(message_from_caesar.capitalize())

Vercingetorixyourmotherwasahamsterandyourfathersmellsofelderberries


Something we can do to solve this, which the Romans could not, is to let a computer brute-force it.

In [16]:
# Try every non-trivial shift and print each candidate plaintext next to its
# key so the readable one can be picked out by eye.
for n in range(1, len(alphabet)):
    print(n, caesar_decode(message_to_alesia, n).lower())

1 xgtekpigvqtkzaqwtoqvjgtycucjcouvgtcpfaqwthcvjgtuognnuqhgnfgtdgttkgu
2 wfsdjohfupsjyzpvsnpuifsxbtbibntufsboezpvsgbuifstnfmmtpgfmefscfssjft
3 vercingetorixyourmotherwasahamsterandyourfathersmellsofelderberries
4 udqbhmfdsnqhwxntqlnsgdqvzrzgzlrsdqzmcxntqezsgdqrldkkrnedkcdqadqqhdr
5 tcpaglecrmpgvwmspkmrfcpuyqyfykqrcpylbwmspdyrfcpqkcjjqmdcjbcpzcppgcq
6 sbozfkdbqlofuvlrojlqebotxpxexjpqboxkavlrocxqebopjbiiplcbiaboyboofbp
7 ranyejcapknetukqnikpdanswowdwiopanwjzukqnbwpdanoiahhokbahzanxanneao
8 qzmxdibzojmdstjpmhjoczmrvnvcvhnozmviytjpmavoczmnhzggnjazgyzmwzmmdzn
9 pylwchaynilcrsiolginbylqumubugmnyluhxsiolzunbylmgyffmizyfxylvyllcym
10 oxkvbgzxmhkbqrhnkfhmaxkptltatflmxktgwrhnkytmaxklfxeelhyxewxkuxkkbxl
11 nwjuafywlgjapqgmjeglzwjoskszseklwjsfvqgmjxslzwjkewddkgxwdvwjtwjjawk
12 mvitzexvkfizopflidfkyvinrjryrdjkvireupfliwrkyvijdvccjfwvcuvisviizvj
13 luhsydwujehynoekhcejxuhmqiqxqcijuhqdtoekhvqjxuhicubbievubtuhruhhyui
14 ktgrxcvtidgxmndjgbdiwtglphpwpbhitgpcsndjgupiwtghbtaahdutastgqtggxth
15 jsfqwbushcfw

## One-to-one substitution crypto

In [17]:
text = """Whats this then Romanes eunt domus People called Romanes they go the house It says Romans go home"""
text = text.lower()
# Distinct plaintext characters, in whatever order the set yields them.
unenc = list(set(text))
# Cipher symbols: the upper-cased alphabet followed by the ten digits.
symbols = [c.upper() for c in alphabet] + list("0123456789")
s = symbols[:]
random.shuffle(s)
# One shuffled symbol per distinct plaintext character.
enc = s[:len(unenc)]

key = dict(zip(unenc, enc))

def substitute(text, encryption_key, replace="-"):
    """Map every character of ``text`` through ``encryption_key``.

    Characters without an entry in the key are replaced by ``replace``; pass
    ``replace=None`` to keep such characters as they are. Returns the
    resulting string.
    """
    def translate(character):
        if character in encryption_key:
            return encryption_key[character]
        return character if replace is None else replace

    return "".join(translate(character) for character in text)

# Encrypt the lower-cased sentence with the randomly generated key.
enc_text = substitute(text, key)
print(enc_text)

FDR0NT0DMNT0DHPT6XURPHNTHVP0TJXUVNTOHXO7HTGR77HJT6XURPHNT0DH3T5XT0DHTDXVNHTM0TNR3NT6XURPNT5XTDXUH


In [18]:
key

{' ': 'T',
 'o': 'X',
 'g': '5',
 'w': 'F',
 'l': '7',
 'r': '6',
 'm': 'U',
 'c': 'G',
 'p': 'O',
 'h': 'D',
 'n': 'P',
 'i': 'M',
 'e': 'H',
 's': 'N',
 'a': 'R',
 'y': '3',
 'd': 'J',
 'u': 'V',
 't': '0'}

In [19]:
# Invert the encryption key so that each cipher symbol maps back to its
# plaintext character.
reverse_key = {symbol: plain for plain, symbol in key.items()}
reverse_key

{'T': ' ',
 'X': 'o',
 '5': 'g',
 'F': 'w',
 '7': 'l',
 '6': 'r',
 'U': 'm',
 'G': 'c',
 'O': 'p',
 'D': 'h',
 'P': 'n',
 'M': 'i',
 'H': 'e',
 'N': 's',
 'R': 'a',
 '3': 'y',
 'J': 'd',
 'V': 'u',
 '0': 't'}

In [20]:
# Decrypting with the inverted key should give back the lower-cased sentence.
print(substitute(enc_text, reverse_key))

whats this then romanes eunt domus people called romanes they go the house it says romans go home


## Real problem

In [21]:
intercepted_in_jerusalem = """GPMCPIMEHMIRJXDCRMIRPMDH1PJCJXDH1MRP7IEHCMT4TIPLMRPJPMDQMIRJXDCRMIXMIRPML7EHM7D1EPH2PM2R7LWPJMRPJPM7H1MQE07IPTMGESPTMWP1JXXLMETMRPJPMR7KEHCMCJ7WWP1MRETMGESPMGPMEHSXJLMQE07IPMIR7IMTRPMETMEHMXDJM2DTIX14M7H1MSXJIRGEIRMETTDPMXDJM1PL7H1TMIRP4KPMW0P1MDTMGREIPMIRPMW7TI7J1TMIRP4KPMI75PHMPKPJ4IREHCMGPMR71MHXIMBDTIMSJXLMDTMSJXLMXDJMS7IRPJTM7H1MSJXLMXDJMS7IRPJTMS7IRPJT"""

### Unigram

In [22]:
from ngram import ordered_ngrams
# Same reference table as shown earlier in the notebook, repeated here for
# comparison with the frequencies of the intercepted message.
for unigram, prob in list(ordered_ngrams(unigram_model))[:10]:
    print("%s - %.5f" % (unigram, prob))

(' ',) - 0.19206
('e',) - 0.10242
('t',) - 0.06999
('a',) - 0.06374
('o',) - 0.06304
('n',) - 0.05771
('i',) - 0.05516
('h',) - 0.05035
('s',) - 0.05025
('r',) - 0.04912


In [23]:
# Unigram model of the intercepted message; list the first ten symbols that
# ordered_ngrams yields together with their probabilities.
m1 = NGramModel(list(intercepted_in_jerusalem), 1)
for unigram, prob in list(ordered_ngrams(m1))[:10]:
    print("%s - %.5f" % (unigram, prob))

('M',) - 0.17500
('P',) - 0.11111
('I',) - 0.07500
('R',) - 0.07222
('J',) - 0.06667
('T',) - 0.05556
('7',) - 0.05556
('E',) - 0.05000
('X',) - 0.04722
('H',) - 0.04444


In [24]:
# First guesses from the frequency tables: the most common cipher symbol 'M'
# is taken to be the space and the runner-up 'P' to be 'e'.
key = {'M': ' ', 'P': 'e'}

print(substitute(intercepted_in_jerusalem, key, replace=None))

Ge CeI EH IRJXDCR IRe DH1eJCJXDH1 Re7IEHC T4TIeL ReJe DQ IRJXDCR IX IRe L7EH 7D1EeH2e 2R7LWeJ ReJe 7H1 QE07IeT GESeT We1JXXL ET ReJe R7KEHC CJ7WWe1 RET GESe Ge EHSXJL QE07Ie IR7I TRe ET EH XDJ 2DTIX14 7H1 SXJIRGEIR ETTDe XDJ 1eL7H1T IRe4Ke W0e1 DT GREIe IRe W7TI7J1T IRe4Ke I75eH eKeJ4IREHC Ge R71 HXI BDTI SJXL DT SJXL XDJ S7IReJT 7H1 SJXL XDJ S7IReJT S7IReJT


### Bigram

In [25]:
# Bigram model of the intercepted message; show its first ten character pairs.
m2 = NGramModel(list(intercepted_in_jerusalem), 2)
top_cipher_bigrams = list(ordered_ngrams(m2))[:10]
for bigram, prob in top_cipher_bigrams:
    print("%s - %.5f" % (bigram, prob))

('P', 'M') - 0.04735
('I', 'R') - 0.03900
('R', 'P') - 0.03621
('T', 'M') - 0.03064
('M', 'I') - 0.02786
('P', 'J') - 0.02507
('E', 'H') - 0.01950
('M', 'R') - 0.01950
('J', 'X') - 0.01950
('X', 'D') - 0.01950


In [26]:
from ngram import ordered_ngrams
# Reference bigram table from the Austen text, to be matched against the
# bigram frequencies of the intercepted message.
for bigram, prob in list(ordered_ngrams(bigram_model))[:10]:
    print("%s - %.5f" % (bigram, prob))

('e', ' ') - 0.03374
(' ', 't') - 0.02382
('h', 'e') - 0.02164
('d', ' ') - 0.02139
(' ', 'a') - 0.02104
('t', ' ') - 0.02060
('t', 'h') - 0.01946
('s', ' ') - 0.01832
('e', 'r') - 0.01641
(' ', 'h') - 0.01599


### Match words

In [27]:
# Candidates for two-letter cipher words such as "Ge" in the partially decoded
# text: lexicon words of length two that end in 'e'.
result = [w for w in english_words if len(w) == 2 and w[1] == 'e']
print(len(result), "words found")
print(result)

9 words found
['re', 'he', 'ye', 'me', 'be', 'we', 'de', 'se', 've']


## MCMC

* Choose the order of the n-gram model
* Choose a reference text representing the language
 * Create an n-gram model from the reference text
* For encrypted message
 * 

In [28]:
n_order = 2
def divergence():
    """Placeholder with no implementation yet; calling it returns None."""
    pass

# Reference model for the search: a copy of the 4-gram character model, so the
# original model object is left untouched. `copy` comes from the import cell.
reference_model = copy.deepcopy(quadgram_model)
n_order = reference_model.order_

# Strings are immutable, so the message can be bound directly without copying.
encrypted_message = intercepted_in_jerusalem

key = {'symbols': None, 'plain': None}
# Plaintext alphabet: the distinct characters of `text`, in set order.
chars_in_text = set(text)
chars = "".join(chars_in_text)
# Cipher alphabet: the same characters upper-cased and randomly permuted.
symbols = list(chars.upper())
random.shuffle(symbols)
symbols = "".join(symbols)

def substitute(text, symbols, chars):
    """Replace each character of ``text`` found in ``symbols`` by its partner in ``chars``.

    ``symbols[i]`` is mapped to ``chars[i]``. All replacements happen in a
    single pass over ``text``, so a character produced by one replacement is
    never replaced again when the two alphabets overlap (for example when both
    contain the space). Characters of ``text`` that are not in ``symbols`` are
    kept. If a symbol occurs more than once, its first pairing is used.

    Note: this definition shadows the earlier dictionary-based ``substitute``
    defined in this notebook.
    """
    mapping = {}
    for symbol, char in zip(symbols, chars):
        # setdefault keeps the first pairing for a repeated symbol.
        mapping.setdefault(symbol, char)
    return "".join(mapping.get(c, c) for c in text)

print(symbols)
print(chars)
# Note the argument order: the shuffled symbols are the input text here and
# the plaintext characters are the ones being searched for and replaced.
print(substitute(symbols, chars, symbols))


e
TRMIAG PUWSECHNDOLY
 ogwlrmcphniesaydut
TRMIAGTPUWSECHNDOLY


In [29]:
allowed_characters

{' ',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}