# Some examples of early encryption
First, we need some reference data.

In [1]:
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
from ngram import NGramModel
from multiprocessing import Pool
# One tokenised word sequence per book in the Gutenberg corpus.
data = [gutenberg.words(fileid) for fileid in gutenberg.fileids()]


def f(words):
    """Build a 1-gram (unigram) model from a sequence of words."""
    return NGramModel(words, 1)


# Only the first four books are modelled.
models = [f(book_words) for book_words in data[:4]]
print("Created %i models" % len(models))

# zip() stops at the shorter sequence, so only the modelled books are listed.
for book_words, model in zip(data, models):
    print(book_words, "\t", model)

austen_titles = ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt']
austen_raw_text = gutenberg.raw(austen_titles).lower()
print(austen_raw_text[:800])

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/fredrik/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
Created 4 models
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...] 	 1-gram model with 7811 unique keys
['[', 'Persuasion', 'by', 'Jane', 'Austen', '1818', ...] 	 1-gram model with 6132 unique keys
['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', ...] 	 1-gram model with 6833 unique keys
['[', 'The', 'King', 'James', 'Bible', ']', 'The', ...] 	 1-gram model with 13769 unique keys
[emma by jane austen 1816]

volume i

chapter i


emma woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

she was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very ea

The next cell preprocesses the text data and creates unigram and bigram character models. These models will be used for frequency analysis later.

In [2]:
def generate_alphabet(alpha, omega):
    """Return the set of characters from ``alpha`` through ``omega`` inclusive.

    Both arguments are single characters and the range is taken over their
    code points, so ``generate_alphabet('a', 'z')`` is the lower-case English
    alphabet. The set is empty when ``omega`` sorts before ``alpha``.
    """
    first, last = ord(alpha), ord(omega)
    return {chr(code_point) for code_point in range(first, last + 1)}

# The lower-case letters a-z; these (plus the space) are the characters kept.
alphabet = generate_alphabet('a', 'z')

# Every character occurring in the corpus that is neither a letter nor a space.
strip = set(austen_raw_text) - alphabet
strip.discard(' ')
# NOTE(review): the characters are deleted rather than replaced by a space, so
# words separated only by a newline or a hyphen end up glued together.
austen_raw_text = austen_raw_text.translate({ord(char): None for char in strip})

from ngram import character_tokenizer
unigram_model = NGramModel(character_tokenizer(austen_raw_text), 1)
bigram_model = NGramModel(character_tokenizer(austen_raw_text), 2)

print("Removing these characters:", strip)
print(unigram_model)
print(bigram_model)

Removing these characters: {"'", '&', '_', '?', '4', '>', '5', '(', ';', '*', '9', ',', '3', '\n', '2', '-', '7', ':', '[', '"', '1', '!', '0', '.', '8', '6', '`', ']', ')'}
1-gram model with 27 unique keys
2-gram model with 600 unique keys


The set of all English words in the corpus will also come in handy.

In [3]:
# Lower-cased tokens of at least two characters, taken from the first model only.
# (The name is misspelled, but a later cell reads it under this exact name.)
engligh_words = set()
for model in models[:1]:
    engligh_words |= {gram[0].lower() for gram in model.keys() if len(gram[0]) >= 2}

print("Found %i english words" % len(engligh_words))

Found 7312 english words


We can now find the most common characters in this English text and their probabilities (estimated from relative frequencies).

In [4]:
from ngram import ordered_ngrams
# Print every character of the reference corpus with its relative frequency,
# in the order produced by ordered_ngrams.
for gram, probability in ordered_ngrams(unigram_model):
    print("%s - %.5f" % (gram, probability))

(' ',) - 0.17508
('e',) - 0.10457
('t',) - 0.07146
('a',) - 0.06508
('o',) - 0.06436
('n',) - 0.05892
('i',) - 0.05631
('h',) - 0.05141
('s',) - 0.05130
('r',) - 0.05015
('d',) - 0.03463
('l',) - 0.03331
('u',) - 0.02380
('m',) - 0.02354
('w',) - 0.01977
('c',) - 0.01941
('f',) - 0.01858
('y',) - 0.01830
('g',) - 0.01596
('b',) - 0.01298
('p',) - 0.01274
('v',) - 0.00927
('k',) - 0.00509
('x',) - 0.00144
('j',) - 0.00126
('q',) - 0.00106
('z',) - 0.00020


## Caesar substitution crypto
Perhaps the most basic substitution cipher, based on character rotations.

In [5]:
#alphabet = [chr(i) for i in range(ord('a'), ord('z')+1)]
def caesar_encryption(word, offset=3):
    """Encrypt ``word`` with a Caesar (rotation) cipher.

    The input is lower-cased, every character outside a-z (spaces, digits,
    punctuation) is dropped, and each remaining letter is rotated ``offset``
    positions forward, wrapping around after 'z'.

    Args:
        word: Plain text to encrypt.
        offset: Number of positions to rotate. Any integer is accepted,
            including negative values and values larger than the alphabet,
            because the rotation is applied modulo the alphabet size.

    Returns:
        The cipher text as an upper-case string containing only letters.
    """
    first, last = ord('a'), ord('z')
    size = last - first + 1
    # Modular arithmetic keeps the result inside a-z for every offset; a single
    # "subtract the alphabet size once" correction only works for 0..size.
    rotated = [chr((ord(char) - first + offset) % size + first)
               for char in word.lower() if first <= ord(char) <= last]
    return "".join(rotated).upper()

import random
# Draw a random non-zero rotation (1 .. len(alphabet)-1, both inclusive).
# No seed is set, so the printed cipher text differs between runs.
offset = random.randint(1, len(alphabet)-1)

print(caesar_encryption("Et tu brute", offset))

KZZAHXAZK


Now let us find the key to an unknown cryptogram (assuming it is a Caesar cipher).

In [6]:
# Intercepted cipher text, assumed to be Caesar-encrypted.
message_to_alesia = "YHUFLQJHWRULABRXUPRWKHUZDVDKDPVWHUDQGBRXUIDWKHUVPHOOVRIHOGHUEHUULHV"
# Tentative rotation offset to try when decoding.
n_key = 10

In [7]:
# Character frequencies of the intercepted cryptogram.
caesar_model = NGramModel(character_tokenizer(message_to_alesia), 1)
for symbol, frequency in ordered_ngrams(caesar_model):
    print("%s - %.5f" % (symbol, frequency))

('H',) - 0.14925
('U',) - 0.14925
('V',) - 0.07463
('D',) - 0.07463
('R',) - 0.07463
('W',) - 0.05970
('L',) - 0.04478
('K',) - 0.04478
('P',) - 0.04478
('O',) - 0.04478
('B',) - 0.02985
('I',) - 0.02985
('G',) - 0.02985
('X',) - 0.02985
('Q',) - 0.02985
('F',) - 0.01493
('E',) - 0.01493
('J',) - 0.01493
('A',) - 0.01493
('Z',) - 0.01493
('Y',) - 0.01493


In [8]:
def caesar_decode(text, offset):
    """Decrypt Caesar cipher text by rotating letters ``offset`` steps back.

    Only the upper-case letters A-Z are rotated (wrapping around from 'A' to
    'Z'); every other character is copied to the result unchanged.

    Args:
        text: Cipher text, expected to consist of upper-case letters.
        offset: Rotation that was used for encryption. Any integer works
            because the shift is applied modulo the alphabet size.

    Returns:
        The decoded text, still in upper case.
    """
    first, last = ord('A'), ord('Z')
    size = last - first + 1
    decoded = []
    for char in text:
        code = ord(char)
        if first <= code <= last:
            # Modular arithmetic wraps correctly for any offset, not only for
            # offsets that leave the range by at most one alphabet length.
            code = (code - first - offset) % size + first
        decoded.append(chr(code))
    # Joining once avoids rebuilding the string on every iteration.
    return "".join(decoded)

# Decode with the tentative offset; capitalize() leaves only the first
# character in upper case.
message_from_caesar = caesar_decode(message_to_alesia, n_key)
print(message_from_caesar.capitalize())

Oxkvbgzxmhkbqrhnkfhmaxkptltatflmxktgwrhnkytmaxklfxeelhyxewxkuxkkbxl


Something we can do to solve this, but the Romans could not, is to let a computer brute-force it.

In [9]:
# Try every possible rotation and print the candidate plain text for each.
for candidate_offset in range(1, len(alphabet)):
    candidate_plaintext = caesar_decode(message_to_alesia, candidate_offset)
    print(candidate_offset, candidate_plaintext.lower())

1 xgtekpigvqtkzaqwtoqvjgtycucjcouvgtcpfaqwthcvjgtuognnuqhgnfgtdgttkgu
2 wfsdjohfupsjyzpvsnpuifsxbtbibntufsboezpvsgbuifstnfmmtpgfmefscfssjft
3 vercingetorixyourmotherwasahamsterandyourfathersmellsofelderberries
4 udqbhmfdsnqhwxntqlnsgdqvzrzgzlrsdqzmcxntqezsgdqrldkkrnedkcdqadqqhdr
5 tcpaglecrmpgvwmspkmrfcpuyqyfykqrcpylbwmspdyrfcpqkcjjqmdcjbcpzcppgcq
6 sbozfkdbqlofuvlrojlqebotxpxexjpqboxkavlrocxqebopjbiiplcbiaboyboofbp
7 ranyejcapknetukqnikpdanswowdwiopanwjzukqnbwpdanoiahhokbahzanxanneao
8 qzmxdibzojmdstjpmhjoczmrvnvcvhnozmviytjpmavoczmnhzggnjazgyzmwzmmdzn
9 pylwchaynilcrsiolginbylqumubugmnyluhxsiolzunbylmgyffmizyfxylvyllcym
10 oxkvbgzxmhkbqrhnkfhmaxkptltatflmxktgwrhnkytmaxklfxeelhyxewxkuxkkbxl
11 nwjuafywlgjapqgmjeglzwjoskszseklwjsfvqgmjxslzwjkewddkgxwdvwjtwjjawk
12 mvitzexvkfizopflidfkyvinrjryrdjkvireupfliwrkyvijdvccjfwvcuvisviizvj
13 luhsydwujehynoekhcejxuhmqiqxqcijuhqdtoekhvqjxuhicubbievubtuhruhhyui
14 ktgrxcvtidgxmndjgbdiwtglphpwpbhitgpcsndjgupiwtghbtaahdutastgqtggxth
15 jsfqwbushcfw

## One-to-one substitution crypto

In [10]:
intercepted_in_jerusalem = """GPMCPIMEHMIRJXDCRMIRPMDH1PJCJXDH1MRP7IEHCMT4TIPLMRPJPMDQMIRJXDCRMIXMIRPML7EHM7D1EPH2PM2R7LWPJMRPJPM7H1MQE07IPTMGESPTMWP1JXXLMETMRPJPMR7KEHCMCJ7WWP1MRETMGESPMGPMEHSXJLMQE07IPMIR7IMTRPMETMEHMXDJM2DTIX14M7H1MSXJIRGEIRMETTDPMXDJM1PL7H1TMIRP4KPMW0P1MDTMGREIPMIRPMW7TI7J1TMIRP4KPMI75PHMPKPJ4IREHCMGPMR71MHXIMBDTIMSJXLMDTMSJXLMXDJMS7IRPJTM7H1MSJXLMXDJMS7IRPJTMS7IRPJT"""

text = """Whats this then Romanes eunt domus People called Romanes they go the house It says Romans go home""".lower()

# Distinct plain-text characters that each need a cipher symbol.
unenc = list(set(text))

# Candidate cipher symbols: the upper-cased alphabet followed by the ten digits.
symbols = [letter.upper() for letter in alphabet] + list("0123456789")

# Shuffle a copy so `symbols` keeps its order, then take one symbol per character.
shuffled_symbols = symbols[:]
random.shuffle(shuffled_symbols)
enc = shuffled_symbols[:len(unenc)]

# Substitution table: plain-text character -> cipher symbol.
key = dict(zip(unenc, enc))

def substitute(text, encryption_key, replace="-"):
    """Translate ``text`` character by character through ``encryption_key``.

    Args:
        text: String to translate.
        encryption_key: Mapping from a source character to its replacement.
        replace: Placeholder emitted for characters missing from the mapping.
            Pass ``None`` to copy such characters through unchanged.

    Returns:
        The translated string.
    """
    def translate(char):
        if char in encryption_key:
            return encryption_key[char]
        return char if replace is None else replace

    return "".join(translate(char) for char in text)

# Encrypt the lower-cased demo sentence with the freshly generated key.
enc_text = substitute(text, key)
print(enc_text)

BH8LCXLH0CXLH9YXEWQ8Y9CX9UYLXJWQUCXZ9WZF9X38FF9JXEWQ8Y9CXLH97XGWXLH9XHWUC9X0LXC87CXEWQ8YCXGWXHWQ9


In [11]:
# Display the substitution table (plain-text character -> cipher symbol).
key

{'s': 'C',
 'm': 'Q',
 'p': 'Z',
 'u': 'U',
 'c': '3',
 't': 'L',
 'w': 'B',
 'h': 'H',
 'n': 'Y',
 ' ': 'X',
 'r': 'E',
 'l': 'F',
 'y': '7',
 'o': 'W',
 'e': '9',
 'i': '0',
 'g': 'G',
 'd': 'J',
 'a': '8'}

In [12]:
# Invert the substitution table: cipher symbol -> plain-text character.
reverse_key = {symbol: plain for plain, symbol in key.items()}
reverse_key

{'C': 's',
 'Q': 'm',
 'Z': 'p',
 'U': 'u',
 '3': 'c',
 'L': 't',
 'B': 'w',
 'H': 'h',
 'Y': 'n',
 'X': ' ',
 'E': 'r',
 'F': 'l',
 '7': 'y',
 'W': 'o',
 '9': 'e',
 '0': 'i',
 'G': 'g',
 'J': 'd',
 '8': 'a'}

In [13]:
# Round trip: translating the cipher text through the inverted key.
print(substitute(enc_text, reverse_key))

whats this then romanes eunt domus people called romanes they go the house it says romans go home


In [14]:
from ngram import ordered_ngrams
# The 15 first entries of the reference corpus' character frequency list.
top_reference_unigrams = list(ordered_ngrams(unigram_model))[:15]
for gram, probability in top_reference_unigrams:
    print("%s - %.5f" % (gram, probability))

(' ',) - 0.17508
('e',) - 0.10457
('t',) - 0.07146
('a',) - 0.06508
('o',) - 0.06436
('n',) - 0.05892
('i',) - 0.05631
('h',) - 0.05141
('s',) - 0.05130
('r',) - 0.05015
('d',) - 0.03463
('l',) - 0.03331
('u',) - 0.02380
('m',) - 0.02354
('w',) - 0.01977


In [15]:
# Symbol frequencies of the intercepted message, for comparison with the
# reference corpus.
m1 = NGramModel(character_tokenizer(intercepted_in_jerusalem), 1)
top_cipher_unigrams = list(ordered_ngrams(m1))[:15]
for symbol, frequency in top_cipher_unigrams:
    print("%s - %.5f" % (symbol, frequency))

('M',) - 0.17500
('P',) - 0.11111
('I',) - 0.07500
('R',) - 0.07222
('J',) - 0.06667
('T',) - 0.05556
('7',) - 0.05556
('E',) - 0.05000
('X',) - 0.04722
('H',) - 0.04444
('D',) - 0.04167
('1',) - 0.03889
('S',) - 0.02778
('L',) - 0.02500
('C',) - 0.02222


In [16]:
# NOTE(review): `key` is still the table generated for the demo sentence, whose
# keys are lower-case plain-text characters (and the space). None of the
# upper-case/digit symbols of the intercepted message match, so with
# replace=None the message is printed unchanged.
# The commented-out lines below sketch a frequency-rank mapping (most common
# cipher symbols -> most common reference characters) that is not enabled.
#key = {'': ''}

#m1_sorted_keys = [k[0][0] for k in ordered_ngrams(m1)]
#unigram_sorted_keys = [k[0][0] for k in ordered_ngrams(unigram_model)]
#for enc, ref in list(zip(m1_sorted_keys[:5], unigram_sorted_keys[:5])):
#    print(enc, ref)
#    key[enc] = ref
print(substitute(intercepted_in_jerusalem, key, replace=None))

GPMCPIMEHMIRJXDCRMIRPMDH1PJCJXDH1MRP7IEHCMT4TIPLMRPJPMDQMIRJXDCRMIXMIRPML7EHM7D1EPH2PM2R7LWPJMRPJPM7H1MQE07IPTMGESPTMWP1JXXLMETMRPJPMR7KEHCMCJ7WWP1MRETMGESPMGPMEHSXJLMQE07IPMIR7IMTRPMETMEHMXDJM2DTIX14M7H1MSXJIRGEIRMETTDPMXDJM1PL7H1TMIRP4KPMW0P1MDTMGREIPMIRPMW7TI7J1TMIRP4KPMI75PHMPKPJ4IREHCMGPMR71MHXIMBDTIMSJXLMDTMSJXLMXDJMS7IRPJTM7H1MSJXLMXDJMS7IRPJTMS7IRPJT


In [17]:
# Symbol-pair (bigram) frequencies of the intercepted message.
m2 = NGramModel(character_tokenizer(intercepted_in_jerusalem), 2)
top_cipher_bigrams = list(ordered_ngrams(m2))[:10]
for symbol_pair, frequency in top_cipher_bigrams:
    print("%s - %.5f" % (symbol_pair, frequency))

('P', 'M') - 0.04735
('I', 'R') - 0.03900
('R', 'P') - 0.03621
('T', 'M') - 0.03064
('M', 'I') - 0.02786
('P', 'J') - 0.02507
('E', 'H') - 0.01950
('M', 'R') - 0.01950
('J', 'X') - 0.01950
('X', 'D') - 0.01950


In [18]:
from ngram import ordered_ngrams
# The 10 first entries of the reference corpus' character-pair frequency list.
top_reference_bigrams = list(ordered_ngrams(bigram_model))[:10]
for char_pair, probability in top_reference_bigrams:
    print("%s - %.5f" % (char_pair, probability))

('e', ' ') - 0.03101
('h', 'e') - 0.02211
(' ', 't') - 0.02183
('t', 'h') - 0.02001
('d', ' ') - 0.01981
('t', ' ') - 0.01905
(' ', 'a') - 0.01876
('e', 'r') - 0.01679
('s', ' ') - 0.01675
('i', 'n') - 0.01549


In [19]:
# Three-letter candidate words that start with 'a'.
result = [word for word in engligh_words
          if len(word) == 3 and word.startswith('a')]
print(len(result), "words found")
print(result)

17 words found
['any', 'age', 'ate', 'ask', 'are', 'arm', 'air', 'and', 'all', 'add', 'apt', 'ago', 'aid', 'awe', 'act', 'aye', 'art']
