# Some examples of early encryption
First, we need some reference data.

In [1]:
import nltk
from nltk.corpus import gutenberg
from ngram import NGramModel
import numpy as np

print("Creating a spark session...", end="")
from pyspark.sql import SparkSession
spark = SparkSession\
        .builder\
        .appName("asdf")\
        .getOrCreate()
print("done")

nltk.download('gutenberg')

print("Available books:", gutenberg.fileids())

Creating a spark session...done
[nltk_data] Error loading gutenberg: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
Available books: ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [2]:
fileids = spark.sparkContext.parallelize(gutenberg.fileids()[:3])

def make_unigram_models(fileid):
    from nltk.corpus import gutenberg
    return NGramModel(gutenberg.words(fileid), 1)
models = fileids.map(make_unigram_models).collect()

print("Created %i models" % len(models))

for fid, m in zip(fileids.collect(), models):
    print(gutenberg.words(fid), "\t", m)

Created 3 models
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...] 	 1-gram model with 7811 unique keys
['[', 'Persuasion', 'by', 'Jane', 'Austen', '1818', ...] 	 1-gram model with 6132 unique keys
['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', ...] 	 1-gram model with 6833 unique keys


The set of all english words in the corpus will also come in handy.

In [3]:
def word_isalpha(word):
    for c in word:
        if not c.isalpha():
            return False
    return True

english_words = set()
for m in models:
    words = [k[0].lower() for k in list(m.keys())]
    english_words.update(set([w for w in words if word_isalpha(w)]))
    print("We have %i english words so far" % len(english_words))

We have 7079 english words so far
We have 8824 english words so far
We have 10294 english words so far


In [4]:
from random import choices
print(choices(list(english_words), k=100))


['portsmouth', 'example', 'swinging', 'painfully', 'unnoticed', 'uniting', 'estate', 'parliaments', 'odd', 'bragge', 'piece', 'breeding', 'clapped', 'relinquishing', 'smells', 'imprisonment', 'disclaiming', 'prayer', 'happiness', 'demonstrations', 'preceded', 'ride', 'ox', 'scoundrel', 'betimes', 'confusedly', 'disfavour', 'inconsideration', 'gathered', 'zealously', 'gun', 'noise', 'navy', 'prodigiously', 'further', 'ask', 'concerto', 'combine', 'attending', 'frames', 'misunderstanding', 'verses', 'repack', 'hon', 'mama', 'introduces', 'nurses', 'fatiguing', 'stanhill', 'cannot', 'dark', 'endanger', 'founded', 'retreated', 'softener', 'success', 'merged', 'sort', 'quietly', 'bush', 'glibly', 'naval', 'orchestra', 'relating', 'implies', 'bushel', 'dim', 'petted', 'significant', 'outweighs', 'berkeley', 'serviceable', 'assembled', 'extinguished', 'impropriety', 'soldier', 'sea', 'syllables', 'order', 'fanciful', 'memorial', 'yourself', 'contrasted', 'abatement', 'constrained', 'art', 're

Let's clean up some Austen books as trining data for character models

In [5]:
austen_text = [gutenberg.raw(fid) for fid in ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt']]
print(austen_text[0][:600])

def generate_alphabet(alpha, omega):
    """Set of the english alphabet"""
    return set([chr(i) for i in range(ord(alpha), ord(omega)+1)]) 

def clean_text(text, allowed):
    ret = text.lower()
    strip = set(ret).difference(allowed)
    if " " in allowed:
        for s in strip:
            if s in ['\n', '\t']:
                ret = ret.replace(s, " ")
            else:
                ret = ret.replace(s, "")
    else:
        for s in strip:
            ret = ret.replace(s, "")
    return ret

alphabet = generate_alphabet('a', 'z') # Set of the english alphabet
allowed_characters = set([' '])
allowed_characters.update(alphabet)
for i in range(len(austen_text)):
    austen_text[i] = clean_text(austen_text[i], allowed_characters)

print("---")
print(austen_text[0][:600])
print("%i characters in training data" % np.sum([len(t) for t in austen_text]))

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had b
---
emma by jane austen   volume i  chapter i   emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her  she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sister

With the data cleaned, we are ready to create some character ngram models. 

In [6]:
spark_texts = spark.sparkContext.parallelize(austen_text)
unigram_model = spark_texts.map(lambda data: NGramModel(list(data), 1)).reduce(lambda a, b: a.union(b))
print(unigram_model)
bigram_model = spark_texts.map(lambda data: NGramModel(list(data), 2)).reduce(lambda a, b: a.union(b))
print(bigram_model)
trigram_model = spark_texts.map(lambda data: NGramModel(list(data), 3)).reduce(lambda a, b: a.union(b))
print(trigram_model)
quadgram_model = spark_texts.map(lambda data: NGramModel(list(data), 4)).reduce(lambda a, b: a.union(b))
print(quadgram_model)

1-gram model with 27 unique keys
2-gram model with 560 unique keys
3-gram model with 5365 unique keys
4-gram model with 24737 unique keys


In [7]:
print("unigram:", "".join(unigram_model.predict_sequence(90)))
print()
print("bigram:", "".join(bigram_model.predict_sequence(90)))
print()
print("trigram:", "".join(trigram_model.predict_sequence(90)))
print()
print("quadgram:", "".join(trigram_model.predict_sequence(90)))

unigram: o towumciiyhooetl  ptnrlo  os oioht  esbmtse  aaacnnaep ir  nddehrafr rni d  r efk r ueeoa

bigram: hid cofino we sh m thad ave coomumy had then sthes it foung bol by t ad bme d me w   st ju

trigram:  inexprushe dou walf thend it worddess ort ingairced his of hicas spenes be to cal be day 

quadgram: emma dass bod but thearthe alk was of unat  ne el lind   the explad evoin prod elf ling se


We can now find the most common characters in this english text and their probabilities (from relative frequencies).

In [8]:
from ngram import ordered_ngrams
for unigram, prob in list(ordered_ngrams(unigram_model)):
    print("%s - %.5f" % (unigram, prob))

(' ',) - 0.19206
('e',) - 0.10242
('t',) - 0.06999
('a',) - 0.06374
('o',) - 0.06304
('n',) - 0.05771
('i',) - 0.05516
('h',) - 0.05035
('s',) - 0.05025
('r',) - 0.04912
('d',) - 0.03392
('l',) - 0.03262
('u',) - 0.02331
('m',) - 0.02306
('w',) - 0.01936
('c',) - 0.01901
('f',) - 0.01820
('y',) - 0.01793
('g',) - 0.01563
('b',) - 0.01272
('p',) - 0.01248
('v',) - 0.00908
('k',) - 0.00498
('x',) - 0.00141
('j',) - 0.00124
('q',) - 0.00103
('z',) - 0.00020


## Caesar substitution crypto
Maybe the most basic substitution crypto, based on charcter rotations.

In [9]:
def caesar_encryption(word, offset=3):
    enc = [chr(ord(char)+offset-len(alphabet)) if (ord(char)+offset)>ord('z') else chr(ord(char)+offset)
           for char in word.lower() if char in alphabet]
    return "".join(enc).upper()

import random
offset = random.randint(1, len(alphabet)-1)

print(caesar_encryption("Et tu brute", offset))

UJJKRHKJU


Now for finding the key to an unknown cryptogram (assuming it's a caesar crypto).

In [10]:
message_to_alesia = "YHUFLQJHWRULABRXUPRWKHUZDVDKDPVWHUDQGBRXUIDWKHUVPHOOVRIHOGHUEHUULHV"

In [11]:
caesar_model = NGramModel(list(message_to_alesia), 1)
for unigram, prob in list(ordered_ngrams(caesar_model)):
    print("%s - %.5f" % (unigram, prob))

('H',) - 0.14925
('U',) - 0.14925
('V',) - 0.07463
('D',) - 0.07463
('R',) - 0.07463
('W',) - 0.05970
('L',) - 0.04478
('K',) - 0.04478
('P',) - 0.04478
('O',) - 0.04478
('B',) - 0.02985
('I',) - 0.02985
('G',) - 0.02985
('X',) - 0.02985
('Q',) - 0.02985
('F',) - 0.01493
('E',) - 0.01493
('J',) - 0.01493
('A',) - 0.01493
('Z',) - 0.01493
('Y',) - 0.01493


In [12]:
def caesar_decode(text, offset):
    ret = str()
    for i in range(len(text)):
        c = ord(text[i]) - offset
        if c > ord('Z'):
            c -= len(alphabet)
        if c < ord('A'):
            c += len(alphabet)
        ret += chr(c)
    return ret

In [13]:
n_key = 3
message_from_caesar = caesar_decode(message_to_alesia, n_key)
print(message_from_caesar.capitalize())

Vercingetorixyourmotherwasahamsterandyourfathersmellsofelderberries


Something we can do to solve this, but the roman could not, was let a computer brute force this.

In [14]:
for n in range(1, len(alphabet)):
    print(n, caesar_decode(message_to_alesia, n).lower())

1 xgtekpigvqtkzaqwtoqvjgtycucjcouvgtcpfaqwthcvjgtuognnuqhgnfgtdgttkgu
2 wfsdjohfupsjyzpvsnpuifsxbtbibntufsboezpvsgbuifstnfmmtpgfmefscfssjft
3 vercingetorixyourmotherwasahamsterandyourfathersmellsofelderberries
4 udqbhmfdsnqhwxntqlnsgdqvzrzgzlrsdqzmcxntqezsgdqrldkkrnedkcdqadqqhdr
5 tcpaglecrmpgvwmspkmrfcpuyqyfykqrcpylbwmspdyrfcpqkcjjqmdcjbcpzcppgcq
6 sbozfkdbqlofuvlrojlqebotxpxexjpqboxkavlrocxqebopjbiiplcbiaboyboofbp
7 ranyejcapknetukqnikpdanswowdwiopanwjzukqnbwpdanoiahhokbahzanxanneao
8 qzmxdibzojmdstjpmhjoczmrvnvcvhnozmviytjpmavoczmnhzggnjazgyzmwzmmdzn
9 pylwchaynilcrsiolginbylqumubugmnyluhxsiolzunbylmgyffmizyfxylvyllcym
10 oxkvbgzxmhkbqrhnkfhmaxkptltatflmxktgwrhnkytmaxklfxeelhyxewxkuxkkbxl
11 nwjuafywlgjapqgmjeglzwjoskszseklwjsfvqgmjxslzwjkewddkgxwdvwjtwjjawk
12 mvitzexvkfizopflidfkyvinrjryrdjkvireupfliwrkyvijdvccjfwvcuvisviizvj
13 luhsydwujehynoekhcejxuhmqiqxqcijuhqdtoekhvqjxuhicubbievubtuhruhhyui
14 ktgrxcvtidgxmndjgbdiwtglphpwpbhitgpcsndjgupiwtghbtaahdutastgqtggxth
15 jsfqwbushcfw

## One-to-one substitution crypto

In [15]:
text = """Whats this then Romanes eunt domus People called Romanes they go the house It says Romans go home"""
#def encrypt_substitution(text):
text = text.lower()
unenc = list(set(text))
symbols = [c.upper() for c in alphabet]
symbols.extend(list("0123456789"))
s = list(symbols)
random.shuffle(s)
enc = s[:len(unenc)]

key = dict()
for a, b in zip(unenc, enc):
    key[a] = b

def substitute(text, encryption_key, replace="-"):
    ret = str()
    for c in text:
        if c in encryption_key.keys():
            ret += encryption_key[c]
        else:
            if replace is not None:
                ret += replace
            else:
                ret += c
    return ret

enc_text = substitute(text, key)
print(enc_text)

AHY2832HZ832HSK3LOFYKS83STK23IOFT835SO5NS37YNNSI3LOFYKS832HSW30O32HS3HOT8S3Z238YW83LOFYK830O3HOFS


In [16]:
key

{' ': '3',
 'a': 'Y',
 'c': '7',
 'd': 'I',
 'e': 'S',
 'g': '0',
 'h': 'H',
 'i': 'Z',
 'l': 'N',
 'm': 'F',
 'n': 'K',
 'o': 'O',
 'p': '5',
 'r': 'L',
 's': '8',
 't': '2',
 'u': 'T',
 'w': 'A',
 'y': 'W'}

In [17]:
reverse_key = dict()
for a, b in [(k, key[k]) for k in key.keys()]:
    reverse_key[b] = a
reverse_key

{'0': 'g',
 '2': 't',
 '3': ' ',
 '5': 'p',
 '7': 'c',
 '8': 's',
 'A': 'w',
 'F': 'm',
 'H': 'h',
 'I': 'd',
 'K': 'n',
 'L': 'r',
 'N': 'l',
 'O': 'o',
 'S': 'e',
 'T': 'u',
 'W': 'y',
 'Y': 'a',
 'Z': 'i'}

In [18]:
print(substitute(enc_text, reverse_key))

whats this then romanes eunt domus people called romanes they go the house it says romans go home


## Real problem

In [19]:
intercepted_in_jerusalem = """GPMCPIMEHMIRJXDCRMIRPMDH1PJCJXDH1MRP7IEHCMT4TIPLMRPJPMDQMIRJXDCRMIXMIRPML7EHM7D1EPH2PM2R7LWPJMRPJPM7H1MQE07IPTMGESPTMWP1JXXLMETMRPJPMR7KEHCMCJ7WWP1MRETMGESPMGPMEHSXJLMQE07IPMIR7IMTRPMETMEHMXDJM2DTIX14M7H1MSXJIRGEIRMETTDPMXDJM1PL7H1TMIRP4KPMW0P1MDTMGREIPMIRPMW7TI7J1TMIRP4KPMI75PHMPKPJ4IREHCMGPMR71MHXIMBDTIMSJXLMDTMSJXLMXDJMS7IRPJTM7H1MSJXLMXDJMS7IRPJTMS7IRPJT"""

### Unigram

In [20]:
from ngram import ordered_ngrams
for unigram, prob in list(ordered_ngrams(unigram_model))[:10]:
    print("%s - %.5f" % (unigram, prob))

(' ',) - 0.19206
('e',) - 0.10242
('t',) - 0.06999
('a',) - 0.06374
('o',) - 0.06304
('n',) - 0.05771
('i',) - 0.05516
('h',) - 0.05035
('s',) - 0.05025
('r',) - 0.04912


In [21]:
m1 = NGramModel(list(intercepted_in_jerusalem), 1)
for unigram, prob in list(ordered_ngrams(m1))[:10]:
    print("%s - %.5f" % (unigram, prob))

('M',) - 0.17500
('P',) - 0.11111
('I',) - 0.07500
('R',) - 0.07222
('J',) - 0.06667
('T',) - 0.05556
('7',) - 0.05556
('E',) - 0.05000
('X',) - 0.04722
('H',) - 0.04444


In [22]:
key = dict()
key['M'] = ' '
key['P'] = 'e'

print(substitute(intercepted_in_jerusalem, key, replace=None))

Ge CeI EH IRJXDCR IRe DH1eJCJXDH1 Re7IEHC T4TIeL ReJe DQ IRJXDCR IX IRe L7EH 7D1EeH2e 2R7LWeJ ReJe 7H1 QE07IeT GESeT We1JXXL ET ReJe R7KEHC CJ7WWe1 RET GESe Ge EHSXJL QE07Ie IR7I TRe ET EH XDJ 2DTIX14 7H1 SXJIRGEIR ETTDe XDJ 1eL7H1T IRe4Ke W0e1 DT GREIe IRe W7TI7J1T IRe4Ke I75eH eKeJ4IREHC Ge R71 HXI BDTI SJXL DT SJXL XDJ S7IReJT 7H1 SJXL XDJ S7IReJT S7IReJT


### Bigram

In [23]:
m2 = NGramModel(list(intercepted_in_jerusalem), 2)
for unigram, prob in list(ordered_ngrams(m2))[:10]:
    print("%s - %.5f" % (unigram, prob))

('P', 'M') - 0.04735
('I', 'R') - 0.03900
('R', 'P') - 0.03621
('T', 'M') - 0.03064
('M', 'I') - 0.02786
('P', 'J') - 0.02507
('E', 'H') - 0.01950
('M', 'R') - 0.01950
('J', 'X') - 0.01950
('X', 'D') - 0.01950


In [24]:
from ngram import ordered_ngrams
for bigram, prob in list(ordered_ngrams(bigram_model))[:10]:
    print("%s - %.5f" % (bigram, prob))

('e', ' ') - 0.03374
(' ', 't') - 0.02382
('h', 'e') - 0.02164
('d', ' ') - 0.02139
(' ', 'a') - 0.02104
('t', ' ') - 0.02060
('t', 'h') - 0.01946
('s', ' ') - 0.01832
('e', 'r') - 0.01641
(' ', 'h') - 0.01599


### Match words

In [25]:
result = list()
for w in english_words:
    if len(w) == 2 and w[1]=='e':
        result.append(w)
print(len(result), "words found")
print(result)

9 words found
['we', 've', 'ye', 're', 'be', 'se', 'me', 'de', 'he']


## MCMC

In [26]:
text = clean_text(gutenberg.raw('melville-moby_dick.txt'), allowed_characters)[:10000]
n_show = text.find("it will be seen")
print(text[n_show:n_show+110])

chars_in_text = set(text)
chars = "".join([e for e in chars_in_text])
symbols = list(chars.upper())
random.shuffle(symbols)
symbols = "".join(symbols)

def substitute(text, symbols, chars):
    ret = text
    for i in range(len(chars)):
        ret = ret.replace(symbols[i], chars[i])
    return ret

print(symbols)
print(chars)
print(substitute(symbols, chars, symbols))


it will be seen that this mere painstaking burrower and grubworm of a poor devil of a subsub appears to have g
YZPSQ WKIFXTMDECLHGNOARUJBV
lavdt ohpwkynzfxesrbugcqmji
YZPSQ WKIFXTMDECLHGNOARUJBV
