In [98]:
import os
from typing import List, Dict
from collections import Counter

# File numbers of the ciphertexts that the character-frequency cell further
# down iterates over (their contents are tallied as letters/spaces).
letters = [1,3,4,6,10,12,13,14]
# File numbers of the ciphertexts that the digit n-gram cell further down
# iterates over.
numbers = [2,5,8,11,17,18,19,20]

# src: http://pi.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html

# Reference frequency of each lowercase letter and of the space character.
# The values look like percentages (presumably summing to ~100) -- confirm
# against the source page above.
letter_freq_ref = {' ': 18.2884627, 'e': 10.2666504, 't': 7.5169982, 'a': 6.5321669, 'o': 6.1595773, 'n': 5.7120111, 'i': 5.6684433, 's': 5.3170053, 'r': 4.9879086, 'h': 4.978563, 'l': 3.317548, 'd': 3.2829231, 'u': 2.2757954, 'c': 2.233676, 'm': 2.0265678, 'f': 1.9830672, 'w': 1.7038938, 'g': 1.6249044, 'p': 1.5043243, 'y': 1.4276666, 'b': 1.2588807, 'v': 0.7961164, 'k': 0.5609627, 'x': 0.1409202, 'j': 0.0975218, 'q': 0.0836755, 'z': 0.0512846}
# Reference characters listed from most to least frequent.
# NOTE(review): this ordering does not exactly follow the values in
# letter_freq_ref (e.g. 'i' precedes 'n' and 'd' precedes 'l' here, while the
# dict ranks them the other way round) -- confirm which ranking is intended.
letter_freq_ref_list = [' ', 'e','t', 'a','o','i','n','s','r','h','d','l','u','c','m','f','y','w','g','p','b','v','k','x','q','j','z']


In [8]:
# return the ciphertext (as UTF-8 encoded bytes) given the file number
def ctxt(file_no:int) -> bytes:
    """Read ciphertext number ``file_no`` and return it as UTF-8 bytes.

    The file is looked up at ``ctxts/NN.txt`` (two-digit, zero-padded number)
    relative to the current working directory.

    Args:
        file_no: number of the ciphertext file to load.

    Returns:
        The full file contents, encoded as UTF-8.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    path = 'ctxts/%02d.txt' % file_no
    # The context manager guarantees the handle is closed even if read() fails;
    # the previous version leaked one open file per call.
    with open(path) as ctxtfile:
        ctxt_str = ctxtfile.read()
    return ctxt_str.encode('utf-8')

## Group 1: Text - Shift
### CTXT: 1

While screening the ciphertexts for potential shift ciphers using an exhaustive key search, we were able to decode ciphertext 1 by shifting each letter by 12.

In [10]:
# perform shift on a byte, ignore non letters
def shift_letter(l:int, shift:int) -> int:
    """Shift one lowercase ASCII letter by ``shift`` positions, wrapping a-z.

    Args:
        l: character code of the input symbol.
        shift: number of alphabet positions to move; may be negative or
            larger than 26 in magnitude.

    Returns:
        The character code of the shifted letter, or ``l`` unchanged when it
        is not in the range 'a'..'z' (97..122).
    """
    if l<97 or l>122: # a=97, z=122
        return l
    # Modular arithmetic wraps correctly for any shift size; a single +/-26
    # correction left the a-z range for some shifts beyond 26.
    return 97 + (l - 97 + shift) % 26

# decrypt shift cipher
def shift(ctxt:bytearray, shift:int, limit:int = -1):
    """Print the ciphertext with every lowercase letter shifted by ``shift``.

    Args:
        ctxt: ciphertext as a bytes-like object (iterated as integer codes).
        shift: number of alphabet positions applied to each letter.
        limit: how many leading bytes to decode; a negative value (the
            default) decodes everything. Values larger than the ciphertext
            are clamped instead of raising IndexError.

    Returns:
        None. The decoded text is printed, followed by a newline.
    """
    if limit < 0:
        limit = len(ctxt)
    # Slicing clamps an oversized limit to the available data.
    print(''.join(chr(shift_letter(b, shift)) for b in ctxt[:limit]))


# Decode ciphertext 1 with the key found by exhaustive search.
shift(ctxt(1), 12)

hey alex, you've got to stop reusing that one time pad. the students are onto us. let's switch to a stronger cipher. david



## Group 7 Text - Substitution
### CTXT: 10, 12
First, we observe that the punctuation appears to be unmodified. Because `t` is the most common character in the ciphertext and often appears right after commas, we decode `t` as a space. The resulting text has reasonable word lengths and space placement. Since the letter frequencies of ciphertexts 10 and 12 are very similar, we group these two ciphertexts together.

We then matched the second most frequent character in these ciphertexts, `s`, with the most frequent letter in our reference text, `e`. While the result was not immediately readable, we observed that `mvs` is the most common trigram in the ciphertexts and that `mvsm` also appears. These observations led us to guess that `m` decodes to `s`, since the latter sequence is plausibly the plural form of the former. Together, we assumed that `mvs` and `mvsm` decode to `s*e` and `s*es`. This leaves two candidates for `v`: `a` and `h`. We chose `h` because its reference frequency is closer to the observed frequency of `v`.

After that we simply look for short phrases with most letters guessed, and test out potential substitutions one by one. However, we note that the word `scum` is substituted to `skum` in order to make sense of 

In [184]:
# Load ciphertexts 10 and 12 and turn the bytes returned by ctxt() back into
# text (bytes.decode() defaults to UTF-8).
s10 = ctxt(10).decode()
s12 = ctxt(12).decode()

def tally_frequency(s):
    """Return the percentage share of every letter/space character in ``s``.

    Only alphabetic characters and the space character are counted; the
    percentages are relative to the number of counted characters, so they add
    up to (approximately) 100.
    """
    counts = Counter(ch for ch in s if ch.isalpha() or ch == ' ')
    total = sum(counts.values())
    return {ch: n / total * 100 for ch, n in counts.items()}

def apply_transformation(s, d, colored=False):
    """Substitute every character of ``s`` that has an entry in ``d``.

    Characters without an entry are copied through unchanged. With
    ``colored=True`` each substituted character is wrapped in the ANSI
    escape codes for red text so replacements stand out when printed.
    """
    def render(ch):
        sub = d.get(ch, None)
        if sub is None:
            return ch
        if colored:
            return "\x1b[31m" + sub + "\x1b[0m"
        return sub

    return "".join(map(render, s))

def rank_frequency(d):
    """Return the keys of ``d`` ordered from highest to lowest value.

    Keys with equal values keep their relative order from ``d``.
    """
    return sorted(d, key=d.get, reverse=True)

# Disabled exploration code: it printed, side by side, the ranked character
# frequencies of ciphertexts 10 and 12 next to the reference ranking.
# freq_d_10 = tally_frequency(s10)
# freq_d_12 = tally_frequency(s12)
# freq_l_10 = rank_frequency(freq_d_10)
# freq_l_12 = rank_frequency(freq_d_12)

# for i, j, k in zip(freq_l_10,freq_l_12,letter_freq_ref_list):
#     print(i, round(freq_d_10[i],4),"\t",
#           j, round(freq_d_12[j],4),"\t",
#           k, round(letter_freq_ref[k],4))


# Hand-built substitution table: ciphertext character -> plaintext character.
# NOTE(review): the table is not one-to-one -- both 'v' and 'j' map to 'h',
# and no entry produces 'x' or 'z'. The saved output below shows 'h' where
# 'k' is expected (e.g. "heeping", "loohing"), so 'j' may be meant to map to
# 'k' -- confirm against the ciphertext.
d = {
    't':' ',
    's':'e',
    ' ':'c',
    'l':'a',
    'u':'t', 
    'r':'o',
    'm':'s',
    'v':'h',
    'o':'p',
    'd':'m',
    'x':'y',
    'p':'b',
    'a':'u',
    'w':'w',
    'f':'v',
    'q':'r',
    'e':'n',
    'y':'i',
    'i':'g',
    'c':'d',
    'h':'k',
    'z':'f',
    'n':'l',
    'j':'h',
    'g':'j',
    'b':'q'
}

# Decode both ciphertexts with the same table and show the results.
s10_1 = apply_transformation(s10, d)
s12_1 = apply_transformation(s12, d)
print(s10_1)
print(s12_1)


my pet bat has the coronavirus, but shes asymptomatic.
however, bats arent in favour right now so shes heeping her little head down.
hidden beneath her folded wings, she tuchs her furrowed snout  patient hero in a world frothing to place blame.
she tells everyone that she cant infect humans, but no ones listening to facts.
she posts articles daily on social media. headlines such as:
bats not to blame: intermediate host required for human infection from sarscov2 virus.
she writes were just a natural reservoir of the disease, the pangolins are the ones being irresponsible! bats are victims too! with a picture of her wearing a face mash, loohing mournful.
the pathos is wellintended but not well received.
dirty bat!
goddamn bat shum!
angry comments flow beneath all her posts.
she hangs around our bedroom, sullen  our housemates isolating her.
we watch true blood reruns and eat dry toast.

we had decided it would be best for me to stay home with lucas while my husband nate went to worh; his

In [19]:
def get_bigrams(l:int) -> Dict[str, float]:
    """Load the first ``l`` entries of ``bigrams.txt`` as relative frequencies.

    Each line is expected to hold a bigram and an integer count separated by
    whitespace. Bigrams are lower-cased and every count is divided by the sum
    of all counts read, so the returned values add up to 1.

    Args:
        l: maximum number of lines to read from the start of the file.

    Returns:
        Mapping from bigram to its share of the counts read. Empty when no
        usable line was read.

    Raises:
        OSError: if ``bigrams.txt`` cannot be opened.
        ValueError: if a count is not an integer.
    """
    counts = {}
    total = 0
    with open('bigrams.txt', 'r') as f:
        # zip() stops at whichever runs out first, so asking for more lines
        # than the file holds no longer crashes on an empty readline().
        for _, line in zip(range(l), f):
            toks = line.split()
            if len(toks) < 2:
                # Blank or incomplete line: nothing to tally.
                continue
            count = int(toks[1])
            total += count
            counts[toks[0].lower()] = count
    return {bigram: count / total for bigram, count in counts.items()}
# Display the normalised frequencies of the first 20 reference bigrams.
get_bigrams(20)

{'th': 0.09875606809612637,
 'he': 0.08499024745598541,
 'in': 0.07400426722198333,
 'er': 0.06510793721416461,
 'an': 0.05889614793878943,
 're': 0.051424667195244765,
 'es': 0.048172285488823026,
 'on': 0.04804128272842526,
 'st': 0.045596093976635374,
 'nt': 0.042795999762623174,
 'en': 0.041352777310769265,
 'at': 0.04074781181176989,
 'ed': 0.039374820567679686,
 'nd': 0.0389918982523242,
 'to': 0.03892511597387786,
 'or': 0.038595925546323614,
 'ea': 0.036574021543099695,
 'ti': 0.03620165872499343,
 'ar': 0.03574973250075516,
 'te': 0.035701240689606444}

In [146]:


def tally_frequency(s:str) -> Dict[str, float] :
    """Count letters and spaces in ``s`` as a fraction of ``len(s)``.

    A backslash and the character right after it are both skipped. Counts are
    divided by the full length of ``s`` (including characters that are not
    tallied), so the values need not add up to 1.
    """
    total = len(s)
    counts = {}
    skip_next = False
    for ch in s:
        if skip_next:
            # This character belongs to a backslash escape; ignore it.
            skip_next = False
            continue
        if ch == '\\':
            skip_next = True
        elif ch.isalpha() or ch == ' ':
            counts[ch] = counts.get(ch, 0) + 1
    return {ch: n / total for ch, n in counts.items()}


def create_transformation(d: Dict[str, float], ref_list: List[str] = None) -> Dict[str, str]:
    """Pair ciphertext characters with reference characters by frequency rank.

    The characters in ``d`` are sorted from most to least frequent and zipped
    with ``ref_list``; the most frequent ciphertext character is mapped to the
    first reference character, and so on. Pairing stops at the shorter of the
    two sequences.

    Args:
        d: mapping from ciphertext character to its frequency.
        ref_list: reference characters ordered from most to least frequent.
            Defaults to the notebook's ``letter_freq_ref_list``; the name
            previously read here was not defined anywhere in the notebook.

    Returns:
        Mapping from ciphertext character to guessed plaintext character.
    """
    if ref_list is None:
        ref_list = letter_freq_ref_list
    sorted_letters = sorted(d, key=d.get, reverse=True)
    return dict(zip(sorted_letters, ref_list))

def apply_transformation(s:str, d: Dict[str, str]) -> str :
    """Substitute characters of ``s`` using ``d``, expanding backslash escapes.

    A backslash together with the character that follows it is emitted as a
    single newline. Every other character is replaced by its entry in ``d``
    or copied unchanged when it has none.
    """
    pieces = []
    chars = iter(s)
    for ch in chars:
        if ch == '\\':
            pieces.append('\n')
            # Consume the escaped character as well (if there is one).
            next(chars, None)
        else:
            pieces.append(d.get(ch, ch))
    return "".join(pieces)

def substitution(s: str) -> str :
    """Decode ``s`` with a purely frequency-ranked substitution.

    The character frequencies of ``s`` are tallied, turned into a
    substitution table by rank, and that table is applied to ``s``.
    """
    return apply_transformation(s, create_transformation(tally_frequency(s)))


# Explore ciphertext 14: show the raw text, its characters ranked by
# frequency, and the reference ranking to compare against.
i = 14
s = ctxt(i).decode('utf-8')
print(s)
freq = tally_frequency(s)
print([p[0] for p in sorted(freq.items(), key=lambda x: x[1], reverse=True)])
# Use the reference list defined at the top of the notebook; the name printed
# here before was not defined anywhere in the notebook and only worked through
# leftover kernel state.
print(letter_freq_ref_list)
print()
# Tentative partial substitution table (ciphertext character -> guess).
d = {
    'q':'s',
    'j':'d',
    'n':'a',
    'f':'f'
}
print()
print(apply_transformation(s, d))

k aqx qqkrankp vlu tqqgv tfv bqquyerpk y bxwvit yl bjehj trvl otr pwci. n tju xxnktkrw np cjihj xwa aqd qqcx otr lcr swbjvi q kfug wkhe cjej, yejv jyqb lqybi ejxi yyp xyr xfpq pycgba. hsh jujotbj vxw ghjxcg e jjuc hmbj, fwumtj qqcx jjuc hmbj vxw lqab cji xfpq vlqy zxoii trc kj otr ywx jmxc uecj crni jmoxwkx f ejul vzklvmes. f mqr'j yerpo jmxc'u texprdpu (n jncr cfvkg cez zjp he nq ka gxfklg) fo hxw ask uoxxi jmxc uysm x okpu juruxi to mqiis'q? ru xxjon cro bxh ask hxw oeaj lwg rey yh elqszn (k hes'q cjmdp px). k aezim nmaj qx jiqw vxwv eufwksdx. qqcra dld.

[' ', 'j', 'x', 'q', 'c', 'k', 'r', 'e', 'u', 'w', 'y', 'i', 'p', 'v', 't', 'h', 'o', 'm', 'a', 'l', 'b', 'n', 'f', 's', 'g', 'd', 'z']
[' ', 'e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'd', 'l', 'u', 'c', 'm', 'f', 'y', 'w', 'g', 'p', 'b', 'v', 'k', 'x', 'q', 'j', 'z']


k asx sskraakp vlu tssgv tfv bssuyerpk y bxwvit yl bdehd trvl otr pwci. a tdu xxaktkrw ap cdihd xwa asd sscx otr lcr swbdvi s kfug wkhe cded, yedv dysb lsybi edxi

In [151]:
def tally_bigram_frequency(s:str) -> Dict[str, float] :
    """Count overlapping letter pairs in ``s`` as a fraction of ``len(s)``.

    A pair is tallied only when both of its characters are alphabetic. Counts
    are divided by the length of ``s``, not by the number of pairs.
    """
    total = len(s)
    counts = {}
    for first, second in zip(s, s[1:]):
        if first.isalpha() and second.isalpha():
            pair = first + second
            counts[pair] = counts.get(pair, 0) + 1
    return {pair: n / total for pair, n in counts.items()}

def apply_bigram_transformation(s:str, d: Dict[str, str]) -> str :
    """Replace every occurrence of a key of ``d`` in ``s`` in a single pass.

    The text is scanned left to right; at each position the first key of
    ``d`` (in dict order) that matches is replaced and scanning resumes after
    it. Replacement text is never re-scanned, so one rule's output cannot be
    rewritten by another rule -- chaining ``str.replace`` calls did that
    whenever a replacement was itself a key (e.g. ``{'ab': 'bc', 'bc': 'xy'}``).

    Args:
        s: text to transform.
        d: mapping from substring (typically a bigram) to its replacement.
            Empty keys are ignored.

    Returns:
        The transformed text.
    """
    keys = [k for k in d if k]  # an empty key would match at every position
    pieces, pos = [], 0
    while pos < len(s):
        for k in keys:
            if s.startswith(k, pos):
                pieces.append(d[k])
                pos += len(k)
                break
        else:
            # No rule applies here: keep the character as is.
            pieces.append(s[pos])
            pos += 1
    return "".join(pieces)

# Compare the most frequent letter bigrams of ciphertext 14 with the same
# number of reference bigrams from get_bigrams().
i = 14
limit = 15  # how many top bigrams to show from each list
s = ctxt(i).decode("utf-8")
freq = tally_bigram_frequency(s)
# Ciphertext bigrams ordered from most to least frequent.
sorted_freq = [p[0] for p in sorted(freq.items(), key=lambda x:x[1], reverse=True)]
print(sorted_freq[:limit])
print(list(get_bigrams(limit).keys()))
# Bigram substitution table; left empty here, so nothing is substituted yet.
d = {
    
}
print()
# Print the raw bytes of ciphertexts 4 and 14 one after the other.
print(ctxt(4))
print()
print(ctxt(14))

['xw', 'qq', 'ju', 'bj', 'tr', 'cj', 'cr', 'jm', 'ot', 'ej', 'lq', 'xc', 'vl', 'ye', 'hj']
['th', 'he', 'in', 'er', 'an', 're', 'es', 'on', 'st', 'nt', 'en', 'at', 'ed', 'nd', 'to']

b"vlyx mxux fwbbgrjx qqg hykcrgluqivcr ajv nzgxfkpg (hxpb)  jp mculavedy mjtx ek qxfeo'x macgjnzjn ghdmcqkhfmqa. axjknxih dld'ti qhznuwysd jp ljymb yirxfcg, mj'x sntc bnhnnc jmxc askw yaqaijo jph jmb bgvljo wgkeyfjvit f pqcvui pnevuy hna yinkp vlu ietg ydiba vlu mlxf.\nxxj rwfihxqjph jmb vcxx gbqkrt ietg, cez pqqybi yn hecnircv mnqq deinz ptsku qqgshd. x ptsku fb c wuy trvl q gfwcvo tmntejnlw, uysm qqcx qsv cys yybvu md yen uij hlvdmdja fkxx yen qtuwxcksd uoxfysj xwqxxjo rvic nk cji ijq, cji eubacxytk ru eixllkejnsn, vlu xbc jei fk rfidyfca ibjjnpx m.w.q cji eubacxytk jph ufzq uij jinoidy eju ed nkegvij.\n"

b"k aqx qqkrankp vlu tqqgv tfv bqquyerpk y bxwvit yl bjehj trvl otr pwci. n tju xxnktkrw np cjihj xwa aqd qqcx otr lcr swbjvi q kfug wkhe cjej, yejv jyqb lqybi ejxi yyp xyr xfpq pycgba. hsh jujotbj vxw

In [155]:
# For every letter-based ciphertext, print its file number followed by its
# ten most frequent characters (most frequent first).
for i in letters:
    s = ctxt(i).decode('utf-8')
    freq = tally_frequency(s)
    print(i, sorted(freq, key=freq.get, reverse=True)[:10], sep=': ')


1: [' ', 'h', 's', 'c', 'g', 'o', 'v', 'f', 'w', 'b']
3: ['f', 'e', 'b', 'r', 't', 'k', 'x', 'n', ' ', 'o']
4: [' ', 'j', 'x', 'i', 'c', 'q', 'n', 'y', 'u', 'e']
6: ['e', ' ', 'x', 'k', 'b', 'r', 'f', 's', 't', 'o']
10: ['t', 's', 'u', 'm', 'l', 'r', 'q', 'e', 'y', 'v']
12: ['t', 's', 'e', 'r', 'l', 'u', 'y', 'v', 'c', 'q']
13: ['e', 'b', 'r', 'f', 'y', ' ', 'z', 'k', 'x', 'g']
14: [' ', 'j', 'x', 'q', 'c', 'k', 'r', 'e', 'u', 'w']


In [159]:
def tally_number_frequency(s:str, window:int) -> Dict[str, float] :
    """Return the relative frequency of every ``window``-long substring of ``s``.

    All overlapping windows of exactly ``window`` characters are counted and
    each count is divided by the number of windows, so the values add up to 1.
    Truncated windows at the end of the string are not counted; they used to
    be tallied as if they were full n-grams.

    Args:
        s: text to analyse.
        window: length of the substrings to count; must be at least 1.

    Returns:
        Mapping from substring to its share of all windows. Empty when ``s``
        is shorter than ``window``.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")
    total = len(s) - window + 1  # number of full-length windows
    if total <= 0:
        return {}
    counts = Counter(s[i:i+window] for i in range(total))
    return {gram: n / total for gram, n in counts.items()}

# For every digit-based ciphertext, print its file number followed by its
# ten most frequent 4-character windows (most frequent first).
for i in numbers:
    s = ctxt(i).decode('utf-8')
    freq = tally_number_frequency(s,4)
    print(i, sorted(freq, key=freq.get, reverse=True)[:10], sep=': ')

2: ['5740', '7406', '7407', '0675', '4067', '7400', '2740', '2357', '4235', '6787']
5: ['5740', '7407', '7406', '2357', '4067', '0675', '3574', '7400', '2740', '7574']
8: ['0000', '9703', '7721', '1146', '7000', '7723', '7323', '7772', '0003', '7181']
11: ['4974', '7497', '1772', '9749', '7721', '7772', '1593', '1711', '7484', '9703']
17: ['7406', '5740', '4067', '1740', '0675', '7400', '2357', '0370', '8740', '7407']
18: ['0000', '7721', '7484', '1463', '7772', '1711', '9703', '1772', '0094', '1686']
19: ['5740', '7407', '7406', '2357', '4235', '0675', '4067', '7423', '7400', '3574']
20: ['7916', '1685', '5314', '3954', '9548', '5484', '4841', '8415', '4159', '1598']
