In [2]:
import os
from typing import List, Dict
from collections import Counter
import string

letters = [1,3,4,6,10,12,13,14]
numbers = [2,5,8,11,17,18,19,20]

# src: http://pi.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html

letter_freq_ref = {' ': 18.2884627, 'e': 10.2666504, 't': 7.5169982, 'a': 6.5321669, 'o': 6.1595773, 'n': 5.7120111, 'i': 5.6684433, 's': 5.3170053, 'r': 4.9879086, 'h': 4.978563, 'l': 3.317548, 'd': 3.2829231, 'u': 2.2757954, 'c': 2.233676, 'm': 2.0265678, 'f': 1.9830672, 'w': 1.7038938, 'g': 1.6249044, 'p': 1.5043243, 'y': 1.4276666, 'b': 1.2588807, 'v': 0.7961164, 'k': 0.5609627, 'x': 0.1409202, 'j': 0.0975218, 'q': 0.0836755, 'z': 0.0512846}
letter_freq_ref_list = [' ', 'e','t', 'a','o','i','n','s','r','h','d','l','u','c','m','f','y','w','g','p','b','v','k','x','q','j','z']


In [3]:
# return ctxts (in bytearray) given the file number
def ctxt(file_no:int) -> bytearray : 
    path = 'ctxts/%02d.txt' % file_no
    ctxtfile = open(path)
    ctxt_str = ctxtfile.read()
    ctxt_bytes = ctxt_str.encode('utf-8')
    return ctxt_bytes

## Group 1: Text - Shift
### CTXT: 1

While screening for potential shift cipher encoded ciphertext using exhaustive key search, we are able to decode the message in 1 by shifting each alphabet by 12.

In [10]:
# perform shift on a byte, ignore non letters
def shift_letter(l:int, shift:int) -> int:
    if l<97 or l>122: # a=97, z=122
        return l
    elif l+shift > 122:
        return l+shift-26
    elif l+shift < 97:
        return l+shift+26
    else: 
        return l+shift

# decrypt shift ciper
def shift(ctxt:bytearray, shift:int, limit:int = -1):
    limit = len(ctxt) if limit<0 else limit
    for i in range(limit):
        print(chr(shift_letter(ctxt[i], shift)), end='')
    print()


shift(ctxt(1), 12)

hey alex, you've got to stop reusing that one time pad. the students are onto us. let's switch to a stronger cipher. david



## Group 7 Text - Substitution
### CTXT: 10, 12
First we observe that the punctuation seems to be unmodified. Because t is the most common letter and it often appears after commas, we then substitute t for space. The resulting text has reasonable spacing in terms of word length and space placement. Since the letter frequency of 10 and 12 are very similar, we group these two ciphertexts together.

We then matched the second most frequent letter in these sequences, `s` with the most frequent letter `e` in our reference text. While the results are not immediately clear, we observed that the sequence `mvs` is the most common trigram in the sequences. We also noticed that `mvsm` appears in the text. These observations led us to predict `m` to be `s` since it is very possible that the latter is a plural form of the former. Together, we assumed the `mvs` and `mvsm` sequences to be `s*e` and `s*es`. This leaves 2 possibilities for the substitution of `v`, `a` and `h`. We choose `h` because `h` has a more similar reference frequency with `v`.

After that we simply look for short phrases with most letters guessed, and test out potential substitutions one by one. However, in order to make sense of the most words contain the substitution, the word `scum` is substituted to `skum`. We assume that this is a misspell.

In [23]:
s10 = ctxt(10).decode()
s12 = ctxt(12).decode()

def tally_frequency(s):
    d, l = {}, 0
    for c in s:
        if c.isalpha() or c==' ':
            d[c] = d.get(c, 0)+1
            l+=1
    return {k:v/l*100 for (k, v) in d.items()}

def apply_transformation(s, d, colored=False):
    res, i = "", 0
    while i < len(s):
        replacement = d.get(s[i], None)
        if replacement != None:
            if colored:
                res+="\x1b[31m"+replacement+"\x1b[0m"
            else:
                res+=replacement
        else:
            res+=s[i]
        i+=1
    return res

def rank_frequency(d):
    return [s[0] for s in sorted(d.items(), key=lambda x: x[1], reverse=True)]


d = {
    't':' ',
    's':'e',
    ' ':'c',
    'l':'a',
    'u':'t', 
    'r':'o',
    'm':'s',
    'v':'h',
    'o':'p',
    'd':'m',
    'x':'y',
    'p':'b',
    'a':'u',
    'w':'w',
    'f':'v',
    'q':'r',
    'e':'n',
    'y':'i',
    'i':'g',
    'c':'d',
    'h':'k',
    'z':'f',
    'n':'l',
    'j':'h',
    'g':'j',
    'b':'q'
}

s10_1 = apply_transformation(s10, d)
s12_1 = apply_transformation(s12, d)
print(s10_1)
print(s12_1)


my pet bat has the coronavirus, but shes asymptomatic.
however, bats arent in favour right now so shes keeping her little head down.
hidden beneath her folded wings, she tucks her furrowed snout  patient hero in a world frothing to place blame.
she tells everyone that she cant infect humans, but no ones listening to facts.
she posts articles daily on social media. headlines such as:
bats not to blame: intermediate host required for human infection from sarscov2 virus.
she writes were just a natural reservoir of the disease, the pangolins are the ones being irresponsible! bats are victims too! with a picture of her wearing a face mask, looking mournful.
the pathos is wellintended but not well received.
dirty bat!
goddamn bat skum!
angry comments flow beneath all her posts.
she hangs around our bedroom, sullen  our housemates isolating her.
we watch true blood reruns and eat dry toast.

we had decided it would be best for me to stay home with lucas while my husband nate went to work; his

In [863]:
b7 = ctxt(7)
b9 = ctxt(9)
b15 = ctxt(15)
b16 = ctxt(16)

def ba_bs(c):
    res = []
    for i in range(0, len(c), 2):
        res.append(int(c[i:i+2], 16))
    return res

def xor(bs1, bs2):
    res = []
    for byte1, byte2 in zip(bs1, bs2):
        res.append(byte1^byte2)
    return res

def str_bs(s):
    return [ord(c) for c in s]

def bs_str(bs):
    return ''.join([chr(b) for b in bs])

def bs_hex(bs):
    return ["%02x" % b for b in bs]

    
def crib(guess, bs1, bs2):
    l = len(guess)
    bs12 = xor(bs1, bs2)
    for i in range(len(bs1)):
        msg = bs_str(xor(bs12[i:i+l], str_bs(guess)))
        p = True
        for char in msg:
            if char not in string.ascii_letters+' ,.\n': 
                p = False
        if p and msg:
            print("{}-{}\t".format(i, i+l), repr(msg))
            

def xcrib(guess, no, bl, i):
    l = len(guess)
    n = len(bl)
    bs_msgs = ["" for _ in range(n)]
    bs_msgs[no] = str_bs(guess)
    for dj in range(n):
        bs_msgs[(no+dj+1)%n] = xor(
            xor(bl[(no+dj)%n], bl[(no+dj+1)%n])[i:i+l], 
            bs_msgs[(no+dj)%n])
    print("Position: {}-{}".format(i, i+l))
    for j in range(n):
        print("{}: {}".format(j, bs_str(bs_msgs[j])))
    

guess = ['','','','']
i = 76
guess[0] = "onking of the way in which one variable depends on the\nother: for instance, we think of the way "
guess[1] = "c said, \"hey, how much do i have to pay you to let me swim in the shark tank?\" i said, \"jason, i"
guess[2] = "how? everybody expects you to win, but nobody ever asks if youre okay. i said, are you okay, juan"
guess[3] = "b with rates of growth. we classify all quantities into two classes:\nconstants and variables. tho"
n = 0
for k in range(4):
    if len(guess[k]) > len(guess[n]): n = k
bl = [ba_bs(b7), ba_bs(b9),ba_bs(b15), ba_bs(b16)]
xcrib(guess[n], n, bl, i)

Position: 76-173
0: onking of the way in which one variable depends on the
other: for instance, we think of the way i
1: c said, "hey, how much do i have to pay you to let me swim in the shark tank?" i said, "jason, i 
2: how? everybody expects you to win, but nobody ever asks if youre okay. i said, are you okay, juan
3: b with rates of growth. we classify all quantities into two classes:
constants and variables. tho


In [655]:


print(bs_hex(xor(ba_bs(b7), ba_bs(b9)))[:10])


bytearray.fromhex("4f")

['04', '4f', '1f', '00', '1b', '56', '16', '07', '5c', '45']


bytearray(b'O')

In [552]:
m1 = "Hello World"
k  = "supersecret"
m2 = "python good"

m1_l = [ord(c) for c in m1]
m2_l = [ord(c) for c in m2]
k_l = [ord(c) for c in k]

c1 = xor(m1_l, k_l)
c2 = xor(m2_l, k_l)
c = xor(c1, c2)

guess = "hello "
drag(guess, -1, c1, c2)
c1

0 	 'Python'
6 	 '\x1fmqoo'
8 	 'ufl'
10 	 'h'


[59, 16, 28, 9, 29, 83, 50, 12, 0, 9, 16]

In [15]:
# 8,11,18, 20


def ngram_frequency(b, n):
    freq = {}
    cnt = 0
    for i in range(len(b)):
        trigram = b[i:i+n]
        freq[trigram] = freq.get(trigram, 0) + 1
        cnt+=1
    for k in freq.keys():
        freq[k] = freq[k]/cnt*100
    return freq

def top_freq(freq):
    return [kv for kv in sorted(freq.items(), key=lambda x: x[1], reverse=True)]

freq_all = ngram_frequency(ctxt(8)+ctxt(11)+ctxt(18)+ctxt(20), 2)
top_freq(freq_all)

[(b'11', 2.291395216754246),
 (b'77', 2.2221904039174696),
 (b'70', 2.0112350434081225),
 (b'68', 1.9902349622714453),
 (b'00', 1.916257403721787),
 (b'17', 1.888575478587076),
 (b'31', 1.863279926308806),
 (b'71', 1.7076884160688803),
 (b'61', 1.6795292163628814),
 (b'16', 1.6117562272399688),
 (b'23', 1.6055516578132236),
 (b'04', 1.5917106952458682),
 (b'38', 1.5229831569803791),
 (b'87', 1.4995967029872614),
 (b'67', 1.4542556187148905),
 (b'88', 1.4480510492881449),
 (b'13', 1.4375510087198065),
 (b'18', 1.4246645952950272),
 (b'46', 1.423232771581163),
 (b'72', 1.4093918090138076),
 (b'73', 1.3478233893176406),
 (b'32', 1.3196641896116417),
 (b'81', 1.308209599900727),
 (b'96', 1.2958004610472358),
 (b'97', 1.291982264476931),
 (b'84', 1.2805276747660161),
 (b'60', 1.230413844780764),
 (b'50', 1.211322861929239),
 (b'47', 1.169322699655885),
 (b'79', 1.1507089913756485),
 (b'53', 1.144027147377615),
 (b'83', 1.1130043002438872),
 (b'05', 1.0982087885339558),
 (b'25', 1.0962996902

In [24]:
# return ctxts (in bytearray) given the file number
def ctxt(file_no:int) -> bytearray : 
    path = 'ctxts/%02d.txt' % file_no
    ctxtfile = open(path)
    ctxt_str = ctxtfile.read()
    ctxt_bytes = ctxt_str.encode('utf-8')
    return ctxt_bytes

b7 = ctxt(7)
b9 = ctxt(9)
b15 = ctxt(15)
b16 = ctxt(16)

def ba_bs(c):
    res = []
    for i in range(0, len(c), 2):
        res.append(int(c[i:i+2], 16))
    return res

def xor(bs1, bs2):
    res = []
    for byte1, byte2 in zip(bs1, bs2):
        res.append(byte1^byte2)
    return res

def str_bs(s):
    return [ord(c) for c in s]

def bs_str(bs):
    return ''.join([chr(b) for b in bs])

def bs_hex(bs):
    return ["%02x" % b for b in bs]

    
def crib(guess, bs1, bs2):
    l = len(guess)
    bs12 = xor(bs1, bs2)
    for i in range(len(bs1)):
        msg = bs_str(xor(bs12[i:i+l], str_bs(guess)))
        p = True
        for char in msg:
            if char not in string.ascii_letters+' ,.\n': 
                p = False
        if p and msg:
            print("{}-{}\t".format(i, i+l), repr(msg))
            

def xcrib(guess, no, bl, i):
    l = len(guess)
    n = len(bl)
    bs_msgs = ["" for _ in range(n)]
    bs_msgs[no] = str_bs(guess)
    for dj in range(n):
        bs_msgs[(no+dj+1)%n] = xor(
            xor(bl[(no+dj)%n], bl[(no+dj+1)%n])[i:i+l], 
            bs_msgs[(no+dj)%n])
    print("Position: {}-{}".format(i, i+l))
    for j in range(n):
        print("\n{}: ".format(j)+ bs_str(bs_msgs[j]))
        

bl = [ba_bs(b9), ba_bs(b16)]

def drag(guess, starting_pos):
    n = 0
    for k in range(2):
        if len(guess[k]) > len(guess[n]): n = k   
    print("Using guess[{}] as reference".format(n))
    xcrib(guess[n], n, bl, starting_pos)
    

    
guess = ["" for _ in range(2)]

# OTP


Goal: stretch the strings as long as possible while they still make sense.
1. Look at the four guesses, if you see a potential expension in any one of them, expand that guess and run the cell
2. Check the output to see if all 4 guesses still make sense. If so, keep expanding. If not, go back and try something else.
3. If you are expanding forward, no need to worry about `starting_pos`. Else deduct the number of characters you expanded in front from `starting_pos`.


In [25]:
starting_pos = 0


g = "i met super smash bros melee legend jason at an aquarium i was working at. he said, \"hey, how much \
do i have to pay you to let me swim in the shark tank?\" i said, \"jason, i \
love and respect you, but you know you wouldn't last ten minutes in that cage.\" he let out a deep sigh and said\
, \"would you say that to 2007 jason?\" i said, \"2007 jason was a legend, but you know who my favorite jason is? \
the one right in front of me.\" he said, \"well, if i'm your favorite, I'm gonna need the keys to the shark tank.\
\" i pulled out the key and said, \"do be careful, jason\""


print(g)
print(len(g)*2)
print(len(ctxt(9)))
print(len(ctxt(16)))


i met super smash bros melee legend jason at an aquarium i was working at. he said, "hey, how much do i have to pay you to let me swim in the shark tank?" i said, "jason, i love and respect you, but you know you wouldn't last ten minutes in that cage." he let out a deep sigh and said, "would you say that to 2007 jason?" i said, "2007 jason was a legend, but you know who my favorite jason is? the one right in front of me." he said, "well, if i'm your favorite, I'm gonna need the keys to the shark tank." i pulled out the key and said, "do be careful, jason"
1122
1122
1034
