In [20]:
# useful functions used below

# https://stackoverflow.com/questions/6224052/what-is-the-difference-between-a-string-and-a-byte-string
# to convert from byte -> string use encode
# to convert from string -> byte use decode

def divide_into_blocks(text, blocksize):
    """divide array into blocks"""
    return [text[i:i + blocksize] for i in range(0, len(text), blocksize)]

def readfile(filename):
    """returns the list of lines in a file with newlines stripped"""
    with open(filename) as f:
        # strip off new lines and create a list of the lines in the file
        return list(map(lambda x: x.rstrip(), list(f)))

This document has Python solutions to the cryptopals [set1 problems](https://cryptopals.com/sets/1).

### Challenge 1: Convert hex to base64

In [21]:
# found here https://stackoverflow.com/questions/33704327/hex-to-base64-conversion-in-python/33704357
import codecs

def hex2base64(s):
    return codecs.encode(codecs.decode(s, 'hex'), 'base64').decode()

hex2base64("49276d206b696c6c696e6720796f757220627261696e206c696b65206120706f69736f6e6f7573206d757368726f6f6d")

'SSdtIGtpbGxpbmcgeW91ciBicmFpbiBsaWtlIGEgcG9pc29ub3VzIG11c2hyb29t\n'

### Challenge 2: fixed XOR

Note: in Python, bytes() is immutable, while bytearray() is a mutable version.

In [22]:
def fixed_XOR(a_hex, b_hex):
    """XOR hex strings together and return the resultant hex string"""
    # convert to raw bytes
    assert len(a_hex)==len(b_hex)
    a_bytes = codecs.decode(a_hex, 'hex')
    b_bytes = codecs.decode(b_hex, 'hex')
    
    c = bytearray()
    for i in range(len(a_bytes)):
        c.append(a_bytes[i] ^ b_bytes[i])
    return codecs.encode(c, 'hex')

a = '1c0111001f010100061a024b53535009181c'
b = '686974207468652062756c6c277320657965'

print(fixed_XOR(a,b))

b'746865206b696420646f6e277420706c6179'


### Challenge 3: Single-byte XOR cipher

The solution suggests using a heuristic to identify the non-jibberish string more easily.  I chose to count the number of A-Z,a-z letters in each string as implemented in ```count_letters()``` and playing with the threshold count for this to limit the number of strings.  The resultant string is:

```
88 27 bytearray(b"Cooking MC\'s like a pound of bacon")
```

In [23]:
import string

def count_letters(ba: bytearray):
    """return a count of the letters A-Z,a-z in the bytearray"""
    count = 0
    for b in ba:
        if b in range(ord('a'), ord('z') + 1) or \
           b in range(ord('A'), ord('Z') + 1) or \
           b==' ':
            count += 1
    return count

# https://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii
# https://stackoverflow.com/questions/42064158/checking-if-a-byte-is-ascii-printable

def is_printable(ba):
    printable_chars = set(bytes(string.printable, 'ascii'))
    return all(char in printable_chars for char in ba)

def num_spaces(ba):
    return sum([char==ord(' ') for char in ba])


# TODO fix threshold
def display_strings(s, threshold=26, convert_to_bytes=True):
    # convert input if necessary to bytearray()
    if convert_to_bytes:
        s_bytes = codecs.decode(s, 'hex')
    else:
        s_bytes = s
    for i in range(1, 256):
        candidate = bytearray([b^i for b in s_bytes])
        if is_printable(candidate) and num_spaces(candidate) > 3:
            print('\t', i, count_letters(candidate), is_printable(candidate), num_words(candidate), candidate)

s_hex = '1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736'
display_strings(s_hex, convert_to_bytes=True)

NameError: name 'num_words' is not defined

### Challenge 4: Detect single-string XOR

The 171-st string contains the string "Now that the party is jumping".

```
171:	7b5a4215415d544115415d5015455447414c155c46155f4058455c5b523f
	 3 24 bytearray(b'xYA\x16B^WB\x16B^S\x16FWDBO\x16_E\x16\\C[F_XQ<')
...
	 53 24 bytearray(b'Now that the party is jumping\n')
...
```

Interestingly, there are many other strings with higher counts.  I should think of a better heuristic...

In [24]:
lines = readfile('4.txt')

i = 1
for line in lines:
    print(f'{i}:\t{line}')
    display_strings(line, 23)
    i += 1

1:	0e3647e8592d35514a081243582536ed3de6734059001e3f535ce6271032
2:	334b041de124f73c18011a50e608097ac308ecee501337ec3e100854201d
3:	40e127f51c10031d0133590b1e490f3514e05a54143d08222c2a4071e351
4:	45440b171d5c1b21342e021c3a0eee7373215c4024f0eb733cf006e2040c
5:	22015e420b07ef21164d5935e82338452f42282c1836e42536284c450de3
6:	043b452e0268e7eb005a080b360f0642e6e342005217ef04a42f3e43113d
7:	581e0829214202063d70030845e5301f5a5212ed0818e22f120b211b171b
8:	ea0b342957394717132307133f143a1357e9ed1f5023034147465c052616
9:	0c300b355c2051373a051851ee154a023723414c023a08171e1b4f17595e
10:	550c3e13e80246320b0bec09362542243be42d1d5d060e203e1a0c66ef48
11:	e159464a582a6a0c50471310084f6b1703221d2e7a54502b2b205c433afa
12:	ec58ea200e3005090e1725005739eda7342aed311001383fff7c58ef1f11
13:	01305424231c0d2c41f105057f74510d335440332f1038ec17275f5814e1
14:	05f12f380720ea2b19e24a07e53c142128354e2827f25a08fb401c3126a6
15:	0d17272f53063954163d050a541b1f1144305ae37d4932431b1f33140b1b
16:	0b4f070f071fe92c200e1fa05e4b27

146:	7e5a19250a5e152b46f5130a094cef08e84704ef10197324464b0114017a
147:	3b56f126390008343d3c400232ed201667211f0b1a1413080202530b08e2
148:	4912321b61c90a0cf6ef0a0a0c0f17fa62eb385e2616194526701aff5fe6
149:	2c57114b0400152d4f2aeb18ed41386c2e3a023a281d1a311eefe750ebab
150:	3a4353282114593b3e36446d2c5e1e582e335337022930331f211604576a
151:	295f3bfae9271ae8065a3b4417545c3e5b0df11a53351c78530915392d2e
152:	074a122ee01b17131e4e124e2322a9560ce4120e37582b24e1036fe93f30
153:	3c08290121090ef72f25e4f220323444532d3fe71f34553c7b2726131009
154:	12e84a3308590357a719e74c4f2133690a20031a0b045af63551325b1219
155:	0e3d4fe03f56523cf40f29e4353455120e3a4f2f26f6a30a2b3e0c5b085a
156:	57f3315c33e41c0f523426232d0651395c1525274e314d0219163b5f181f
157:	53471622182739e9e25b473d74e1e7023d095a3134e62d1366563004120e
158:	230a06431935391d5e0b5543223a3bed2b4358f555401e1b3b5c36470d11
159:	22100330e03b4812e6120f163b1ef6abebe6f602545ef9a459e33d334c2a
160:	463405faa655563a43532cfe154bec32fe3345eb2c2700340811213e5006
161:	14241

NameError: name 'num_words' is not defined

### Challenge 5: Implement Repeating-Key XOR

To convert a string to a list of bytes, use the encode function:
```
s_bytes = s.encode()
```

In [25]:
def repeating_key_XOR(plaintext: str, key: str):
    """given a plaintext and key, return a hex-encode ciphertext comprised of the repeating-key XOR"""
    
    # https://stackoverflow.com/questions/7585435/best-way-to-convert-string-to-bytes-in-python-3
    key_bytes = key.encode()
    plaintext_bytes = plaintext.encode()
    
    i = 0
    ciphertext_bytes = bytearray()
    for b in plaintext_bytes:
        ciphertext_bytes.append(b ^ key_bytes[i])
        i = (i + 1) % len(key_bytes)

    return codecs.encode(ciphertext_bytes, 'hex')

plaintext = 'Burning \'em, if you ain\'t quick and nimble\nI go crazy when I hear a cymbal'
key = 'ICE'
repeating_key_XOR(plaintext, key)

b'0b3637272a2b2e63622c2e69692a23693a2a3c6324202d623d63343c2a26226324272765272a282b2f20430a652e2c652a3124333a653e2b2027630c692b20283165286326302e27282f'

### Challenge 6: Break repeating-key XOR

Compute the hamming distance between two strings

In [26]:
def hamming(a, b, convert_to_bytes=True):
    """return the hamming distance between two strings"""
    
    def bit_diff(a_byte, b_byte):
        """return the number of bits different between two bytes"""
        return sum([(a_byte >> j) & 0x1 != (b_byte >> j) & 0x1 for j in range(8)])
    
    assert len(a)==len(b)
    
    if convert_to_bytes:
        a, b = a.encode(), b.encode()
    
    return sum([bit_diff(a[i], b[i]) for i in range(len(a))])

assert hamming('this is a test', 'wokka wokka!!!', True)==37

Read in the file and convert from base64 to byte array

In [27]:
import base64

# get the text as a list of bytes
with open('6.txt') as f:
    # strip off new lines and create a list of the lines in the file
    lines = map(lambda x: x.rstrip('\n'), list(f))

# join the lines and decode as base64 to bytearray
text = base64.b64decode(''.join(lines))

Follow steps 3 and 4

In [28]:
# step 3: For each KEYSIZE, take the first KEYSIZE worth of bytes,
# and the second KEYSIZE worth of bytes, and find the edit distance between them.
#Normalize this result by dividing by KEYSIZE
l_edit_distance = []
for keylength in range(1, 40):
    a = text[0:keylength + 1]
    b = text[keylength:2*keylength + 1]
    l_edit_distance.append((keylength, hamming(a, b, False)/keylength))

# step 4: The KEYSIZE with the smallest normalized edit distance is probably the key.
# You could proceed perhaps with the smallest 2-3 KEYSIZE values. Or take 4 KEYSIZE blocks
# instead of 2 and average the distances
l_edit_distance.sort(key=lambda x: x[1])
for d in l_edit_distance[0:10]:
    print(d)

(5, 1.6)
(20, 2.75)
(13, 2.769230769230769)
(38, 2.973684210526316)
(18, 3.0)
(11, 3.090909090909091)
(16, 3.125)
(37, 3.1621621621621623)
(17, 3.176470588235294)
(8, 3.25)


In [29]:
# step 5: Now that you probably know the KEYSIZE: break the ciphertext into blocks of KEYSIZE length.

def divide_by_keysize(text, keysize):
    return [text[i:i + keysize] for i in range(0, len(text), keysize)]
    

#print(len(text))
#list(map(len, divide_by_keysize(text, 5)))


In [30]:
def transpose(l_block, keysize):
    def get_kth_byte(l_block, k):
        l_word = bytearray()
        for b in l_block:
            try:
                l_word.append(b[k])
            except:
                break
        return l_word
    
    return [get_kth_byte(l_block, k) for k in range(keysize)]

text_div = divide_by_keysize(text, 20)
l_strings = transpose(text_div, 20)

print(display_strings(l_strings[19], 90, False))


None


### Challenge 7: AES in ECB mode

This [link](https://techtutorialsx.com/2018/04/09/python-pycrypto-using-aes-128-in-ecb-mode/) has a good tutorial on using AES in ECB mode.

This challenge required a number of conversions:
1. a string of base-64 to a list of bytes of base-64 encoded data.  convert with ```lines.encode()```
2. bytes of base-64 encoded data to raw bytes.  convert with ```codecs.decode(..., 'base64')```
3. raw bytes to convert to a string ```string.decode('utf-8')```

In [15]:
from Crypto.Cipher import AES
import codecs


# the text file is base64 which needs to be an array of bytes
with open('7.txt') as f:
    # strip off new lines and create a list of the lines in the file
    lines = list(map(lambda x: x.rstrip('\n'), list(f)))
    lines = ''.join(lines)
    
    # lines.encode() converts string to bytes
    # codes.decode(_, 'base64') converts base64 encoded bytes to regular bytes
    text = codecs.decode(lines.encode(), 'base64')


key = "YELLOW SUBMARINE"
decipher = AES.new(key, AES.MODE_ECB)
print(decipher.decrypt(text).decode('utf-8'))

I'm back and I'm ringin' the bell 
A rockin' on the mike while the fly girls yell 
In ecstasy in the back of me 
Well that's my DJ Deshay cuttin' all them Z's 
Hittin' hard and the girlies goin' crazy 
Vanilla's on the mike, man I'm not lazy. 

I'm lettin' my drug kick in 
It controls my mouth and I begin 
To just let it flow, let my concepts go 
My posse's to the side yellin', Go Vanilla Go! 

Smooth 'cause that's the way I will be 
And if you don't give a damn, then 
Why you starin' at me 
So get off 'cause I control the stage 
There's no dissin' allowed 
I'm in my own phase 
The girlies sa y they love me and that is ok 
And I can dance better than any kid n' play 

Stage 2 -- Yea the one ya' wanna listen to 
It's off my head so let the beat play through 
So I can funk it up and make it sound good 
1-2-3 Yo -- Knock on some wood 
For good luck, I like my rhymes atrocious 
Supercalafragilisticexpialidocious 
I'm an effect and that you can bet 
I can take a fly girl and make her wet. 


## Challenge 8: Detect AES in ECB mode

The duplicate line is:

```
d880619740a8a19b7840a8a31c810a3d08649af70dc06f4fd5d2d69c744cd283e2dd052f6b641dbf9d11b0348542bb5708649af70dc06f4fd5d2d69c744cd2839475c9dfdbc1d46597949d9c7e82bf5a08649af70dc06f4fd5d2d69c744cd28397a93eab8d6aecd566489154789a6b0308649af70dc06f4fd5d2d69c744cd283d403180c98c8f6db1f2a3f9c4040deb0ab51b29933f2c123c58386b06fba186a
```

In [267]:
with open('8.txt') as f:
    # strip off new lines and create a list of the lines in the file
    lines = list(map(lambda x: x.rstrip('\n'), list(f)))

def divide_into_blocks(text, blocksize):
    """divide string int blocks"""
    return [text[i:i + blocksize] for i in range(0, len(text), blocksize)]

def duplicates(l: list):
    """returns true if there are duplicates in the list l"""
    return len(l)!=len(set(l))

for line in lines:
    # note: for a 16B block used in a AES, the blocksize should be 2*16, since the file
    # is in hexadecimal which represents bytes by 2 characters
    b = divide_into_blocks(line, blocksize=32)
    # print(b)
    if duplicates(b):
        print('duplicate:', line, b)

duplicate: d880619740a8a19b7840a8a31c810a3d08649af70dc06f4fd5d2d69c744cd283e2dd052f6b641dbf9d11b0348542bb5708649af70dc06f4fd5d2d69c744cd2839475c9dfdbc1d46597949d9c7e82bf5a08649af70dc06f4fd5d2d69c744cd28397a93eab8d6aecd566489154789a6b0308649af70dc06f4fd5d2d69c744cd283d403180c98c8f6db1f2a3f9c4040deb0ab51b29933f2c123c58386b06fba186a ['d880619740a8a19b7840a8a31c810a3d', '08649af70dc06f4fd5d2d69c744cd283', 'e2dd052f6b641dbf9d11b0348542bb57', '08649af70dc06f4fd5d2d69c744cd283', '9475c9dfdbc1d46597949d9c7e82bf5a', '08649af70dc06f4fd5d2d69c744cd283', '97a93eab8d6aecd566489154789a6b03', '08649af70dc06f4fd5d2d69c744cd283', 'd403180c98c8f6db1f2a3f9c4040deb0', 'ab51b29933f2c123c58386b06fba186a']
