In [1]:
from typing import Set, Dict, List
import numpy as np

In [2]:
# Ciphertext analysed in this notebook. It is written as adjacent string
# literals concatenated with `+` (with `\` line continuations) purely to keep
# the source lines short; the value is one single string.
cipher_text = "yysjxbysjxnkwkpjucfhyeswwyysotqcvwbxrzdinessgizoetugzwhwtvskidmsiucslkjvgobqcbwo"\
            + "jiuwmftfglxrbvxfkhzxhfbwlzerwkmzghbsvgamjczzbrysggqpgsrxxcgwkvbuxxdocxlfcvgjzuzut"\
            + "igkiwzbybxkvwfnjqzbjwwffjrbvbbfbvxwztavtlzvizkofhyzcfbsywkajrroaduclajpasdjxcgwsvw"\
            + "yagffkbxehambysjxyysjxfisuhbjpmmmvfwmmvfwtwvbgvtngtxkffwbglwdmfnodenuokdyfyfhbnvsm"\
            + "nnokpfczaglzbgkbrzdbsxcmmferlhbycebbrgdbpvhgznmsgykvbkxxfawmmzbymmvfwbxkvsmifskgyc"\
            + "cnxfnodeyyolpfehkbyucogntcmeijoqxqmskmtyweuzkwllsfhweavgwqfthdrferawwrhzxwysktnuwl"\
            +"ytivafxvzxbxvszbrkvwkjsfaglzbytxkcfxliokijutakrcmtryyslhuzbwthyvsgicwcxfecdwxkcfxxr"\
            + "jszjrfexiysehavgagirfcgjjgslnkgwxrjhgfjeclhkncgwxfbdrferlajjvswjftlkjvgzxbzzdgtkug"\
            + "ujywfwmzgxtyysjlxrmaglrbvajcwcxxyonbsxhzhzxvlhkzhkhbvzdaj"\
            + "joqlfxoaglfcvyjeqwlrrywztfrfxnxvthwj"

In [3]:
# Reference probability of each lowercase letter, used to score candidate key
# letters. NOTE(review): the values look like standard English letter
# frequencies -- confirm against the table they were copied from.
proba_distrib: Dict[str, float] = {
    "a": 0.082, "b": 0.015, "c": 0.028, "d": 0.043, "e": 0.127,
    "f": 0.022, "g": 0.020, "h": 0.061, "i": 0.070, "j": 0.002,
    "k": 0.008, "l": 0.040, "m": 0.024, "n": 0.067, "o": 0.075,
    "p": 0.019, "q": 0.001, "r": 0.060, "s": 0.063, "t": 0.091,
    "u": 0.028, "v": 0.010, "w": 0.023, "x": 0.001, "y": 0.020,
    "z": 0.001,
}
# Iterating a dict yields its keys, so this is the sorted list of the letters.
alphabet = sorted(proba_distrib)

In [4]:
def compute_n_grams_occurences(text: str, n: int) -> Dict[str, List[int]]:
    """
    Returns a dictionary that maps every substring of text of length n to the
    list of indices at which that substring starts.

    The indices of each substring are stored in increasing order, and the
    substrings appear in the dictionary in order of first occurrence.
    If text is shorter than n, the dictionary is empty.

    n is expected to be a positive integer.
    """
    n_grams_occurences: Dict[str, List[int]] = dict()
    # The last n-gram starts at len(text) - n, hence the "+ 1": without it the
    # n-gram that ends on the final character of text would never be visited.
    for index in range(len(text) - n + 1):
        n_gram = text[index: index + n]
        n_grams_occurences.setdefault(n_gram, []).append(index)

    return n_grams_occurences

We are only going to keep the trigrams that occur at least 3 times, to make sure we didn't stumble onto random repetitions.

In [5]:
# Trigrams of the ciphertext that occur at least three times, mapped to the
# indices at which they start.
keys_repeated = {
    key: occurences
    for key, occurences in compute_n_grams_occurences(cipher_text, 3).items()
    if len(occurences) > 2
}

keys_repeated

{'yys': [0, 25, 260, 540, 665],
 'ysj': [1, 6, 256, 261, 666],
 'sjx': [2, 7, 257, 262],
 'rbv': [91, 186, 676],
 'hzx': [97, 477, 694],
 'cgw': [137, 237, 617],
 'zby': [166, 386, 516],
 'mmv': [274, 279, 389],
 'mvf': [275, 280, 390],
 'vfw': [276, 281, 391],
 'agl': [333, 513, 673, 718],
 'fer': [350, 470, 625],
 'rfe': [469, 576, 624]}

Now we're going to compute, for each repeated trigram, the distances between its first occurrence and each of its later occurrences.

In [6]:
def compute_distances(occurences_dict: Dict[str, List[int]]) -> Dict[str, List[int]]:
    """
    For every ngram, computes the distance between its first occurrence and
    each of its other occurrences.

    The first occurrence is taken to be the first index of the list, i.e. the
    lists of indices are assumed to be stored in increasing order.
    The later indices go through a set before the distances are computed, so
    repeated indices collapse into one distance and the order of the distances
    is the iteration order of that set, not necessarily increasing.

    Ngrams that occur fewer than twice are left out of the result.
    Returns the dictionary that maps each remaining ngram to its list of
    distances.
    """
    return {
        ngram: [later - indexes[0] for later in set(indexes[1:])]
        for ngram, indexes in occurences_dict.items()
        if len(indexes) >= 2
    }

# Distances between the first and the later occurrences of each repeated trigram
distances_dict = compute_distances(keys_repeated)
distances_dict

{'yys': [25, 665, 260, 540],
 'ysj': [255, 665, 260, 5],
 'sjx': [255, 260, 5],
 'rbv': [95, 585],
 'hzx': [380, 597],
 'cgw': [480, 100],
 'zby': [220, 350],
 'mmv': [115, 5],
 'mvf': [5, 115],
 'vfw': [5, 115],
 'agl': [180, 340, 385],
 'fer': [275, 120],
 'rfe': [107, 155]}

In [7]:
# Every repeated trigram proposes one candidate key length: the greatest
# common divisor of all its distances.
possible_key_lengths = set()

for distances in distances_dict.values():
    gcd = np.gcd.reduce(distances)
    # A GCD of 1 gives no information about the key length, so it is skipped
    if gcd <= 1:
        continue
    possible_key_lengths.add(gcd)

possible_key_lengths

{5, 10, 20}

The smallest candidate key length is 5, and the other candidates (10 and 20) are multiples of it, so let's try solving the problem with a key length of 5.

In [8]:
def compute_occurences(text, alphabet) -> Dict[str, int]:
    """
    Computes the occurences of each character.

    text is any iterable of characters (a string or a list of characters) and
    alphabet is the iterable of characters that must all appear in the result.

    Returns a dictionary that maps every character of alphabet to the number
    of times it appears in text (0 if it never does), with the keys in the
    same order as alphabet. Characters of text that are not part of alphabet
    are counted too and are added after the alphabet, in order of first
    appearance.
    """
    # Start every letter of the alphabet at zero, whatever the alphabet's size
    occurences_dict = dict.fromkeys(alphabet, 0)
    for letter in text:
        occurences_dict[letter] = occurences_dict.get(letter, 0) + 1

    return occurences_dict





In [9]:
def y_matrix(cipher_text: str, key_length: int) -> List[List[str]]:
    """
    Splits the cipher text into key_length rows.

    The character at position p goes to row p % key_length, so each row
    contains the letters that are supposedly encrypted by the same letter of
    the key, in their order of appearance in the cipher text.
    """
    # One independent empty row per position in the key
    rows: List[List[str]] = [[] for _ in range(key_length)]

    # Deal the characters out to the rows in round-robin fashion
    for position, symbol in enumerate(cipher_text):
        rows[position % key_length].append(symbol)

    return rows
# Rows of the ciphertext for a key of length 5 (the key length tried first)
y_ma = y_matrix(cipher_text, 5)


In [10]:
def compute_mg(yi: List[str], cipher_text: str, \
               proba_distrib: Dict[str, float], key_length: int) -> List[float]:
    """
    Computes the Mg quantity of a line of letters supposedly encoded by the
    same letter, for every possible shift g.

    For a shift g, Mg is the sum over the letters of the alphabet of
    p_i * f_(i+g), divided by the number of letters in yi, where p_i is the
    reference probability of the i-th letter and f_(i+g) is the number of
    occurrences in yi of the letter located g positions further in the
    alphabet (wrapping around).

    Parameters:
        yi: the letters to analyse.
        cipher_text: not used; kept so that existing calls keep working.
        proba_distrib: maps each letter of the alphabet to its reference
            probability. The alphabet is the sorted list of its keys.
        key_length: not used; kept so that existing calls keep working.

    Returns the list of the Mg values, the value at index g being the one of
    the shift g. If yi is empty, every value is 0.0.
    """
    # The letters are stored in alphabetical order
    alphabet = sorted(proba_distrib)
    alphabet_size = len(alphabet)

    # Normalise by the actual length of yi: the rows of the matrix do not all
    # have the same length when the text length is not a multiple of the key
    # length.
    n_ = len(yi)
    if n_ == 0:
        return [0.0] * alphabet_size

    occurences_in_yi: Dict[str, int] = compute_occurences(yi, alphabet)
    # Both tables are looked up by letter, once, so that position i always
    # refers to the same letter in each of them, whatever the insertion order
    # of proba_distrib and even if yi contains symbols outside the alphabet.
    probabilities = [proba_distrib[letter] for letter in alphabet]
    frequencies_in_yi = [occurences_in_yi.get(letter, 0) for letter in alphabet]

    mg_vector = []
    for g in range(alphabet_size):
        # here we are iterating over the possible keys denoted by g
        mg = 0
        for i in range(alphabet_size):
            # here we are iterating over the letters of the alphabet
            mg += probabilities[i] * frequencies_in_yi[(i + g) % alphabet_size]
        mg_vector.append(mg / n_)

    return mg_vector

In [11]:
key_length = 5
key = []
for i in range(key_length):
    # Score every possible shift for the i-th row of the matrix
    mg_vect = compute_mg(y_ma[i], cipher_text, proba_distrib, key_length)
    max_mg = max(mg_vect)
    # The first shift that reaches the best score gives the i-th key letter
    index_max = mg_vect.index(max_mg)
    key.append(alphabet[index_max])

key


['f', 'r', 'o', 's', 't']

Now that we have the key, let's decrypt the cipher_text:

In [12]:
def decrypt(ciphertext, key):
    """
    Decrypts ciphertext with the repeating key and returns the plaintext.

    The character code of the matching key letter (the key is repeated as many
    times as needed) is subtracted from the code of each ciphertext character,
    modulo 26, and the result is mapped onto the uppercase letters: 0 gives
    'A', 1 gives 'B', and so on.
    """
    shifts = [ord(symbol) for symbol in key]
    period = len(shifts)
    codes = [ord(symbol) for symbol in ciphertext]

    letters = []
    for position, code in enumerate(codes):
        # 65 is the character code of 'A'
        letters.append(chr((code - shifts[position % period]) % 26 + 65))

    return ''.join(letters)
# Decrypt the whole ciphertext with the recovered key
plain_text = decrypt(cipher_text, key)

The plaintext obtained is:

THEREWHEREITISWEDONOTNEEDTHEWALLHEISALLPINEANDIAMAPPLEORCH
ARDMYAPPLETREESWILLNEVERGETACROSSANDEATTHECONESUNDERHISPIN
ESITELLHIMHEONLYSAYSGOODFENCESMAKEGOODNEIGHBORSSPRINGISTHE
MISCHIEFINMEANDIWONDERIFICOULDPUTANOTIONINHISHEADWHYDOTHEY
MAKEGOODNEIGHBORSISNTITWHERETHEREARECOWSBUTHERETHEREARENOC
OWSBEFOREIBUILTAWALLIDASKTOKNOWWHATIWASWALLINGINORWALLINGO
UTANDTOWHOMIWASLIKETOGIVEOFFENSESOMETHINGTHEREISTHATDOESNT
LOVEAWALLTHATWANTSITDOWNICOULDSAYELVESTOHIMBUTITSNOTELVESE
XACTLYANDIDRATHERHESAIDITFORHIMSELFISEEHIMTHEREBRINGINGAST
ONEGRASPEDFIRMLYBYTHETOPINEACHHANDLIKEANOLDSTONESAVAGEARME
DHEMOVESINDARKNESSASITSEEMSTOMENOTOFWOODSONLYANDTHESHADEOF
TREESHEWILLNOTGOBEHINDHISFATHERSSAYINGANDHELIKESHAVIN

THOUGHTOFITSOWELLHESAYSAGAINGOODFENCESMAKEGOODNEIGHBORS