# Cipher Decryption
* This project aims to build a decipherer that discovers the correct mapping `<code_letter>:<real_letter>` from secret messages.
* The solution is based on the concept of Genetic Algorithms. Each decipherer is essentially a dictionary that represents a hypothetical mapping of letters.
* Each offspring inherits some of the mappings in its parent's dictionary, but with a slight modification (mutation).

In [1]:
from re import sub
from typing import List

def load_text(filename:str, base_dir:str='/kaggle/input/cipher')->List[str]:
    '''
        Reads a .txt file and splits it into paragraphs.
        
        Chapter headings of the form "CHAPTER <number>. <title>" (optionally preceded
        by a byte-order mark or a newline) are removed before splitting.
        
        Parameters
        ---------
        `filename`: str
            The name of the text file, relative to `base_dir`.
        `base_dir`: str
            Directory that holds the file. Defaults to the Kaggle input folder
            used by this notebook.
            
        Returns
        -------
        A list with the content of each non-empty paragraph (blocks separated by a blank line).
    '''
    # Raw string: the regex escapes (\d, \w, \., \-) must reach `re` untouched
    # instead of being treated as (invalid) Python string escapes.
    chapter_heading = r"(\ufeff|\n)?CHAPTER \d+\.[\w'`,;\. \-]*"
    parag_delim = '\n\n'
    # The text contains a BOM and non-ASCII punctuation, so decode explicitly
    # rather than relying on the platform's default encoding.
    with open(f'{base_dir}/{filename}', 'r', encoding='utf-8') as f:
        content = f.read()
    paragraphs = sub(chapter_heading, '', content).split(parag_delim)
    # Consecutive blank lines produce empty chunks; drop them.
    return [paragraph for paragraph in paragraphs if paragraph]
    
# Load the Moby Dick text as a list of paragraphs and echo it as the cell's output.
corpus_model = load_text('moby_dick.txt')
corpus_model

['Call me Ishmael. Some years ago—never mind how long precisely—having\nlittle or no money in my purse, and nothing particular to interest me\non shore, I thought I would sail about a little and see the watery part\nof the world. It is a way I have of driving off the spleen and\nregulating the circulation. Whenever I find myself growing grim about\nthe mouth; whenever it is a damp, drizzly November in my soul; whenever\nI find myself involuntarily pausing before coffin warehouses, and\nbringing up the rear of every funeral I meet; and especially whenever\nmy hypos get such an upper hand of me, that it requires a strong moral\nprinciple to prevent me from deliberately stepping into the street, and\nmethodically knocking people’s hats off—then, I account it high time to\nget to sea as soon as I can. This is my substitute for pistol and ball.\nWith a philosophical flourish Cato throws himself upon his sword; I\nquietly take to the ship. There is nothing surprising in this. If they\nbut kn

In [2]:
import string
from re import sub
from nltk import word_tokenize
from typing import List

def remove_punctuation(s:str)->str:
    '''
        Removes punctuation from a string. Also normalizes it to lower case and removes
        any whitespace from the string's borders.
        
        Besides ASCII punctuation, the em dash, the curly quotes (“ ” ‘ ’) and
        newline characters are deleted. Note that deleting a newline joins the
        words on both sides of it.
        
        Parameter
        ---------
        s: `str`
            The provided string.
        
        Returns
        -------
        The treated string.
    '''
    # Both single curly quotes are listed: ’ is the apostrophe in words such as "people’s".
    translation_table = str.maketrans('', '', string.punctuation + '—“”‘’\n')
    # Strip last, so whitespace exposed by deleting border punctuation is removed too.
    return s.lower().translate(translation_table).strip()

def treat(s:str)->str:
    '''
        Normalizes a text down to a continuous run of characters.
        
        The text goes through `remove_punctuation` and then has every space
        character (' ') removed.
        
        Parameter
        ---------
        s: `str`
            The provided string.

        Returns
        -------
        The treated string.
    '''
    return remove_punctuation(s).replace(' ', '')

# Normalize every paragraph of the corpus with `treat`.
corpus_model = [treat(paragraph) for paragraph in corpus_model]

In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from typing import List, Tuple

class MarkovModel:
    '''
        First-order Markov Model over tokens, with Add-Epsilon Smoothing.
        
        `fit` must be called before any of the `predict_*` methods.

        Parameters
        ---------
        `corpus`: List[str]
            List with the documents to be used.
        `epsilon`: float
            Smoothing degree of the probabilities. With `epsilon` equal to 0, rows
            without any observed transition cannot be normalized.
        `name`: str
            A name for your model.
            
        Methods
        ------
        `fit`: Generates the model's A and pi.
        `predict_log_proba`: Estimates the probability's log of a given sequence.
        `predict_proba`: Estimates the probability of a given sequence.
        `predict_log_proba_author`: Adds the log of the model's prior to the sequence's log probability.
        
        
        Attributes
        ----------
        `a`: `pd.DataFrame`
            The model's A matrix. Rows are initial states, columns are target states
            and every row sums to 1.
        `_a`: Dict[str, Dict[str, int]]
            A Dictionary mapping the number of occurences a given state transition happened,
            keyed first by target state and then by initial state.
        `pi`: `pd.Series`
            The model's pi vector. Its entries sum to 1.
        `_pi`: Dict[str, int]
            A dictionary informing the amount of times a given token started a document.
        `_vocab`: Set[str]
            A set object with all the corpus's vocabulary.
        `_a_vocab`: Set[str]
            The tokens that occur at any position other than the first one of a document.
    '''
    def __init__(self, corpus:List[str], epsilon:float, name:str):
        self.corpus = self.split_corpus(corpus)
        self.corpus_length = len(self.corpus)
        self.epsilon = epsilon
        self.name = name

    @staticmethod
    def split_corpus(corpus:List[str])->List[List[str]]:
        '''
            Tokenizes the corpus' documents.
            
            Parameter
            ---------
            `corpus`: List[str]
                A list with each of the corpus' documents.
                
            Returns
            -------
            A list with the lower-cased tokens of each document.
        '''
        return [word_tokenize(document.lower()) for document in corpus]
    
    def __vocab(self)->None:
        '''
            Extraction of all the corpus tokens.
            
            Builds a set with all training tokens (`_vocab`) and another one with the tokens
            found at any position but the first of a document (`_a_vocab`), i.e. the
            possible target states of a transition.
        '''
        vocab, a_vocab = set(), set()
        
        for doc in self.corpus:
            vocab.update(doc)
            a_vocab.update(doc[1:]) # Not including the first tokens.
            
        self._vocab, self._a_vocab = vocab, a_vocab
    
    def __check_pi(self, token:str)->str:
        '''
            Masks a sentence's first token with '<UNKNOWN>' mark if it is not included in the training set.
            
            Parameter
            ---------
            `token`: str
                The sentence's first token under scrutiny
            
            Returns
            -------
            The treated token.
        '''
        return token if token in self._pi else '<UNKNOWN>'
    
    def __check_a(self, token1:str, token2:str)->Tuple[str, str]:
        '''
            When querying the model's A matrix, checks whether the provided initial and target states are present. If not,
            the tokens are masked with the flag '<UNKNOWN>'.
            
            Parameters
            ----------
            `token1`: str
                The initial state.
            `token2`: str
                The target state.
            
            Returns
            -------
            The treated tokens inside a tuple.
        '''
        token1 = token1 if token1 in self.a.index else '<UNKNOWN>'
        token2 = token2 if token2 in self.a.columns else '<UNKNOWN>'
        return token1, token2
    
    def __pi(self):
        '''
            Measures the model's pi vector: the smoothed probability of each token
            being the first one of a document.
        '''
        starts = {}
        
        for doc in self.corpus:
            if not doc:
                continue # A document without tokens has no first token to count.
            first = doc[0]
            starts[first] = starts.get(first, 0) + 1
        
        n_docs = sum(starts.values())
        starts['<UNKNOWN>'] = 0 # Defining a key for possible tokens of the test set that were unseen during training.
        self._pi = starts
        
        # Epsilon is added to every entry of pi, so the denominator must grow by
        # epsilon times the number of entries for the vector to sum to 1.
        self.pi = (pd.Series(self._pi)+self.epsilon) / (n_docs+self.epsilon*len(self._pi))
        
    def __a(self):
        '''
            Measures the model's A matrix: the smoothed probability of moving from an
            initial state (row) to a target state (column).
        '''
        counts = {target:{} for target in self._a_vocab}
        for doc in self.corpus:
            for source, target in zip(doc, doc[1:]):
                seen = counts[target]
                seen[source] = seen.get(source, 0) + 1
        counts['<UNKNOWN>'] = {'<UNKNOWN>':0} # Row and column for states unseen during training.
        self._a = counts
        
        # Outer keys become columns (target states), inner keys become rows (initial states).
        a = pd.DataFrame(self._a).fillna(0)
        num = (a+self.epsilon)
        # Epsilon is added to every column of a row, so each row's denominator
        # grows by epsilon times the number of columns.
        denom = a.sum(axis=1, skipna=True)+a.shape[1]*self.epsilon
        self.a =  num.div(denom, axis=0) 
        

    def fit(self):
        '''
            Fits the algorithm to the provided corpus.
            
            Returns
            -------
            The fitted model itself.
        '''
        self.__vocab()
        self.__a()
        self.__pi()
        return self
    
    def predict_log_proba(self, text:str)->float:
        '''
            Estimates the probability's log of a given sequence.
            
            Tokens unseen during training are scored through the '<UNKNOWN>' entries.
            
            Parameter
            ---------
            `text`: str
                The text whose probability needs to be computed.
            
            Returns
            -------
            The sequence's log probability.
            
            Raises
            ------
            IndexError
                If the tokenization of `text` yields no tokens.
        '''
        tokens = word_tokenize(text.lower())
        log_pi = np.log(self.pi[self.__check_pi(tokens[0])])
        # One (initial, target) lookup in A per pair of consecutive tokens.
        log_a = np.log([self.a.loc[self.__check_a(source, target)] for source, target in zip(tokens, tokens[1:])])
        return log_pi + np.sum(log_a)
    
    def predict_proba(self, text:str)->float:
        '''
            Estimates the probability of a given sequence.
            
            *Note:* There is a risk of the output to be 0 for long sequences.
            
            Parameter
            ---------
            `text`: str
                The text whose probability needs to be computed.
            
            Returns
            -------
            The sequence's probability.
        '''
        return np.exp(self.predict_log_proba(text))
    
    def predict_log_proba_author(self, text:str)->float:
        '''
            Scores how likely a given text was written by the model's author: the text's
            log probability plus the log of `probas[self.name]`.
            
            Parameter
            ---------
            `text`: str
                The text under scrutiny.
            
            Returns
            -------
            The computed log score.
        '''
        # NOTE(review): `probas` is a module-level global that is not defined in this part
        # of the file; presumably it maps each model name to its prior probability - confirm.
        global probas
        return self.predict_log_proba(text) + np.log(probas[self.name])

<p style='color:red'> I kept only the characters of the documents. Review the model class and apply it to the project corpus.</p>