In [12]:
import collections
import string
import typing as t

In [13]:
# Ciphertext to analyse, copied from https://docs.google.com/document/d/1BBCE0I0ylzc9sbcaD83qk2jQphFUQ1zoWH8Z9XGU5YQ/edit#
cipher = """BI FQMOM FHI

MC BADI EI JCJF DIDI GCLF.
HCF NMPF AHIBA IOMJI EI DCGITJI.
HCF QFDC AHIBA DMNMAJI EI NMEARCJI.
PMHABC EA LFTGM JAEM HCQAKA QC DAGA DMJA BI GCLF RIEA DFF KIDA RIEA LAEFM.
HCF BFGC HMOIDM DFI OMEAA IQIKIU.
HCF LCGI OIU OFLAJCV BI RMHMSI ASAJI EI GAHIJAJI.
PIHCT EA
GAHIJA OMNMSI QAF DMJA DMSFTI. IKM HCPISM MC OIU IGI DIDF GASFJF EFNM NCRI. FMM DIDI IGI HCM FNMF EI HCM DFMFOF BI SCOC DMJA BI GCLF. MC SAQM NFMEI EI EMDM LCCQI. MC DAGCKM BF QFDC EAAKM DIDI IGI IKM MC SAQM NCOFC BI FQMOM FHI. AGF KMPCEA DIDI HMI ECOM OIU BMQM DMJA NCBF GAHOJAJI.

CQC KIDA RIEA KCNC. DFI HCF BI PMKI DMJA BIKM QAKF QIBIRI BF BI IJCKIDA BI MCQI LCGI BI SFTJIQF ADCUKMJI DMJA BI IGIC EI BI LMRA BI GILCQIU DMLF BI NFQA GFSCJI DMJA BI EISI EMLF BI FCEC GFCSC RMKC DMJA BI LAUI EMLF BI KCHFOA AFJI DMJA BI TCUIQ EMLF BI ABAJI DMJA BI IOCSMQ EMLF BI PMKI DIDF OAVSI NFDM BI JFGM DMJA BI FVBCGC EMLF BI OAOM HCDCTHC DMJA LCVBMT.
PIHCT LI ITMTC BI HFUQIJI DMJA NCBA GAHIJAJI QMKC OIU LMOMNI QAF OIU LMOMNI HFUQIRC MHF COF ANA LASF DMJA BIKM QC HFDI JABM DIITPM JAEMNI EI JFQMKC HIPI. DFI IJA PIKIT NCRI BI LCRIJI DFEFPI PFEARF EI FI QC CDC BI LOJCDFJI DCBFRC EFNM EMLF JCKFEA QC EFNM BI QA HFDI DMJA KCPFJI. IKM HCF NCRI BI OIJA DMJA OFKMMJI NFMEI NMIDF CQC RFIC ECOM QC BA PFEARF."""

In [24]:
def pairs(word: t.Iterable[str], joiner="") -> t.List[str]:
    """Extract digrams (pairs of consecutive items) from a sequence of strings.

    Given a word, the items are its letters and the result is its letter
    digrams; given a list of words with ``joiner=" "``, the result is the
    adjacent word pairs.

    Args:
        word: Any iterable of strings (a ``str``, a list of words, or a
            one-shot iterator).
        joiner: Separator placed between the two items of each pair.

    Returns:
        The joined pairs in order of appearance. Inputs with fewer than two
        items produce an empty list.
    """
    # Materialise once: two iter() calls on a one-shot iterator would share
    # state and skip items, and a bare next() on an empty input would raise
    # StopIteration (a RuntimeError when called from inside a generator).
    items = list(word)
    return [joiner.join(pair) for pair in zip(items, items[1:])]

In [15]:

def words(text: str):
    """Split a text into upper-case words, cleaning it first (a naive approach).

    The text is upper-cased, every character that is not an ASCII upper-case
    letter is replaced by a space, and the result is split on whitespace.
    """
    allowed = frozenset(string.ascii_uppercase)
    cleaned = []
    for ch in text.upper():
        # Anything outside A-Z acts as a word separator.
        cleaned.append(ch if ch in allowed else " ")
    return "".join(cleaned).split()

In [29]:
from collections import Counter
def most_frequent_digrams(digrams: t.Iterable[str], freq=True) -> t.Dict[str, float]:
    """Count digrams and return them ordered from most to least common.

    Args:
        digrams: Iterable of digram strings to tally.
        freq: When true (the default), values are relative frequencies
            (0 < x <= 1, summing to 1). When false, values are raw counts.

    Returns:
        A dict mapping each distinct digram to its frequency or count, in
        descending order of occurrence. Empty input gives an empty dict.
    """
    counts = Counter(digrams)
    common = dict(counts.most_common())
    if not freq:
        return common

    total = sum(counts.values())
    # Iterate over items(): iterating the dict itself yields only the keys,
    # which would then be unpacked character by character.
    return {digram: count / total for digram, count in common.items()}

In [17]:
# I'm assuming the style of Traditions matches the style of the Bible, as a seemingly sacred text.
# The file's contents are run through words(), so `bible` is a flat list of upper-case words.
with open("genesis.txt") as f:
    bible = words(f.read())

In [18]:
def _surroundings(word, words, n=1):
    for i, w in enumerate(words):
        if w == word:
            yield words[i-n:i+n+1]
            
def surroundings(*a):
    """Eager form of ``_surroundings``: gather every yielded window into a list.

    All positional arguments are forwarded unchanged.
    """
    return [window for window in _surroundings(*a)]

In [19]:
# Letter-digram frequencies within the words of the ciphertext.
most_frequent_digrams(dg for w in words(cipher) for dg in pairs(w))

{'BI': 0.04417670682730924,
 'JA': 0.03346720214190094,
 'DM': 0.029451137884872823,
 'MJ': 0.025435073627844713,
 'JI': 0.024096385542168676,
 'DI': 0.018741633199464525,
 'HC': 0.018741633199464525,
 'EI': 0.01606425702811245,
 'EA': 0.01606425702811245,
 'ID': 0.014725568942436412,
 'LF': 0.014725568942436412,
 'OM': 0.013386880856760375,
 'DF': 0.013386880856760375,
 'KM': 0.013386880856760375,
 'NM': 0.012048192771084338,
 'EM': 0.012048192771084338,
 'HI': 0.0107095046854083,
 'GI': 0.0107095046854083,
 'RI': 0.0107095046854083,
 'IJ': 0.0107095046854083,
 'QM': 0.009370816599732263,
 'DC': 0.009370816599732263,
 'AJ': 0.009370816599732263,
 'QC': 0.009370816599732263,
 'KI': 0.009370816599732263,
 'OI': 0.009370816599732263,
 'QI': 0.009370816599732263,
 'IK': 0.009370816599732263,
 'IU': 0.009370816599732263,
 'NC': 0.009370816599732263,
 'ML': 0.009370816599732263,
 'MC': 0.008032128514056224,
 'BA': 0.008032128514056224,
 'GC': 0.008032128514056224,
 'CF': 0.00803212851405622

In [20]:
# The same letter-digram frequencies for the Genesis word list, for comparison.
most_frequent_digrams(dg for w in bible for dg in pairs(w))

{'TH': 0.05882095228081244,
 'HE': 0.05257347142631828,
 'AN': 0.043592170057655574,
 'ND': 0.038439969857876385,
 'ER': 0.020617563044354484,
 'HA': 0.017279147608783275,
 'RE': 0.016928657799274487,
 'IN': 0.01567565673028057,
 'HI': 0.01557050978742793,
 'TO': 0.013520144401801517,
 'AT': 0.013029458668489214,
 'OF': 0.012696493349455865,
 'IS': 0.012346003539947076,
 'EN': 0.012118185163766363,
 'ED': 0.011750170863782135,
 'SE': 0.011355869828084748,
 'NT': 0.011215673904281234,
 'OU': 0.011180624923330355,
 'AR': 0.010479645304312777,
 'OR': 0.010304400399558383,
 'ME': 0.009533322818639047,
 'VE': 0.009366840159122373,
 'ON': 0.009349315668646933,
 'LL': 0.009244168725794297,
 'EA': 0.008797294218670593,
 'SA': 0.008648336049629357,
 'AL': 0.008499377880588122,
 'BE': 0.008332895221071447,
 'IT': 0.00823651052345653,
 'ES': 0.008140125825841614,
 'UN': 0.008017454392513538,
 'AM': 0.007815922752045984,
 'LE': 0.007465432942537196,
 'AS': 0.007404097225873158,
 'SH': 0.0073690482

In [21]:
# For every cipher word that occurs more than once, print its count followed
# by the neighbouring words around each occurrence.
cipher_words = words(cipher)
nl = "\n"  # not referenced anywhere in this cell
for k, v in collections.Counter(cipher_words).most_common():
    # most_common() is ordered by descending count, so the first count of 1
    # means every remaining word is a singleton.
    if v == 1:
        break
    print(f"{k}: {v}")
    for e in surroundings(k, cipher_words):
        print(f"    {' '.join(e)}")

BI: 30
    
    DMJA BI GCLF
    OFLAJCV BI RMHMSI
    DFMFOF BI SCOC
    DMJA BI GCLF
    NCOFC BI FQMOM
    HCF BI PMKI
    BF BI IJCKIDA
    IJCKIDA BI MCQI
    LCGI BI SFTJIQF
    DMJA BI IGIC
    EI BI LMRA
    LMRA BI GILCQIU
    DMLF BI NFQA
    DMJA BI EISI
    EMLF BI FCEC
    DMJA BI LAUI
    EMLF BI KCHFOA
    DMJA BI TCUIQ
    EMLF BI ABAJI
    DMJA BI IOCSMQ
    EMLF BI PMKI
    NFDM BI JFGM
    DMJA BI FVBCGC
    EMLF BI OAOM
    ITMTC BI HFUQIJI
    NCRI BI LCRIJI
    CDC BI LOJCDFJI
    EFNM BI QA
    NCRI BI OIJA
DMJA: 16
    DAGA DMJA BI
    QAF DMJA DMSFTI
    SCOC DMJA BI
    BMQM DMJA NCBF
    PMKI DMJA BIKM
    ADCUKMJI DMJA BI
    GFSCJI DMJA BI
    RMKC DMJA BI
    AFJI DMJA BI
    ABAJI DMJA BI
    JFGM DMJA BI
    HCDCTHC DMJA LCVBMT
    HFUQIJI DMJA NCBA
    LASF DMJA BIKM
    HFDI DMJA KCPFJI
    OIJA DMJA OFKMMJI
EI: 9
    BADI EI JCJF
    IOMJI EI DCGITJI
    DMNMAJI EI NMEARCJI
    ASAJI EI GAHIJAJI
    FNMF EI HCM
    NFMEI EI EMDM
    IGIC EI BI
    JAE

In [30]:
# Word-level bigrams: pair up adjacent cipher words and report raw counts (freq=False).
most_frequent_digrams(pairs(words(cipher), joiner=" "), freq=False)

{'DMJA BI': 8,
 'EMLF BI': 5,
 'BI FQMOM': 2,
 'FQMOM FHI': 2,
 'BI GCLF': 2,
 'KIDA RIEA': 2,
 'DIDI IGI': 2,
 'MC SAQM': 2,
 'BI PMKI': 2,
 'DMJA BIKM': 2,
 'OIU LMOMNI': 2,
 'NCRI BI': 2,
 'FHI MC': 1,
 'MC BADI': 1,
 'BADI EI': 1,
 'EI JCJF': 1,
 'JCJF DIDI': 1,
 'DIDI GCLF': 1,
 'GCLF HCF': 1,
 'HCF NMPF': 1,
 'NMPF AHIBA': 1,
 'AHIBA IOMJI': 1,
 'IOMJI EI': 1,
 'EI DCGITJI': 1,
 'DCGITJI HCF': 1,
 'HCF QFDC': 1,
 'QFDC AHIBA': 1,
 'AHIBA DMNMAJI': 1,
 'DMNMAJI EI': 1,
 'EI NMEARCJI': 1,
 'NMEARCJI PMHABC': 1,
 'PMHABC EA': 1,
 'EA LFTGM': 1,
 'LFTGM JAEM': 1,
 'JAEM HCQAKA': 1,
 'HCQAKA QC': 1,
 'QC DAGA': 1,
 'DAGA DMJA': 1,
 'GCLF RIEA': 1,
 'RIEA DFF': 1,
 'DFF KIDA': 1,
 'RIEA LAEFM': 1,
 'LAEFM HCF': 1,
 'HCF BFGC': 1,
 'BFGC HMOIDM': 1,
 'HMOIDM DFI': 1,
 'DFI OMEAA': 1,
 'OMEAA IQIKIU': 1,
 'IQIKIU HCF': 1,
 'HCF LCGI': 1,
 'LCGI OIU': 1,
 'OIU OFLAJCV': 1,
 'OFLAJCV BI': 1,
 'BI RMHMSI': 1,
 'RMHMSI ASAJI': 1,
 'ASAJI EI': 1,
 'EI GAHIJAJI': 1,
 'GAHIJAJI PIHCT': 1,
 'PIH