In [1]:
import numpy as np
from unidecode import unidecode

def transliterate_text(text: str) -> str:
    f_str = unidecode(text)
    return "".join([ch for ch in f_str.lower() if ch.isalpha()])

def read_text_file(fileName: str) -> str:
    f = open(fileName, "r", encoding="utf-8")
    f_str = f.read()
    f.close()
    return transliterate_text(f_str)

def num_to_bit_indices(num: int, shifts=8) -> tuple:
    return tuple(bit for bit in range(shifts) if num & (1 << bit))

def gen_unique_bit_marks(mark_count, bit_count=8, active_bits=4) -> list:
    
    def gen_mark() -> tuple:
        mark = np.random.choice(bit_count, active_bits, replace=False)
        return tuple(sorted(mark))
    
    marks = set()
    for i in range(mark_count):
        brake = 1000
        while True:
            mark = gen_mark()
            if mark not in marks:
                marks.add(mark)
                break
            else:
                brake -= 1
                if brake == 0:
                    raise Exception("attempt limit overflow")
    return list(marks)


def letter_to_note(letter):
    return ord(letter.lower()) - ord('a')

def phrase_transpositions(phrase: list, pos: int, notation_count: int) -> list: 
    return [[(phrase[note], (i + shift) % notation_count) 
             for note, i in enumerate(range(pos, pos + len(phrase)))]
            for shift in range(notation_count)]

def text_to_noted_phrases(text: str, phrase_len: int, notation_count: int) -> list:
    note_phrases = []
    for i in range(0, len(text) - phrase_len):
        phrase = [letter_to_note(letter) for letter in text[i:i + phrase_len]]
        note_phrases.extend(phrase_transpositions(phrase, pos=i, notation_count=notation_count))
    return note_phrases

## Test functions

In [9]:
text = 'ucheniteotkrilicherazlikatasedlzhinamikrobitekoitosekriiatvnaskhorataspodeliatsamoedniisshchimikrobiavsichkiostanalisastrogoindividualni'
noted_phrases = text_to_noted_phrases(text, phrase_len=5, notation_count=5)
print(len(text), len(noted_phrases))
noted_phrases[:20]

136 655


[[(20, 0), (2, 1), (7, 2), (4, 3), (13, 4)],
 [(20, 1), (2, 2), (7, 3), (4, 4), (13, 0)],
 [(20, 2), (2, 3), (7, 4), (4, 0), (13, 1)],
 [(20, 3), (2, 4), (7, 0), (4, 1), (13, 2)],
 [(20, 4), (2, 0), (7, 1), (4, 2), (13, 3)],
 [(2, 1), (7, 2), (4, 3), (13, 4), (8, 0)],
 [(2, 2), (7, 3), (4, 4), (13, 0), (8, 1)],
 [(2, 3), (7, 4), (4, 0), (13, 1), (8, 2)],
 [(2, 4), (7, 0), (4, 1), (13, 2), (8, 3)],
 [(2, 0), (7, 1), (4, 2), (13, 3), (8, 4)],
 [(7, 2), (4, 3), (13, 4), (8, 0), (19, 1)],
 [(7, 3), (4, 4), (13, 0), (8, 1), (19, 2)],
 [(7, 4), (4, 0), (13, 1), (8, 2), (19, 3)],
 [(7, 0), (4, 1), (13, 2), (8, 3), (19, 4)],
 [(7, 1), (4, 2), (13, 3), (8, 4), (19, 0)],
 [(4, 3), (13, 4), (8, 0), (19, 1), (4, 2)],
 [(4, 4), (13, 0), (8, 1), (19, 2), (4, 3)],
 [(4, 0), (13, 1), (8, 2), (19, 3), (4, 4)],
 [(4, 1), (13, 2), (8, 3), (19, 4), (4, 0)],
 [(4, 2), (13, 3), (8, 4), (19, 0), (4, 1)]]

In [53]:
label = (0, 1)
labeled_note_phrases = [(label, noted_phrase) for noted_phrase in noted_phrases]
labeled_note_phrases[:20]

[((0, 1), [(20, 0), (2, 1), (7, 2), (4, 3), (13, 4)]),
 ((0, 1), [(20, 1), (2, 2), (7, 3), (4, 4), (13, 5)]),
 ((0, 1), [(20, 2), (2, 3), (7, 4), (4, 5), (13, 6)]),
 ((0, 1), [(20, 3), (2, 4), (7, 5), (4, 6), (13, 7)]),
 ((0, 1), [(20, 4), (2, 5), (7, 6), (4, 7), (13, 8)]),
 ((0, 1), [(20, 5), (2, 6), (7, 7), (4, 8), (13, 9)]),
 ((0, 1), [(20, 6), (2, 7), (7, 8), (4, 9), (13, 0)]),
 ((0, 1), [(20, 7), (2, 8), (7, 9), (4, 0), (13, 1)]),
 ((0, 1), [(20, 8), (2, 9), (7, 0), (4, 1), (13, 2)]),
 ((0, 1), [(20, 9), (2, 0), (7, 1), (4, 2), (13, 3)]),
 ((0, 1), [(2, 1), (7, 2), (4, 3), (13, 4), (8, 5)]),
 ((0, 1), [(2, 2), (7, 3), (4, 4), (13, 5), (8, 6)]),
 ((0, 1), [(2, 3), (7, 4), (4, 5), (13, 6), (8, 7)]),
 ((0, 1), [(2, 4), (7, 5), (4, 6), (13, 7), (8, 8)]),
 ((0, 1), [(2, 5), (7, 6), (4, 7), (13, 8), (8, 9)]),
 ((0, 1), [(2, 6), (7, 7), (4, 8), (13, 9), (8, 0)]),
 ((0, 1), [(2, 7), (7, 8), (4, 9), (13, 0), (8, 1)]),
 ((0, 1), [(2, 8), (7, 9), (4, 0), (13, 1), (8, 2)]),
 ((0, 1), [(2, 9),

## Make phrase_base

In [24]:
def load_text_base() -> dict:
    texts_dir = './data/texts/'
    file_names = {
        'bel': 'text_bel.txt',
        'blg': 'text_blg.txt',
        'eng': 'text_eng.txt',
        'epo': 'text_epo.txt',
        'jbo': 'text_jbo.txt',
        'pol': 'text_pol.txt',
        'rus': 'text_rus.txt',
        'ukr': 'text_ukr.txt'
    }
    text_base = {}
    for key, file_name in file_names.items():
        text_base[key] = read_text_file(texts_dir + file_name)
    return text_base 
    

In [25]:
text_base = load_text_base()

In [33]:
phrase_base = {}
marks_dict = {}
# bit_marks = gen_unique_bit_marks(mark_count=len(text_base.keys())) # random cross-bit code
bit_marks = [(i,) for i in range(len(text_base.keys()))] # one-shot code
for i, key in enumerate(text_base.keys()):
    bit_key = bit_marks[i]
    marks_dict[bit_key] = key
    noted_phrases = text_to_noted_phrases(text_base[key], phrase_len=5, notation_count=5)
    phrase_base[bit_key] = noted_phrases
    print(key, bit_key)

bel (0,)
blg (1,)
eng (2,)
epo (3,)
jbo (4,)
pol (5,)
rus (6,)
ukr (7,)


### Some measures

In [36]:
phrase_base[bit_marks[1]].__sizeof__()

1428208

In [32]:
marks_dict

NameError: name 'marks_dict' is not defined

In [31]:
len(text_base['ukr'])

25223

In [80]:
sum(len(phrase_base[key]) for key in phrase_base.keys())

1958320

### Load-Save

In [6]:
import pickle

def save_phrase_base(file_name: str, phrase_base: dict, marks: dict):
    with open(file_name, 'wb') as f:
        data = {
            'marks': marks, 
            'phrase_base': phrase_base
        }
        pickle.dump(data, f)

def load_phrase_base(file_name: str) -> (dict, list):
    with open(file_name, 'rb') as f:
        data = pickle.load(f)
        phrase_base = data.get('phrase_base', {})
        marks = data.get('marks', {})
        return phrase_base, marks

In [7]:
save_phrase_base('./data/texts/phrase_base.pickle', phrase_base, marks_dict)

In [8]:
phrase_base_loaded, marks_loaded = load_phrase_base('./data/texts/phrase_base.pickle')

In [9]:
print(marks_loaded)
list(phrase_base_loaded.keys())

{(0,): 'bel', (1,): 'blg', (2,): 'eng', (3,): 'epo', (4,): 'jbo', (5,): 'pol', (6,): 'rus', (7,): 'ukr'}


[(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,)]

## Make test phrase list

In [2]:
import pickle

def save_noted_phrases(file_name: str, noted_phrases: list):
    with open(file_name, 'wb') as f:
        pickle.dump(noted_phrases, f)

def load_noted_phrases(file_name: str) -> list:
    with open(file_name, 'rb') as f:
        noted_phrases = pickle.load(f)
        return noted_phrases

In [19]:
tst_text = read_text_file('./data/texts/tst_bel.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_bel.pickle', tst_noted_phrases)

In [17]:
tst_text = read_text_file('./data/texts/tst_rus.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_rus.pickle', tst_noted_phrases)

In [21]:
tst_text = read_text_file('./data/texts/tst_eng.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_eng.pickle', tst_noted_phrases)

In [3]:
tst_text = read_text_file('./data/texts/tst_ukr.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_ukr.pickle', tst_noted_phrases)

In [6]:
tst_text = read_text_file('./data/texts/tst_blg.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_blg.pickle', tst_noted_phrases)

In [8]:
tst_text = read_text_file('./data/texts/tst_jbo.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_jbo.pickle', tst_noted_phrases)

In [10]:
tst_text = read_text_file('./data/texts/tst_pol.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_pol.pickle', tst_noted_phrases)

In [12]:
tst_text = read_text_file('./data/texts/tst_epo.txt')
tst_noted_phrases = text_to_noted_phrases(tst_text, phrase_len=5, notation_count=5)
save_noted_phrases('./data/texts/tst_epo.pickle', tst_noted_phrases)

In [13]:
len(tst_noted_phrases)

3210