## Proof of Concept
- Goal: ONLY generate a regular expression that matches one particular string

In [1]:
%config IPCompleter.greedy=True

In [2]:
import warnings

# Install a global filter that ignores every warning raised from here on.
warnings.filterwarnings('ignore')

In [3]:
import re
import numpy as np

# Print the version strings reported by the two imported modules.
print('re:', re.__version__)
print('numpy:', np.__version__)

re: 2.2.1
numpy: 1.16.3


In [4]:
# Target text; evaluate() tests gene i against character i of this string.
expected_match = 'backstreets back   11:05:20   alright'
length_of_expected_match = len(expected_match)

# Gene alphabet: every binary code maps to one single-character regex class.
binary_to_regex_dict = dict([
    ('00', r'\s'),
    ('01', r'\d'),
    ('10', r'[a-z]'),
    ('11', r'[:]'),
])

# Width, in characters, of one encoded gene.
expected_binary_length = 2

# Genes are drawn as integers from the half-open range
# [binary_start, binary_end) before being rendered in binary.
binary_start, binary_end = 0, 4

In [5]:
def get_random_gene():
    """Draw one random gene as a fixed-width binary string.

    An integer is sampled with ``np.random.randint(binary_start, binary_end)``
    and rendered in base 2, left-padded with zeros to
    ``expected_binary_length`` characters.

    Returns:
        str: the encoded gene; with the notebook's current settings one of
        ``'00'``, ``'01'``, ``'10'`` or ``'11'``.
    """
    integer = np.random.randint(binary_start, binary_end)
    # zfill pads to the full gene width. Prefixing a single literal '0' only
    # yields the right width for lengths up to 2, so a wider alphabet would
    # produce keys that are too short.
    padded = "{0:b}".format(integer).zfill(expected_binary_length)
    # Keep only the low-order characters if the value is wider than a gene.
    return padded[-expected_binary_length:]

def random_individual(length):
    """Build a random individual made of ``length`` genes.

    Args:
        length: number of genes to draw.

    Returns:
        numpy.ndarray: array holding the results of ``length`` successive
        ``get_random_gene()`` calls, in draw order.
    """
    genes = []
    for _ in range(length):
        genes.append(get_random_gene())
    return np.array(genes)

def transform_to_regex(P):
    """Translate every gene of ``P`` into its regex fragment.

    Args:
        P: iterable of genes; each gene is joined into a single string and
            looked up in ``binary_to_regex_dict``.

    Returns:
        list: one regex fragment per gene, in the same order as ``P``.

    Raises:
        KeyError: if a joined gene is not a key of ``binary_to_regex_dict``.
    """
    fragments = []
    for row in P:
        key = ''.join(row)
        fragments.append(binary_to_regex_dict[key])
    return fragments

def evaluate(P):
    """Score an individual against ``expected_match``.

    Gene ``i`` is decoded to a regex fragment and tested with ``re.match``
    against character ``i`` of ``expected_match``.

    Args:
        P: sequence of genes accepted by ``transform_to_regex``.

    Returns:
        The number of positions whose fragment matches, divided by
        ``length_of_expected_match`` (a numpy float).

    Raises:
        IndexError: if ``P`` has more genes than ``expected_match`` has
            characters.
    """
    patterns = transform_to_regex(P)
    # `is not None` is the idiomatic singleton check; re.match returns None
    # when the fragment does not match.
    hits = [
        re.match(pattern, expected_match[position]) is not None
        for position, pattern in enumerate(patterns)
    ]
    # Summing with an integer dtype counts the True entries directly and keeps
    # the result a numpy scalar, as before.
    return np.sum(hits, dtype=int) / length_of_expected_match

def compress_expression(items):
    """Collapse runs of identical regex fragments into ``fragment+``.

    Args:
        items: iterable of regex fragments (strings), for example the list
            returned by ``transform_to_regex``.

    Returns:
        str: the fragments concatenated in order, with every run of two or
        more equal consecutive fragments written once and followed by ``+``.
        An empty input gives ``''``.
    """
    from itertools import groupby

    # Compress the fragments that were actually passed in. Iterating the
    # notebook-global `individual` here would ignore the argument and make
    # the function depend on hidden kernel state.
    compressed_expression = ''
    for fragment, run in groupby(items):
        compressed_expression += fragment
        # A run longer than one means the fragment repeats back to back.
        if sum(1 for _ in run) > 1:
            compressed_expression += '+'
    return compressed_expression

In [6]:
# Draw the starting individual: one random gene per character of the target.
individual = random_individual(length_of_expected_match)

# Decode and score it up front, then print genes, regex and fitness.
initial_regex = '/' + compress_expression(transform_to_regex(individual)) + '/gimu'
initial_score = evaluate(individual)

print('|'.join(individual))
print()
print('translation:', initial_regex)
print('score:', initial_score)
print()

11|11|11|11|00|10|11|00|00|00|11|00|11|11|11|00|10|01|01|11|01|01|01|00|10|10|01|01|01|00|00|00|10|11|11|11|00

translation: /[:]+\s[a-z][:]\s+[:]\s[:]+\s[a-z]\d+[:]\d+\s[a-z]+\d+\s+[a-z][:]+\s/gimu
score: 0.1891891891891892



In [7]:
# Upper bound on the number of mutate-and-compare rounds.
number_of_iterations = 5000

# Each round mutates a copy of the current individual and keeps the copy only
# if it scores strictly higher than the current one.
for iteration in range(number_of_iterations):
    # Score of the current individual, recomputed at the start of every round.
    b_ = evaluate(individual)
    print('iteration:', iteration, '=', b_)
    
    # Stop as soon as every position of the target is matched.
    if (b_ >= 1):
        break
        
    new_individual = []
    for gene in individual:
        # One uniform draw per gene; below 0.08 the gene is replaced by a
        # freshly drawn one, which may turn out equal to the old gene.
        precentage = np.random.rand()
        if precentage < .08:
            new_gene = get_random_gene()
            print('\t- mutation:', gene, '->', new_gene, '@', precentage)
            gene = new_gene
        
        new_individual.append(gene)
    
    # new_individual is a plain list, so once a candidate is accepted
    # `individual` is no longer the ndarray built by random_individual().
    if evaluate(new_individual) > b_:
        individual = new_individual

iteration: 0 = 0.1891891891891892
	- mutation: 10 -> 00 @ 0.0764489367164306
	- mutation: 11 -> 00 @ 0.003544918730833513
	- mutation: 01 -> 01 @ 0.0627851141324155
	- mutation: 01 -> 11 @ 0.02243722847129015
iteration: 1 = 0.1891891891891892
	- mutation: 11 -> 10 @ 0.027553782235609114
	- mutation: 00 -> 11 @ 0.05889572994882597
iteration: 2 = 0.21621621621621623
	- mutation: 01 -> 00 @ 0.020747184736019775
iteration: 3 = 0.21621621621621623
	- mutation: 00 -> 01 @ 0.0024518921040099784
	- mutation: 10 -> 10 @ 0.020095481477815813
	- mutation: 11 -> 11 @ 0.06475491005185463
	- mutation: 11 -> 01 @ 0.021967812212981164
iteration: 4 = 0.21621621621621623
	- mutation: 11 -> 10 @ 0.01803518061745557
	- mutation: 10 -> 01 @ 0.04248122844187685
	- mutation: 11 -> 01 @ 0.06811862076022279
	- mutation: 01 -> 11 @ 0.002315619272616365
iteration: 5 = 0.24324324324324326
	- mutation: 11 -> 01 @ 0.042671447215887226
iteration: 6 = 0.24324324324324326
	- mutation: 11 -> 01 @ 0.03155954536554373
	-

	- mutation: 00 -> 11 @ 0.014564997370386967
	- mutation: 10 -> 01 @ 0.012363803993525213
iteration: 78 = 0.7027027027027027
	- mutation: 10 -> 01 @ 0.07715218301929061
	- mutation: 10 -> 11 @ 0.003533549013422843
	- mutation: 10 -> 10 @ 0.043335814760568026
	- mutation: 10 -> 00 @ 0.04737216269069433
	- mutation: 01 -> 10 @ 0.07893954316746188
	- mutation: 11 -> 01 @ 0.02220007419692782
	- mutation: 01 -> 11 @ 0.01569876174369611
iteration: 79 = 0.7027027027027027
	- mutation: 10 -> 10 @ 0.03219979859737687
	- mutation: 10 -> 11 @ 0.048138807052152344
iteration: 80 = 0.7027027027027027
	- mutation: 00 -> 11 @ 0.043872516084833335
	- mutation: 00 -> 01 @ 0.005412841622590925
	- mutation: 10 -> 11 @ 0.07799197403578895
	- mutation: 10 -> 11 @ 0.007154110111457168
iteration: 81 = 0.7027027027027027
	- mutation: 10 -> 10 @ 0.004513368619678437
	- mutation: 10 -> 01 @ 0.06920566704200914
	- mutation: 01 -> 01 @ 0.07817756015130584
	- mutation: 01 -> 11 @ 0.05134756755712866
	- mutation: 10

	- mutation: 00 -> 10 @ 0.025748729774077717
	- mutation: 00 -> 10 @ 0.07408835475253062
iteration: 173 = 0.8108108108108109
	- mutation: 01 -> 10 @ 0.04826591583452167
iteration: 174 = 0.8108108108108109
	- mutation: 10 -> 10 @ 0.06199033757614414
	- mutation: 10 -> 00 @ 0.040997212408590555
	- mutation: 11 -> 01 @ 0.035174989334567086
	- mutation: 00 -> 00 @ 0.0027086721283650084
	- mutation: 00 -> 01 @ 0.07797873701200142
iteration: 175 = 0.8108108108108109
	- mutation: 01 -> 10 @ 0.06056063055575234
iteration: 176 = 0.8108108108108109
	- mutation: 10 -> 10 @ 0.07186449758359004
	- mutation: 10 -> 00 @ 0.029097888419104656
	- mutation: 11 -> 10 @ 0.03185501655733469
	- mutation: 00 -> 10 @ 0.027034363803277617
iteration: 177 = 0.8378378378378378
	- mutation: 00 -> 01 @ 0.012049962123204194
	- mutation: 01 -> 01 @ 0.07599376610140363
	- mutation: 01 -> 10 @ 0.05918996857975167
	- mutation: 11 -> 11 @ 0.0011678253821864049
iteration: 178 = 0.8378378378378378
	- mutation: 10 -> 00 @ 0.

iteration: 271 = 0.918918918918919
	- mutation: 00 -> 10 @ 0.03828524196541694
	- mutation: 10 -> 11 @ 0.0064911752619704055
	- mutation: 01 -> 01 @ 0.07651620912089308
	- mutation: 00 -> 10 @ 0.05064012219513625
	- mutation: 01 -> 01 @ 0.07402447440137361
iteration: 272 = 0.918918918918919
	- mutation: 01 -> 10 @ 0.025958514090648865
iteration: 273 = 0.918918918918919
	- mutation: 10 -> 01 @ 0.015473426985022742
	- mutation: 01 -> 01 @ 0.07859760032393659
iteration: 274 = 0.918918918918919
	- mutation: 10 -> 11 @ 0.016806888636383133
	- mutation: 00 -> 10 @ 0.0748031590688194
	- mutation: 10 -> 01 @ 0.0593793295041668
iteration: 275 = 0.918918918918919
	- mutation: 10 -> 00 @ 0.04069436644309121
	- mutation: 10 -> 01 @ 0.06395135689620346
	- mutation: 01 -> 10 @ 0.006898697144925081
	- mutation: 10 -> 10 @ 0.015701575216675923
iteration: 276 = 0.918918918918919
	- mutation: 10 -> 01 @ 0.06703805604546664
	- mutation: 10 -> 01 @ 0.05573163282257232
	- mutation: 10 -> 11 @ 0.03571631746

	- mutation: 00 -> 10 @ 0.07683088628270762
iteration: 372 = 0.918918918918919
	- mutation: 01 -> 00 @ 0.04213306847675946
	- mutation: 01 -> 10 @ 0.05911999697191561
	- mutation: 10 -> 10 @ 0.03698035411310541
iteration: 373 = 0.918918918918919
	- mutation: 10 -> 11 @ 0.060743327232365196
	- mutation: 01 -> 10 @ 0.02809700969750928
iteration: 374 = 0.918918918918919
	- mutation: 10 -> 01 @ 0.036330025503730545
	- mutation: 01 -> 00 @ 0.04253386524072966
iteration: 375 = 0.918918918918919
	- mutation: 10 -> 10 @ 0.015597453699748298
	- mutation: 10 -> 01 @ 0.07065877430792977
iteration: 376 = 0.918918918918919
	- mutation: 00 -> 01 @ 0.016655441216306688
	- mutation: 10 -> 01 @ 0.0677170777001731
	- mutation: 10 -> 10 @ 0.04229950142003158
iteration: 377 = 0.918918918918919
	- mutation: 01 -> 11 @ 0.019625575944619555
	- mutation: 00 -> 10 @ 0.06251480060575831
	- mutation: 01 -> 01 @ 0.06636604436692117
	- mutation: 10 -> 10 @ 0.005347570596019091
	- mutation: 10 -> 11 @ 0.07350840459

	- mutation: 10 -> 01 @ 0.05866380206359134
iteration: 477 = 0.918918918918919
	- mutation: 00 -> 11 @ 0.015038854226592124
	- mutation: 01 -> 01 @ 0.035277514473008864
iteration: 478 = 0.918918918918919
iteration: 479 = 0.918918918918919
	- mutation: 00 -> 00 @ 0.06815885219301332
	- mutation: 00 -> 00 @ 0.06260033724902136
	- mutation: 10 -> 11 @ 0.01028344162705308
iteration: 480 = 0.918918918918919
	- mutation: 10 -> 10 @ 0.02347337071956035
	- mutation: 10 -> 01 @ 0.01678056027527386
	- mutation: 10 -> 11 @ 0.0038372763407833466
iteration: 481 = 0.918918918918919
	- mutation: 10 -> 01 @ 0.0033062739414986275
	- mutation: 01 -> 01 @ 0.033420488868196485
	- mutation: 01 -> 11 @ 0.0064391567237012115
	- mutation: 10 -> 10 @ 0.03265235374115438
iteration: 482 = 0.918918918918919
	- mutation: 10 -> 11 @ 0.07325446750380427
iteration: 483 = 0.918918918918919
	- mutation: 10 -> 01 @ 0.032781619702803355
	- mutation: 00 -> 00 @ 0.016281796672067084
	- mutation: 10 -> 11 @ 0.01347321246868

iteration: 577 = 0.9459459459459459
	- mutation: 10 -> 10 @ 0.06682724084906466
	- mutation: 10 -> 01 @ 0.009962238711443194
	- mutation: 10 -> 01 @ 0.013161418595235452
iteration: 578 = 0.9459459459459459
	- mutation: 10 -> 00 @ 0.053980581967913444
	- mutation: 01 -> 11 @ 0.062265779285367584
	- mutation: 11 -> 00 @ 0.00877651152261194
iteration: 579 = 0.9459459459459459
	- mutation: 10 -> 11 @ 0.03262648501594034
	- mutation: 10 -> 11 @ 0.06581974477826691
	- mutation: 10 -> 01 @ 0.00796815942718776
	- mutation: 00 -> 11 @ 0.07872930226441177
	- mutation: 01 -> 11 @ 0.07007548605305558
	- mutation: 01 -> 01 @ 0.0305975858021883
	- mutation: 10 -> 01 @ 0.017515935785065162
iteration: 580 = 0.9459459459459459
	- mutation: 10 -> 01 @ 0.03086315008587992
	- mutation: 00 -> 00 @ 0.06511814670080374
	- mutation: 11 -> 11 @ 0.05502461808621684
	- mutation: 10 -> 11 @ 0.06594564444841433
iteration: 581 = 0.9459459459459459
	- mutation: 10 -> 01 @ 0.025687969478820682
	- mutation: 10 -> 01 @

	- mutation: 10 -> 11 @ 0.04503007933844905
iteration: 681 = 0.972972972972973
iteration: 682 = 0.972972972972973
	- mutation: 01 -> 11 @ 0.02571571297252606
	- mutation: 10 -> 11 @ 0.028301597692161118
iteration: 683 = 0.972972972972973
	- mutation: 00 -> 00 @ 0.060697129771447034
	- mutation: 10 -> 10 @ 0.01578193395852867
iteration: 684 = 0.972972972972973
	- mutation: 10 -> 01 @ 0.05311007214721175
	- mutation: 01 -> 11 @ 0.034629686370954316
	- mutation: 00 -> 11 @ 0.0267904633021524
iteration: 685 = 0.972972972972973
	- mutation: 10 -> 00 @ 0.04350756268130651
	- mutation: 11 -> 00 @ 0.07776364600219454
	- mutation: 10 -> 11 @ 0.06179078697731688
iteration: 686 = 0.972972972972973
	- mutation: 10 -> 00 @ 0.0447704622793218
	- mutation: 10 -> 01 @ 0.07700520958104917
iteration: 687 = 0.972972972972973
	- mutation: 10 -> 10 @ 0.017329818219343207
	- mutation: 11 -> 00 @ 0.017673108412823324
	- mutation: 01 -> 11 @ 0.02457384581613653
	- mutation: 10 -> 00 @ 0.05923180760754532
iter

In [8]:
# Report the final regex next to the quoted target string and the final score.
final_regex = '/' + compress_expression(transform_to_regex(individual)) + '/gimu'
quoted_target = '"' + expected_match + '"'
print('compressed:', final_regex, '~', quoted_target, '~', evaluate(individual))

compressed: /[a-z]+\s[a-z]+\s+\d+[:]\d+[:]\d+\s+[a-z]+/gimu ~ "backstreets back   11:05:20   alright" ~ 1.0
