## Proof of Concept
- Goal: generate a regular expression that matches one particular string, and nothing more.

In [1]:
%config IPCompleter.greedy=True

In [2]:
import warnings

# Install a filter that ignores all warnings from this point on.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries; consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [3]:
import re
import numpy as np

# Report the versions of the libraries this notebook relies on.
for label, module in (('re:', re), ('numpy:', np)):
    print(label, module.__version__)

re: 2.2.1
numpy: 1.16.3


In [4]:
import sys

sys.path.append('..')

from package.ga import BinaryGeneFactory
from package.transformer import IntegerToBinaryString, StringToMapping, KeyArrayToRegex

In [5]:
# Target string that the generated regex has to match.
expected_match = 'backstreets back   11:05:20   alright'
# Length of the target, computed once and reused by the cells below.
length_of_expected_match = len(expected_match)

In [6]:
# Bounds of the integer range genes are drawn from. The "3 + 1" spelling
# suggests the upper bound is exclusive - TODO confirm against BinaryGeneFactory.
binary_start, binary_end = 0, 3 + 1

# Each two-character bit string selects one regex character class.
regex_class_by_bits = {
    '00': r'\s',
    '01': r'\d',
    '10': r'[a-z]',
    '11': r'[:]',
}
string_mapper = StringToMapping.StringToMapping(regex_class_by_bits)
to_regex = KeyArrayToRegex.KeyArrayToRegex(string_mapper)

# NOTE(review): the argument 2 presumably is the bit width of a gene (the
# mapping keys above are two characters wide) - confirm in BinaryGeneFactory.
gene_factory = BinaryGeneFactory.BinaryGeneFactory(2)

In [7]:
def evaluate_fitness(individual):
    """Return the fraction of target characters matched by ``individual``.

    ``individual`` is converted with ``to_regex.transform_to_array`` and the
    i-th resulting pattern is tried with ``re.match`` against the i-th
    character of ``expected_match``.

    Args:
        individual: sequence of genes accepted by
            ``to_regex.transform_to_array``.

    Returns:
        The number of positions whose pattern matches, divided by
        ``length_of_expected_match`` (1.0 when every character of the target
        is matched).
    """
    transformed = to_regex.transform_to_array(individual)
    # zip() pairs patterns with characters and stops at the shorter sequence,
    # so an individual longer than the target cannot raise IndexError.
    matched_positions = sum(
        re.match(pattern, character) is not None
        for pattern, character in zip(transformed, expected_match)
    )
    return matched_positions / length_of_expected_match

In [8]:
# Draw the starting individual, requesting length_of_expected_match genes
# from the range declared by binary_start / binary_end.
# NOTE(review): no random seed is set, so the starting point differs per run.
individual = gene_factory.create_many(binary_start, binary_end, length_of_expected_match)

print('binary:', '|'.join(individual))
# transform_and_compress() already returns a string (the final cell
# concatenates it with str), so no extra ''.join() is needed.
print('regex: ', to_regex.transform_and_compress(individual))

binary: 00|00|00|00|10|00|10|00|11|01|00|11|00|10|11|11|00|11|01|11|00|01|10|01|10|11|11|00|10|11|00|00|11|01|01|01|01
regex:  \s+[a-z]\s[a-z]\s[:]\d\s[:]\s[a-z][:]+\s[:]\d[:]\s\d[a-z]\d[a-z][:]+\s[a-z][:]\s+[:]\d+


In [9]:
log_progress = False
number_of_iterations = 5000
# Probability that any single gene is re-drawn when building a candidate.
mutation_rate = .08

# NOTE(review): no random seed is set, so every run takes a different path.
# The current individual's fitness only changes when a candidate is accepted,
# so it is evaluated once here and updated on acceptance.
fitness = evaluate_fitness(individual)

for iteration in range(number_of_iterations):
    if log_progress:
        print('iteration:', iteration, '=', fitness)

    if fitness >= 1:
        # Every character of the target is matched; stop early.
        break

    # Copy the current individual, re-drawing each gene with probability
    # mutation_rate.
    new_individual = [
        gene_factory.create(binary_start, binary_end)
        if np.random.rand() < mutation_rate
        else gene
        for gene in individual
    ]

    # Keep the candidate only when it is strictly better.
    new_fitness = evaluate_fitness(new_individual)
    if new_fitness > fitness:
        individual, fitness = new_individual, new_fitness

In [10]:
# Show the evolved pattern in /pattern/flags notation next to the quoted
# target string and the fitness it reaches.
pattern_literal = '/' + to_regex.transform_and_compress(individual) + '/gimu'
quoted_target = '"' + expected_match + '"'
final_fitness = evaluate_fitness(individual)

print('compressed:', pattern_literal, '~', quoted_target, '~', final_fitness)

compressed: /[a-z]+\s[a-z]+\s+\d+[:]\d+[:]\d+\s+[a-z]+/gimu ~ "backstreets back   11:05:20   alright" ~ 1.0
