## Proof of Concept
- generating a regex expression
- individual has a dynamic length, can grow / shrink, encouraging growth in fitness method
- individual finds a distinct regex to produce the desired value.

In [1]:
%config IPCompleter.greedy=True

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
import re
import numpy as np

print('re:', re.__version__)
print('numpy:', np.__version__)

re: 2.2.1
numpy: 1.16.3


In [4]:
import sys

sys.path.append('..')

from package.ga import BinaryGeneFactory, AbstractFitness, SimpleHillClimber
from package.transformers import IntegerToBinaryString, StringToMapping, KeyArrayToRegex

In [5]:
%%html
<h4>1. text, expected text</h4>

In [6]:
## 1. text -> '{expected string}' within,
expected_number = 0.55
text = 'ab mn gd: 0.33\n' \
     + 'cd mn gd: 0.44\n' \
     + 'de mn gd: 0.55\n' \
     + 'fg mn gd: 0.66\n' \

static_ending = r'\s([\d.]*\d[.\d]*)\b'

print(text)

ab mn gd: 0.33
cd mn gd: 0.44
de mn gd: 0.55
fg mn gd: 0.66



In [7]:
%%html
<h4>2. setup</h4>

In [8]:
consts = 'abcdefghijklmnopqrstuvwxyz'
regexes = [
    r'\s',
    r'\d',
    r'[a-z]',
    r'[:]',
    r'[!?.]',
    r'[0-9]'
]

complete_set = [ c for c in consts ] + regexes

binary_start = 0
binary_end = len(complete_set) -1 # hard end, values < binary_end
 
integer_to_binary_transformer = IntegerToBinaryString.IntegerToBinaryString(5)
gene_factory = BinaryGeneFactory.BinaryGeneFactory(binary_start, binary_end, 5)

binary_to_regex = {}
for i in range(binary_end):
    key = integer_to_binary_transformer.transform(i)
    binary_to_regex[key] = complete_set[i]

string_mapper = StringToMapping.StringToMapping(binary_to_regex)
to_regex = KeyArrayToRegex.KeyArrayToRegex(string_mapper)

In [9]:
class Fitness(AbstractFitness.AbstractFitness):
    to_regex = None
    expected_match = None
    
    def __init__(self, to_regex, expected_match, text):
        self.to_regex = to_regex
        self.expected_match = expected_match
        self.text = text
        
        super()


    previous_length = -1
    previous_fitness = -1
    
    def evaluate_genes(self, individual, display_logging = False):
        regex = ''
        fitness = 0
        length_of_individual = len(individual)
        reverse = np.flip(self.to_regex.transform_to_array(individual))
        for i, regex_item in enumerate(reverse):
            temp_regex = regex_item + regex
            
            ## encourage individual regex correctness,
            pattern = re.compile(temp_regex + static_ending, re.IGNORECASE)
            matches = pattern.findall(text)
            if len(matches) > 0:
                fitness += (( 1 - (i / length_of_individual) ) / length_of_individual)
            else:
                ## when item is wrong,
                temp_regex = '(.|\s)' + regex
            
            regex = temp_regex
            
        perfect_score = np.array([ 
            (( 1 - (i / length_of_individual) ) / length_of_individual)
            for i 
            in range(length_of_individual) ]
        ).sum()
        
        return fitness / perfect_score
    
    def evaluate_individual(self, individual, display_logging = False):
        regex = self.to_regex.transform(individual) + static_ending
        pattern = re.compile(regex, re.IGNORECASE)
        
        fitness = 0
        
        # encourage matches, but less is better.
        matches = pattern.findall(text)
        if len(matches) == 1:
            fitness += ( 1 / len(matches) )
        
        # encourage the only match being the correct match.
        match = pattern.search(text)
        if match is not None and float(match.group(1)) == self.expected_match:
            fitness += 1
            
        return fitness
        
        
    def evaluate(self, individual, display_logging = False):
        new_fitness = 0.0
        
        new_fitness += self.evaluate_genes(individual, display_logging)
        new_fitness += self.evaluate_individual(individual, display_logging)
        
        new_length = len(individual)
        previous_fitness = self.previous_fitness
        self.previous_fitness = new_fitness
        
        if previous_fitness == new_fitness:
            if self.previous_length <= new_length:
                new_fitness += 1 # encourage growth over shrinking,
        
        self.previous_length = new_length
        
        return new_fitness / 4
    
fitness_evaluator = Fitness(to_regex, expected_number, text)

In [10]:
def gene_mutator(gene, display_logging = False):
    precentage = np.random.rand()
    if precentage < .08:
        new_gene = gene_factory.create()
        gene = new_gene
        
    return gene

def individual_height_mutator(individual, display_logging = False):
    precentage = np.random.rand()
    if precentage < .10:
        gene = gene_factory.create()
        individual = [gene] + individual # grow to the left,
            
    length = len(individual)
    if precentage > .90 and length > 0:
        individual = individual[1:] # remove from the left,
        
    return individual

hill_climber = SimpleHillClimber.SimpleHillClimber(fitness_evaluator, [ gene_mutator ], [ individual_height_mutator ])

In [11]:
%%html
<h4>3. create individual</h4>

In [12]:
individual = gene_factory.create_many(12)

print('binary:', '|'.join(individual))
print('regex: ', '/'+ ''.join(to_regex.transform_and_compress(individual)) + static_ending + '/gimu')

binary: 00111|01010|10000|10100|01011|01101|01110|01101|10001|01011|10110|00001
regex:  /hkqulnonrlwb\s([\d.]*\d[.\d]*)\b/gimu


In [13]:
%%html
<h4>4. run</h4>

In [14]:
number_of_iterations = 10000
result = hill_climber.run(individual, number_of_iterations, False)

final_individual = result[0]
final_fitness = result[1]
final_iteration = result[2]

print(
    'compressed:',
    '/' + to_regex.transform_and_compress(final_individual) + static_ending + '/gimu',
    '~',
    final_fitness,
    '~',
    final_iteration
)

print(
    'original:',
    '/' + to_regex.transform_and_compress(individual) + '/gimu',
)

compressed: /[a-z]e\s[a-z]n\s[a-z]d[:]\s([\d.]*\d[.\d]*)\b/gimu ~ 1.0 ~ 9188
original: /hkqulnonrlwb/gimu


In [15]:
match = re.compile(
    to_regex.transform_and_compress(final_individual) + static_ending,
    re.IGNORECASE
).search(text)

if match is not None:
    print(match.group(0), '=>', match.group(1))

de mn gd: 0.55 => 0.55
