## Overview

In [1]:
# Step 0: load the iris training set and split off the label column 's'
import pandas as pd
train_data = pd.read_csv('datasets/iris.csv')
train_labels = train_data.pop('s')

# Step 1: import the karoo_gp building blocks
from karoo_gp import Model, Operators, Terminals

# Step 2: build the operator set and the terminal set
# (one terminal per remaining data column, plus a handful of constants)
operators = Operators.arithmetic()
terminals = Terminals(train_data.columns, constants=[.1, .2, .3, .4, .5])

# Step 3: create the model from the two sets
model = Model(operators, terminals)

# Step 4: train the model against the data and labels
model.train(train_data, train_labels)

# Step 5: inspect the result (last expression, so the notebook displays it)
model.fittest()

  return self.operators[type(node.op)](


<Tree: '0.2*pl**2/sw ...' fitness: 37.33>

## Classes Detail

### Hierarchy
```
Model
|-Solver
|-Population
   |-Terminals
   |  |-Terminal
   |
   |-Operators
   |  |-Operator
   |
   |-Trees
      |-Branch
         |-Operator/Terminal
         |-Branch (recursive)
```


### Operator
- A tree node/gene which is an instruction
  
### Operators
- An Operator set
- Initialize from a list `Operators(['+', '-'])` or with a constructor such as `Operators.logic()`
- Access elements with `ops.get()`

In [2]:
from karoo_gp import Operator, Operators

# One operator on its own: show its repr, then its individual attributes
op = Operator('+')
print(op)
print(op.symbol, op.op_type, op.arity)

# A pre-loaded operator set (sibling constructors: .logic(), .math())
ops = Operators.arithmetic()
print(ops)
print(ops.get())

# A custom operator set built from a list of symbols; show its last member
ops = Operators(['*', '/'])
print(ops.operators[-1])

<Operator: +(arithmetic)>
+ arithmetic 2
<Operators: 5(+-*/**)>
[<Operator: +(arithmetic)>, <Operator: -(arithmetic)>, <Operator: *(arithmetic)>, <Operator: /(arithmetic)>, <Operator: **(arithmetic)>]
<Operator: /(arithmetic)>


### Terminal
- A tree node/gene which is a variable or constant
  
### Terminals
- A Terminal set
- Initialize with a list of terminals and an optional list of constants
- Access elements with `terms.get()`

In [3]:
from karoo_gp import Terminal, Terminals

# A single terminal: repr first, then its symbol and type
term = Terminal('a')
print(term)
print(term.symbol, term.t_type)

# A terminal set mixing named variables with numeric constants
variable_names = ['a', 'b']
constant_values = [1, 2]
terms = Terminals(variable_names, constants=constant_values)
print(terms)
print(terms.variables['a'])  # variables are indexed by name
print(terms.constants[0])    # constants are indexed by position
print(terms.get())

<Terminal: a(<class 'float'>)>
a <class 'float'>
<Terminals: 4(ab, 12)>
<Terminal: a(<class 'float'>)>
<Terminal: 1(<class 'int'>)>
[<Terminal: a(<class 'float'>)> <Terminal: b(<class 'float'>)>
 <Terminal: 1(<class 'int'>)> <Terminal: 2(<class 'int'>)>]


### Branch
- Contains 1 node (Operator or Terminal) & optional parent, children
- Recursive methods: `generate()`, `parse()`, `n_children()`, `get_child()`, `mutate()` ...

In [4]:
# Demonstrates Branch: generate a branch, print it, then mutate it twice.
# Every random choice below draws from the same seeded `rng`, so the
# statement order determines the printed results.
from karoo_gp import Operators, Terminals, Branch
import numpy as np

# Fixed seed so the generated branch and the mutations are repeatable
rng = np.random.RandomState(1111)
operators = Operators(['+', '-', '*', '/'])
terminals = Terminals(['a', 'b', 'c'], constants=[.1, .2, .3])
tree_type = 'g'
depth = 3
branch = Branch.generate(rng, operators, terminals, tree_type=tree_type, depth=depth)
print(branch)
print(branch.parse())
print(branch.n_children())

# Non-recursive mutation applied to the root branch itself
# (in the recorded output a single operator changes).
branch.mutate(rng, operators, terminals, recursive=False)
print(branch.parse())

# Pick a child by index and mutate it recursively; the change shows up when
# the root is parsed again.
# NOTE(review): RandomState.randint excludes its upper bound, so the largest
# index drawn here is n_children() - 1. If get_child() is 1-based, the last
# child can never be selected - confirm the indexing convention.
random_child = rng.randint(1, branch.n_children())
child, _ = branch.get_child(random_child)  # get_child returns a pair; only the first item is used
child.mutate(rng, operators, terminals, recursive=True)
print(branch.parse())

<Branch: <Operator: +(arithmetic)>>
0.2+0.2-0.2*c+a
8
0.2+0.2-0.2*c-a
0.2+0.2-0.2*0.3-a


### Tree
- A wrapper around a root branch of arbitrary depth
- Attributes:
  - root (Branch)
  - fitness (float)
- Methods:
  - Initialize: `generate()`, `save()`, `load()`
  - Display: `parse()`, `sym()`, `depth()`
  - Evolve: `point_mutate`, `full_mutate`, `crossover`

In [5]:
# Demonstrates Tree: generate, display, mutate, and cross over.
# numpy is imported here because this cell calls np.random.RandomState;
# without it the cell only runs if an earlier cell already imported `np`.
import numpy as np

from karoo_gp import Operators, Terminals, Tree

# Fixed seed so the generated trees and the mutations are repeatable
rng = np.random.RandomState(12)
operators = Operators(['+', '-', '*', '/'])
terminals = Terminals(['a', 'b', 'c'], constants=[.1, .2, .3])
params = {'depth': 3, 'tree_type': 'f'}
tree = Tree.generate(rng, operators, terminals, params)

# Display methods
print(tree)
print(tree.sym())
print(tree.depth())
print(tree.parse())

# Mutate one node or one sub-branch (the tree is re-parsed after each call
# to show the effect)
tree.point_mutate(rng, operators, terminals)
print(tree.parse())
tree.full_mutate(rng, operators, terminals)
print(tree.parse())

# Crossover with another tree generated from the same rng and settings
mate = Tree.generate(rng, operators, terminals, params)
print(mate.parse())
offspring = tree.crossover(rng, mate)
print(offspring.parse())

<Tree: '0.8 - b'>
0.8 - b
4
0.3*b/0.1*0.1/b+0.2+0.3-b
0.3/b/0.1*0.1/b+0.2+0.3-b
0.3/b/c*0.1/b+0.2+0.3-b
0.2+0.3-b*0.1+0.1+b*a*c
0.1/b/c*0.1/b+0.2+0.3-b


### Population
- A wrapper for a group of Trees
- 'Layer' methods: `fitness`, `tournament`, `evolve`, `cull`

In [6]:
# Demonstrates Population: generate a set of trees, then run the cull /
# fitness / tournament / evolve steps by hand.
# numpy and pandas are imported here because this cell calls
# np.random.RandomState and pd.read_csv; without these imports it only runs
# if earlier cells already defined `np` and `pd`.
import numpy as np
import pandas as pd

from karoo_gp import Operators, Terminals, Population, NumpySolver

train_data = pd.read_csv('datasets/iris.csv')
train_labels = train_data.pop('s')

# Fixed seed so population generation and evolution are repeatable
rng = np.random.RandomState(12)
operators = Operators(['+', '-', '*', '/'])
# NOTE(review): the terminals are named 'a', 'b', 'c' here, whereas the other
# iris cells build them from train_data.keys() - confirm how these names are
# matched against the dataset columns when fitness is computed.
terminals = Terminals(['a', 'b', 'c'], constants=[.1, .2, .3])
params = {'population_size': 100,
          'criteria': 'fitness',
          'depth': 3,
          'tree_type': 'g'}

# Initialize
population = Population.generate(rng, operators, terminals, params)
print(population)
print(population.operators, population.terminals)
print(len(population.trees), population.trees[0])

# Evolve
solver = NumpySolver()
# cull: remove trees from the population (100 -> 63 in the recorded run);
# presumably those outside the min_nodes / max_depth limits - TODO confirm
population.cull(min_nodes=8, max_depth=5)
print(len(population.trees))
population.fitness(solver, train_data, train_labels)  # fitness: score trees against data
print(population.trees[0])
print(population.tournament(rng, tournament_size=7))
population.evolve(rng, tournament_size=7)  # evolve: create new generation from trees
# Same cull and fitness steps again, now applied to the new generation
population.cull(min_nodes=8, max_depth=5)
population.fitness(solver, train_data, train_labels)
print(population.trees[0])

<Population: 100 trees>
<Operators: 4(+-*/)> <Terminals: 6(abc, 0.10.)>
100 <Tree: '0.1 - 0.66666...'>
63
<Tree: '0.3*c - 0.14' fitness: 47.17>
<Tree: '0.03*c' fitness: 137.48>
<Tree: '0.01*a*c**2' fitness: 41.54>


### Model
- Methods:
  - `train()` - compile params into layers, pass to process
  - `process()` - execute a list of layers on population
  - `fittest()` - return the best-performing tree

In [7]:
from karoo_gp import Operators, Terminals, Model
import pandas as pd

# Load the iris data and split off the label column 's'
train_data = pd.read_csv('datasets/iris.csv')
train_labels = train_data.pop('s')

operators = Operators(['+', '-', '*', '/'])
terminals = Terminals(train_data.columns, constants=[1, 2, 3])

# Settings used when the model is constructed
params = dict(depth=3,
              tree_type='r',
              population_size=100,
              criteria='fitness')

# Initialize a model and show it, its population, and its current fittest tree
model = Model(operators, terminals, params=params, solver='numpy', seed=1234)
print(model)
print(model.population)
print(model.fittest())

# Train against dataset
# (`params` is rebound here: from this point on it holds the training settings)
params = {'generations': 3,
          'tournament_size': 7,
          'min_nodes': 5,
          'max_depth': 5}
model.train(train_data, train_labels, **params)
print(model.fittest())

# Process custom layers: each layer is a (name, positional-arguments) pair
evolve_args = (model.rng, params['tournament_size'])
cull_args = (params['min_nodes'], params['max_depth'])
fitness_args = (model.solver, train_data, train_labels)
layers = [('evolve', evolve_args),
          ('cull', cull_args),
          ('fitness', fitness_args)]
model.process(layers)
print(model.fittest())

<Model: <Population: 100 trees> Fittest: <Tree: '-3*pl/pw + 1 ...'>>
<Population: 100 trees>
<Tree: '-3*pl/pw + 1 ...'>
<Tree: 'pw' fitness: 39.9>
<Tree: 'pw**2/2' fitness: 36.04>


## Playground

In [9]:
import sys, time
import pandas as pd
from karoo_gp import Terminals, Operators, Model


# Helper: stopwatch state shared across calls (reference point for the next lap)
last_time = time.perf_counter()
def _log_time():
    """Return the seconds elapsed since the previous call, rounded to 5 places.

    The first call measures from the moment `last_time` was initialised.
    Each call resets the module-level `last_time` to the current reading, so
    consecutive calls report consecutive, non-overlapping intervals.
    """
    global last_time
    previous, last_time = last_time, time.perf_counter()
    return round(last_time - previous, 5)


# Load Data
# Point dataset_path at one of the other files to experiment:
#   datasets/data_kepler.csv, datasets/diabetes.csv, datasets/iris.csv, datasets/wine.csv
# (presumably each carries its labels in a column named 's' - verify before switching)
dataset_path = 'datasets/cancer.csv'
train_data = pd.read_csv(dataset_path)
train_labels = train_data.pop('s')  # remaining columns become the terminals below
operators = Operators(['+', '-', '*', '/'])
terminals = Terminals(train_data.columns, constants=[.1, .2, .3, .4, .5])


# Initialize Model
params = {
    'depth': 5,
    'tree_type': 'r',
    'population_size': 100,
    'criteria': 'fitness',
}
# NOTE(review): the solver name is spelled 'Numpy' here but 'numpy' in the
# Model demo cell - confirm the lookup is case-insensitive.
model = Model(operators, terminals, params, solver='Numpy', seed=666)
# The timer started when the helper was defined, so this figure also covers
# loading the data above.
init_seconds = _log_time()
print(f"Initialized model in {init_seconds}s")


# Train
evolution_params = {
    'duplicate': 0.1,
    'point_mutate': 0.1,
    'branch_mutate': 0.2,
    'crossover': 0.6,
}
params = {
    'generations': 10,
    'tournament_size': 7,
    'min_nodes': 5,
    'max_depth': 7,
    'evolution_params': evolution_params,
}
# Three consecutive train() calls on the same model; after each one, report
# the running generation count, the time taken, and the current fittest tree.
for i in range(3):
    model.train(train_data, train_labels, **params)
    generations_done = (i + 1) * params['generations']
    elapsed = _log_time()
    print(f"Gen {generations_done}: {elapsed}s: {model.fittest()}")

    
# Summarize
# NOTE(review): sys.getsizeof reports only the cache container's own size, not
# the objects it references, so this figure likely understates the footprint.
cache_bytes = sys.getsizeof(model.solver.cache)
print("Cache size:", cache_bytes/1e6, 'mb')
print("Fittest:", model.fittest().parse())
# Last expression of the cell, so the notebook displays its value
model.fittest().sym()

Initialized model in 0.28646s
Gen 10: 3.91657s: <Tree: '-2*meaconpoi ...' fitness: 164.29>
Gen 20: 3.20326s: <Tree: '-2*meaconpoi ...' fitness: 150.03>
Gen 30: 3.21846s: <Tree: '-3*meaconpoi ...' fitness: 148.69>
Cache size: 0.073816 mb
Fittest: 0.2-meaconpoi-worcon+0.4-meaconpoi+0.4+measym-meaconpoi


-3*meaconpoi + measym - worcon + 1.0