# GP Multiple runs on an arbitrary Classification Problem.

In this notebook we tackle classification with GP. It isn't that straightforward to create a notebook that is general enough to be applied to an classification problem as some will have different types. For now, we're going to focus on binary classification with numeric types, and will look at mixed types when we move to Grammatical Evolution.

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


Install DEAP. 

In [2]:
!pip install deap

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


Import our tools. 

In [3]:
import random
import operator
import csv
import itertools
import math

import numpy

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

import csv
from elitism import eaSimpleWithElitism


import matplotlib.pyplot as plt

import networkx as nx



Set our Genetic Programming parameters, one of which is now the number of runs.

In [4]:
# Genetic Programming constants:
POPULATION_SIZE = 200
P_CROSSOVER = 0.9
P_MUTATION = 0.01
MAX_GENERATIONS = 50
HALL_OF_FAME_SIZE = 10

N_RUNS = 30




Set the random seed. 

In [5]:
RANDOM_SEED = 412
random.seed(RANDOM_SEED)

GP-Specific constants.

In [6]:
MIN_TREE_HEIGHT = 5
MAX_TREE_HEIGHT = 12
LIMIT_TREE_HEIGHT = 17
MUT_MIN_TREE_HEIGHT = 0
MUT_MAX_TREE_HEIGHT = 2

Read in the data.

In [7]:
with open("spambase.csv") as classificationData:
    n_rows = sum(1 for line in classificationData)
with open("spambase.csv") as classificationData:
    reader = csv.reader(classificationData)
    data = list(list(float(elem) for elem in row) for row in reader)

This is a helpful function to turn numeric values into Boolean. We're going to assume that a value greater than 0 is **True** and anything else is **False**.

In [8]:
def turnBool(pred):
    return (pred>0)

Define our fitness function. This time there's an extra step. First we get all our predictions, which will be in numeric form, then we convert them to Boolean, and only then do we evaluate the fitness.

In [9]:
def evalClassification(individual):
    # Transform the tree expression in a callable function
    func = toolbox.compile(expr=individual)
    #predictions = map(func,data)
    spam_samp = random.sample(data, 400)
    result = sum(bool(func(*mail[:57])) is bool(mail[57]) for mail in spam_samp)
    nodes, edges, labels = gp.graph(individual)
    return result, len(nodes)

Define a protected division function.

In [10]:
def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

Add our functions and terminals. 

In [11]:
pset = gp.PrimitiveSet("MAIN", 57) # number of inputs!!!
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(protectedDiv, 2)
pset.addPrimitive(operator.neg, 1)

pset.addEphemeralConstant("rand101", lambda: random.random())


Create our toolbox. This is very similar to the Symbolic Regression notebook except we are using the parameters declared up above.

In [12]:
toolbox = base.Toolbox()

#creator.create("FitnessMin", base.Fitness, weights=(-1.0,-1.0))
#creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)


toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=MIN_TREE_HEIGHT, max_=MAX_TREE_HEIGHT)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

toolbox.register("evaluate", evalClassification)
#toolbox.register("select", tools.selNSGA2)
toolbox.register("select", tools.selTournament, tournsize=5)

toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=5)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))

Create our statistics. These are a bit more complex than the GA ones because we want to keep track of fitness and size for all runs.

In [13]:
maxListFitness = []
avgListFitness = []
minListFitness = []
stdListFitness = []

maxListSize = []
avgListSize = []
minListSize = []
stdListSize = []

Conduct **N_RUNS** times. 

In [None]:
for r in range(0, N_RUNS):
    population = toolbox.population(n=POPULATION_SIZE)
    # define the hall-of-fame object:
    hof = tools.HallOfFame(HALL_OF_FAME_SIZE)   


    # Create our statistics
    stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
    stats_size = tools.Statistics(len)
    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    mstats.register("avg", numpy.mean)
    mstats.register("std", numpy.std)
    mstats.register("min", numpy.min)
    mstats.register("max", numpy.max)
    
    
    # Which run are we on?
    print("\n\nCurrently on run", r, "of",N_RUNS)
    
    
    # It's usually a good idea to turn off verbose when conducting multiple runs
    population, logbook = eaSimpleWithElitism(population,
                                                  toolbox,
                                                  cxpb=P_CROSSOVER,
                                                  mutpb=P_MUTATION,
                                                  ngen=MAX_GENERATIONS,
                                                  stats=mstats,
                                                  halloffame=hof,
                                                  verbose=False)
    
    #maxFitnessValues, meanFitnessValues = logbook.chapters['fitness'].select("min", "avg")
    meanFitnessValues, stdFitnessValues, minFitnessValues, maxFitnessValues  = logbook.chapters['fitness'].select("avg", "std", "min", "max")
    meanSizeValues, stdSizeValues, minSizeValues, maxSizeValues  = logbook.chapters['size'].select("avg", "std", "min", "max")


    # Save statistics for this run:
    avgListFitness.append(meanFitnessValues)
    stdListFitness.append(stdFitnessValues)
    minListFitness.append(minFitnessValues)
    maxListFitness.append(maxFitnessValues)
    
    avgListSize.append(meanSizeValues)
    stdListSize.append(stdSizeValues)
    minListSize.append(minSizeValues)
    maxListSize.append(maxSizeValues)

    # print info for best solution found:
    best = hof.items[0]
    print("-- Best Individual = ", best)
    print("-- length={}, height={}".format(len(best), best.height))
    print("-- Best Fitness = ", best.fitness.values[0])





Currently on run 0 of 30
-- Best Individual =  mul(neg(add(ARG31, mul(mul(mul(mul(add(add(ARG23, ARG51), ARG51), protectedDiv(mul(protectedDiv(mul(neg(ARG15), protectedDiv(ARG49, 0.19661718882847368)), ARG40), protectedDiv(ARG49, neg(ARG32))), ARG40)), protectedDiv(ARG23, ARG40)), protectedDiv(ARG23, ARG40)), protectedDiv(ARG52, ARG40)))), sub(add(sub(add(ARG52, ARG39), sub(ARG39, ARG52)), mul(sub(ARG55, mul(add(ARG23, ARG51), protectedDiv(ARG23, ARG40))), ARG23)), add(mul(sub(ARG55, protectedDiv(mul(sub(ARG55, sub(ARG1, add(sub(add(ARG37, ARG31), sub(ARG39, ARG52)), mul(add(ARG23, ARG51), protectedDiv(ARG23, ARG40))))), ARG22), ARG40)), mul(sub(add(ARG40, ARG31), sub(mul(add(sub(add(ARG37, ARG31), mul(sub(ARG55, ARG52), ARG51)), ARG52), protectedDiv(ARG49, 0.19661718882847368)), ARG52)), ARG23)), ARG15)))
-- length=108, height=12
-- Best Fitness =  368.0


Currently on run 1 of 30
-- Best Individual =  neg(mul(add(mul(protectedDiv(ARG22, ARG25), add(add(mul(protectedDiv(mul(mul(ARG1

-- Best Individual =  add(neg(mul(add(ARG52, add(add(ARG52, add(neg(neg(mul(mul(ARG21, ARG54), add(ARG24, neg(ARG22))))), ARG15)), neg(neg(mul(add(add(ARG15, ARG22), add(ARG22, ARG15)), add(ARG51, mul(add(ARG52, add(mul(mul(ARG21, ARG54), neg(mul(add(add(ARG6, ARG15), add(ARG9, ARG22)), add(ARG51, neg(ARG52))))), ARG15)), add(ARG51, ARG22)))))))), add(ARG51, ARG51))), ARG6)
-- length=61, height=17
-- Best Fitness =  371.0


Currently on run 14 of 30
-- Best Individual =  protectedDiv(sub(mul(protectedDiv(neg(mul(ARG3, ARG3)), neg(ARG24)), add(add(ARG52, add(ARG15, ARG6)), add(ARG15, ARG6))), neg(neg(mul(ARG3, ARG53)))), add(sub(add(mul(ARG5, ARG5), ARG52), add(mul(add(sub(neg(ARG18), neg(neg(0.7017024104430308))), ARG7), add(sub(ARG7, ARG24), ARG1)), mul(ARG52, neg(ARG18)))), 0.17171326449514446))
-- length=51, height=9
-- Best Fitness =  371.0


Currently on run 15 of 30
-- Best Individual =  mul(mul(add(neg(protectedDiv(ARG0, protectedDiv(mul(ARG23, ARG39), ARG26))), protectedDiv(ARG

-- Best Individual =  neg(neg(mul(neg(add(sub(mul(neg(protectedDiv(neg(add(neg(ARG11), neg(ARG27))), sub(sub(sub(ARG54, ARG26), add(ARG34, ARG18)), add(neg(ARG49), add(ARG36, ARG20))))), add(protectedDiv(protectedDiv(neg(add(ARG17, ARG26)), protectedDiv(sub(ARG55, ARG22), add(ARG33, ARG35))), add(add(protectedDiv(ARG46, ARG26), neg(ARG5)), add(protectedDiv(ARG25, ARG11), neg(ARG2)))), protectedDiv(protectedDiv(mul(protectedDiv(ARG15, ARG0), add(ARG23, ARG55)), add(protectedDiv(ARG9, ARG11), neg(ARG11))), neg(mul(sub(ARG20, ARG24), add(ARG22, ARG28)))))), mul(protectedDiv(protectedDiv(protectedDiv(add(ARG34, ARG9), add(protectedDiv(ARG4, ARG35), add(ARG28, ARG20))), sub(neg(sub(ARG11, ARG30)), neg(protectedDiv(ARG43, ARG34)))), protectedDiv(add(sub(neg(ARG43), protectedDiv(ARG15, ARG15)), sub(neg(ARG16), neg(ARG14))), protectedDiv(sub(add(ARG7, ARG41), sub(ARG7, ARG34)), mul(mul(neg(mul(ARG5, ARG54)), ARG50), neg(ARG15))))), mul(add(protectedDiv(protectedDiv(sub(ARG17, ARG25), sub(ARG0,

-- Best Individual =  mul(protectedDiv(add(mul(mul(neg(protectedDiv(neg(ARG11), mul(ARG5, ARG39))), sub(sub(mul(ARG7, ARG6), neg(ARG17)), add(sub(sub(neg(add(neg(protectedDiv(ARG51, add(ARG44, ARG25))), ARG25)), neg(ARG17)), add(neg(add(mul(neg(ARG11), mul(ARG5, ARG39)), sub(ARG6, ARG42))), ARG15)), sub(ARG6, ARG42)))), ARG6), ARG15), protectedDiv(neg(ARG38), neg(ARG52))), mul(add(neg(neg(add(ARG9, ARG22))), protectedDiv(neg(protectedDiv(ARG47, protectedDiv(neg(protectedDiv(ARG47, ARG36)), neg(add(neg(ARG9), ARG5))))), neg(add(neg(ARG9), ARG25)))), mul(neg(protectedDiv(neg(sub(sub(mul(ARG7, ARG6), neg(ARG17)), add(neg(ARG51), ARG15))), add(ARG44, ARG25))), sub(mul(ARG7, ARG6), add(neg(ARG51), ARG6)))))
-- length=108, height=14
-- Best Fitness =  370.0


Currently on run 29 of 30


Create our graphs using the averages across all the runs. 

In [None]:
# Genetic Programming is done (all runs) - plot statistics:
x = numpy.arange(0, MAX_GENERATIONS+1)
avgArray = numpy.array(avgListFitness)
stdArray = numpy.array(stdListFitness)
minArray = numpy.array(minListFitness)
maxArray = numpy.array(maxListFitness)
plt.xlabel('Generation')
plt.ylabel('Fitness')
plt.title('Best and Average Fitness for Symbolic Regression')
#plt.errorbar(x, avgArray.mean(0), yerr=stdArray.mean(0),label="Average",color="Red")
plt.errorbar(x, minArray.mean(0), yerr=minArray.std(0),label="Best", color="Green")
plt.show()

Show the graph for size.

In [None]:
# Genetic Programming is done (all runs) - plot statistics:
x = numpy.arange(0, MAX_GENERATIONS+1)
avgArray = numpy.array(avgListSize)
stdArray = numpy.array(stdListSize)
minArray = numpy.array(minListSize)
maxArray = numpy.array(maxListSize)
plt.xlabel('Generation')
plt.ylabel('Size')
plt.title('Best and Average Size for Symbolic Regression')
plt.errorbar(x, avgArray.mean(0), yerr=stdArray.mean(0),label="Average",color="Red")
plt.errorbar(x, minArray.mean(0), yerr=minArray.std(0),label="Best", color="Blue")
plt.show()