In [1]:
import pandas as pd
import numpy as np
import os
import time
import random
import pickle

# MILP Optimization API
from pyscipopt import Model

# Genetic algorithm package
import leap_ec
from leap_ec.decoder import IdentityDecoder
from leap_ec.representation import Representation
from leap_ec import ops
from leap_ec.real_rep.ops import mutate_gaussian
from leap_ec.real_rep.initializers import create_real_vector
from leap_ec.problem import FunctionProblem
from leap_ec.distrib import DistributedIndividual, asynchronous
from leap_ec.distrib.probe import log_worker_location

# Asynchronous or distributed workers
from dask.distributed import Client, LocalCluster

#Import helper functions
from opt_functions import *

In [2]:
# Import relevant data for optimization

years = ["2014", "2015", "2016", "2017", "2018", "2019", "2021", "2022"]  # Years to be analyzed

# Import data frames with probabilities, chalk, and actual outcomes
cwd = os.getcwd()
# Build the sub-path from os.sep instead of a hand-written "\Data\\..." literal:
# the old literal relied on the invalid escape "\D" and only worked on Windows.
# On Windows this evaluates to the same string as before.
file_path = os.sep + os.path.join("Data", "final_bracket_data") + os.sep
data_frames = {}
for year in years:
    data_frames[year] = pd.read_csv(cwd + file_path + year + ".csv")

#Record the scores for the chalk brackets
chalk_scores = {"2014": 690,
                "2015": 860,
                "2016": 990,
                "2017": 700,
                "2018": 1140,
                "2019": 950,
                "2021": 870,
                "2022": 790}

# Extract the most recent best distances
# NOTE(review): pickle.load executes arbitrary code from the file; only load a
# best_d.pkl that was produced by this project.
# The with-block closes the file on exit, so no explicit f.close() is needed.
with open('best_d.pkl', 'rb') as f:
    best_d = pickle.load(f)

#Set up bounds for individual initialization using best distance vector as baseline
# Each gene gets a +/- 0.001 window around its current best value.
init_bounds = [(d - 0.001, d + 0.001) for d in best_d ]
print(best_d)

[0.9984233  0.90651475 0.92809081 0.92713666 0.81409037 0.98628657]


In [3]:
# Genetic algorithm fitness functions
def fitness_avg(phenome):
    """Return the mean bracket score over every year in ``data_frames``.

    ``phenome`` is forwarded unchanged as the second argument of
    ``opt_bracket`` for each year's data frame; the resulting bracket is
    scored with ``brack_score`` and the scores are averaged.
    """
    total_score = 0
    for frame in data_frames.values():
        output, x = opt_bracket(frame, phenome)
        picks = get_results(output, x)
        total_score += brack_score(picks, frame)
    return total_score / len(data_frames)


def fitness_avg_diff(phenome):
    """Return the mean margin (bracket score minus chalk score) over all years.

    For each year the bracket built by ``opt_bracket`` from ``phenome`` is
    scored with ``brack_score`` and compared with that year's entry in
    ``chalk_scores``; the per-year margins are averaged.
    """
    total_margin = 0
    for season, frame in data_frames.items():
        output, x = opt_bracket(frame, phenome)
        picks = get_results(output, x)
        total_margin += brack_score(picks, frame) - chalk_scores[season]
    return total_margin / len(data_frames)


def fitness_num_better(phenome):
    """Count the years in which the optimized bracket strictly beats chalk.

    A year counts only when ``brack_score`` for the bracket built from
    ``phenome`` is greater than that year's ``chalk_scores`` entry; ties do
    not count.
    """
    wins = 0
    for season, frame in data_frames.items():
        output, x = opt_bracket(frame, phenome)
        picks = get_results(output, x)
        beats_chalk = brack_score(picks, frame) > chalk_scores[season]
        if beats_chalk:
            wins += 1
    return wins

def fitness_sigmoid(phenome):
    """Sum ``sigmoid(margin, 25)`` over all years, where margin = score - chalk.

    The margin for a year is the ``brack_score`` of the bracket built from
    ``phenome`` minus that year's ``chalk_scores`` entry.
    NOTE(review): ``sigmoid`` is not defined in this notebook; presumably it
    comes from ``opt_functions`` and 25 is its scale parameter -- confirm.
    """
    squashed_total = 0
    for season, frame in data_frames.items():
        output, x = opt_bracket(frame, phenome)
        picks = get_results(output, x)
        margin = brack_score(picks, frame) - chalk_scores[season]
        squashed_total += sigmoid(margin, 25)
    return squashed_total

In [None]:
#Asynchronous distributed genetic algorithm

MAX_BIRTHS = 128
INIT_POP_SIZE = 8
POP_SIZE = 8

# Context managers release the worker log, the Dask client and the local
# cluster even if the run raises part-way through. The previous version
# leaked the file handle on error and called client.shutdown() on a client
# that the with-block had already closed.
with open("test.csv", "w") as file:
    tic = time.perf_counter()
    with LocalCluster() as cluster, Client(cluster) as client:
        final_pop = asynchronous.steady_state(
            client,  # dask client
            births=MAX_BIRTHS,
            init_pop_size=INIT_POP_SIZE,
            pop_size=POP_SIZE,

            representation=Representation(
                decoder=IdentityDecoder(),  # Genotype and phenotype are the same for this task
                # Alternative start: sample every gene from the full (0.8, 1) range
                #initialize=create_real_vector(bounds=[(0.8, 1)]*6),
                # Current start: sample each gene inside its narrow window around best_d
                initialize=create_real_vector(bounds=init_bounds),
                individual_cls=DistributedIndividual),

            problem=FunctionProblem(fitness_sigmoid, maximize=True),

            offspring_pipeline=[
                ops.tournament_selection,  # Select parents via tournament selection
                ops.clone,                 # Copy them (just to be safe)
                ops.uniform_crossover,     # Uniform crossover (swap probability left at the library default)
                mutate_gaussian(std=.1/3, expected_num_mutations=2, hard_bounds=(0.8, 1)),  # Gaussian mutation, clipped to (0.8, 1)
                ops.pool(size=1),          # Collect offspring into a new population
            ],

            evaluated_probe=log_worker_location(file))
    # Timing covers cluster start-up, the evolutionary run, and cluster teardown.
    toc = time.perf_counter()

print("Solve time was:{} seconds".format(toc-tic))

In [None]:
# Retrieve the best individual
# Build the fitness list once and locate the first individual with the maximum.
fitnesses = [ind.fitness for ind in final_pop]
max_ind = fitnesses.index(max(fitnesses))
best_ind = final_pop[max_ind]
print(max_ind)
print(best_ind.genome)
print(best_ind.fitness)
# The original called fitness5, which is not defined in this notebook and raised
# NameError. Re-evaluate with fitness_sigmoid, the objective the GA optimized.
# NOTE(review): swap in another fitness_* function if a different metric was intended.
print(fitness_sigmoid(best_ind.genome))

In [4]:
# See performance
# For every year, build the bracket from the current best distance vector and
# print its score next to the recorded chalk score.
d = best_d
for year in years:
    frame = data_frames[year]
    output, x = opt_bracket(frame, d)
    results = get_results(output, x)
    print(year + ":")
    d_score = brack_score(results, frame)
    print("d score is: {}".format(d_score))
    chalk_score = chalk_scores[year]
    print("chalk score is: {}".format(chalk_score))
    print('\n')

2014:
d score is: 760.0
chalk score is: 690


2015:
d score is: 850.0
chalk score is: 860


2016:
d score is: 1020.0
chalk score is: 990


2017:
d score is: 840.0
chalk score is: 700


2018:
d score is: 629.9999999999241
chalk score is: 1140


2019:
d score is: 1260.0
chalk score is: 950


2021:
d score is: 980.0
chalk score is: 870


2022:
d score is: 1070.0
chalk score is: 790


