<h1 style='font-size:70px'>
    Evolutionary dataset optimisation
</h1>
<br>
<font size='6'>
    Henry Wilde | 
</font><font size='4'>
    <i class='fa fa-github' aria-hidden='false' size='4'></i>
    <i class='fa fa-twitter' aria-hidden='false'></i>
</font><font size='6'>
    @daffidwilde
</font>

# Optimising a simple function

Suppose we want to find the global optimum of $f(x)=x^2$.

# Formulation

In [1]:
import edo

from edo.pdfs import Normal

In [2]:
def x_squared(df):
    """Fitness function: the square of a dataset's top-left entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to score. Only the value in the first row and first
        column is read; any other entries are ignored.

    Returns
    -------
    The square of that value.
    """
    first_entry = df.iloc[0, 0]
    return first_entry ** 2

In [3]:
# Set the mean/std limits on the imported `Normal` family before the run.
# The attributes are assigned on `Normal` itself, so they stay in effect for
# the rest of the kernel session until they are reassigned.
# Presumably these bound the parameters edo samples for each Normal column;
# confirm against the edo documentation.
# NOTE(review): the saved `ind.metadata` output further down reports
# mean ~ 1.075 and std ~ 8.46, both outside these limits. Either that output
# is stale or these attributes are not what the installed edo reads. Confirm
# with Restart Kernel -> Run All.
Normal.mean_limits = [-1, 1]
Normal.std_limits = [0, 1]

In [4]:
# Run edo's algorithm with `x_squared` as the fitness function and a fixed
# seed. The call returns two objects, unpacked here as the population history
# and the fitness history.
# NOTE(review): row_limits/col_limits of [1, 1] presumably pin every dataset
# to a single row and a single column, which matches `x_squared` reading only
# `df.iloc[0, 0]`. Confirm against the edo documentation.
pop_history, fit_history = edo.run_algorithm(
    fitness=x_squared,
    size=50,
    row_limits=[1, 1],
    col_limits=[1, 1],
    families=[Normal],
    max_iter=10,
    mutation_prob=0.01,
    seed=0
)

# Looking at the results

In [5]:
import numpy as np

In [6]:
# Position of the smallest fitness score among the rows recorded for
# generation 10 (the value passed as `max_iter` in the run above).
best = np.argmin(
    fit_history.loc[fit_history["generation"] == 10, "fitness"].values
)

# The individual at that position in the last recorded population.
ind = pop_history[-1][best]

In [7]:
# Display the dataset held by the selected individual.
ind.dataframe

Unnamed: 0,0
0,0.043709


In [8]:
# Display the metadata stored alongside the selected individual's dataset.
ind.metadata

[{'mean': 1.0753214741215302,
  'std': 8.45983873461537,
  'name': 'Normal',
  'subtype_id': 1}]

In [9]:
# DISABLED CELL: every line below is commented out, so nothing runs here.
# When enabled, the code draws a 2x3 grid (generations 0-5) showing f(x) = x^2
# as a curve with each individual's value and fitness scattered on top, and
# saves the figure to 'my_plot.svg'. That file is the image embedded in the
# markdown cell that follows.
# To regenerate the figure, uncomment the cell and run it after the cells that
# define `np`, `pop_history` and `fit_history`.

# import matplotlib.pyplot as plt

# fig, (top, bottom) = plt.subplots(
#     nrows=2, ncols=3, figsize=(45, 30), dpi=300, sharex=True, sharey=True
# )

# fontsize = 36
# xs = np.linspace(-10, 10, 100)
# ys = [x ** 2 for x in xs]

# for i in range(6):

#     if i < 3:
#         axes = top
#     else:
#         axes = bottom

#     j = i % 3
#     data = [
#         [ind.dataframe.iloc[0, 0] for ind in pop_history[i]],
#         fit_history[fit_history["generation"] == i]["fitness"].values
#     ]

#     axes[j].plot(xs, ys, lw=3, zorder=-1)
#     axes[j].scatter(*data, s=200, color="orange")
#     axes[j].set_xlim(-5, 5)
#     axes[j].set_ylim(-1, 30)

#     axes[j].set_title(f"Fitness scores in epoch {i}", size=fontsize, pad=25)
#     if i in [3, 4, 5]:
#         axes[j].set_xlabel(r"$x$", size=fontsize)
#     if i in [0, 3]:
#         axes[j].set_ylabel(r"$f(x) = x^2$", size=fontsize)

#     for label in axes[j].get_xticklabels() + axes[j].get_yticklabels():
#         label.set_fontsize(fontsize * .8)

# plt.tight_layout(pad=5)
# plt.savefig('my_plot.svg', format='svg')

# Visualising the results

![](my_plot.svg)

# Why though?

# Something a little harder

Consider the task of finding two perfectly correlated, equally sized sets of numbers: one continuous, one discrete.

# Formulation

In [10]:
from scipy.stats import linregress
from edo.pdfs import Poisson, Normal

In [11]:
# Reassign the limits on `Normal` for the second example, replacing the
# narrower values set for the x^2 run earlier in the notebook.
# No limits are set on `Poisson` here, so it keeps whatever values edo ships
# with.
Normal.mean_limits = [-10, 10]
Normal.std_limits = [0, 10]

In [12]:
def r_squared(df):
    """Fitness function: squared correlation between the first two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to score. Its first column is passed to
        ``scipy.stats.linregress`` as ``x`` and its second column as ``y``.

    Returns
    -------
    The square of the correlation coefficient reported by the regression.
    """
    first_column = df.iloc[:, 0].values
    second_column = df.iloc[:, 1].values

    regression = linregress(first_column, second_column)
    return regression.rvalue ** 2

In [13]:
# Second run: `r_squared` is the fitness function and `maximise=True` is
# passed, unlike the x^2 run above.
# The results are unpacked into the same names as the first run, so
# `pop_history` and `fit_history` now refer to this run only.
# NOTE(review): col_limits=[(1, 1), (1, 1)] with families=[Normal, Poisson]
# presumably forces exactly one column from each family, giving the
# two-column frame that `r_squared` indexes. Confirm against the edo
# documentation.
pop_history, fit_history = edo.run_algorithm(
    fitness=r_squared,
    size=100,
    row_limits=[10, 50],
    col_limits=[(1, 1), (1, 1)],
    families=[Normal, Poisson],
    max_iter=50,
    maximise=True,
    seed=0
)

# The best individual

In [14]:
# DISABLED CELL: every line below is commented out, so nothing runs here.
# When enabled, the code picks the individual with the highest fitness in
# generation 50, scatters its integer-typed column against its float-typed
# column, annotates the plot with the best r^2 value, and saves the figure to
# 'my_linregress_ind.svg'. That file is the image embedded just below.
# NOTE(review): `ax.annotate(s=...)` uses the keyword `s`. Newer matplotlib
# releases name this parameter `text`, so check the installed version before
# re-enabling.

# import matplotlib.pyplot as plt

# final_fit = fit_history[fit_history["generation"] == 50]["fitness"]

# best = np.argmax(final_fit.values)
# ind = pop_history[-1][best]
# df = ind.dataframe

# fig, ax = plt.subplots(1, figsize=(12, 8), dpi=300)

# ax.scatter(df.select_dtypes('int'), df.select_dtypes('float'))
# ax.annotate(
#     s=r'$r^2$' + f' = {np.round(np.max(final_fit.values), 4)}',
#     xy=[4, 1],
#     fontsize=20,
#     bbox=dict(boxstyle='round', fc='0.9')
# )

# ax.set_xlabel('Discrete column')
# ax.set_ylabel('Continuous column')

# plt.savefig('my_linregress_ind.svg', format='svg')

![](my_linregress_ind.svg)

In [15]:
# DISABLED CELL: every line below is commented out, so nothing runs here.
# When enabled, the code draws a boxplot of the fitness scores in
# `fit_history` and saves it to 'my_linregress_plot.png'. That file is the
# image embedded in the "Fitness progression" section below.
# NOTE(review): the run above uses size=100 and max_iter=50, and
# `reshape(100, 51).T` produces a 51 x 100 array, i.e. 100 boxes of 51 values
# each. If `fit_history` is ordered generation by generation, one box per
# epoch would need `reshape(51, 100).T` instead. Check the row order first.
# NOTE(review): `range(1, 101, 5)` gives 20 tick positions but
# `range(0, 101, 5)` gives 21 labels. Recent matplotlib versions may reject
# mismatched counts, so check before re-enabling.

# fig, ax = plt.subplots(figsize=(24, 12), dpi=300)

# ax.boxplot(
#     fit_history["fitness"].values.reshape(100, 51).T,
#     sym=".",
#     showmeans=True
# )

# ax.set_xlabel("Epoch")
# ax.set_ylabel("Fitness")

# ax.set_xticks(range(1, 101, 5))
# ax.set_xticklabels(range(0, 101, 5))

# plt.savefig("my_linregress_plot.png", transparent=True);

# Fitness progression

![](my_linregress_plot.png)

# What is `edo`?

- artificial data generation framework

- a genetic algorithm

# What is it for?

- creating banks of useful datasets

- understanding what makes a dataset "good" for a particular algorithm

Docs: [edo.readthedocs.io](https://edo.readthedocs.io/en/latest/)
<br>
Source: [github.com/daffidwilde/edo](https://github.com/daffidwilde/edo)

# What is a genetic algorithm?
<p align='center'><img width='700px' src='flowchart.svg'/></p>