In [6]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint

from algorithms.gp.gplearn import GpLearnConfig, GpLearnRegressor

In [7]:
# Build the GP configuration, overriding only `verbose`; every other field
# keeps whatever default GpLearnConfig defines.
# NOTE(review): verbose=0 presumably silences the per-generation progress log — confirm.
config = GpLearnConfig(verbose=0)
# Pretty-print the resolved configuration so the hyperparameters used for this
# run are recorded in the cell output.
pprint(config)

GpLearnConfig(population_size=5000,
              generations=20,
              stopping_criteria=0.01,
              p_crossover=0.7,
              p_subtree_mutation=0.1,
              p_hoist_mutation=0.1,
              p_point_mutation=0.05,
              max_samples=0.9,
              verbose=0,
              parsimony_coefficient=0.01,
              function_set=['add',
                            'sub',
                            'mul',
                            'div',
                            'sqrt',
                            'log',
                            'neg',
                            'inv',
                            'sin',
                            'cos',
                            'tan'],
              random_state=42)


In [8]:
# Load the training index; the first CSV column becomes the DataFrame index.
train_df = pd.read_csv("dataset/train_df.csv", index_col=0)

# Draw exactly one row per distinct `number`.
# Each group is sampled with its own freshly seeded generator (random_state=42),
# which selects the same rows as the former
# `groupby('number').apply(lambda x: x.sample(1, random_state=42))`.
# Iterating the groups explicitly avoids calling `apply` on the grouping column,
# which pandas >= 2.2 flags with a DeprecationWarning.
sampled_df = pd.concat(
    [group.sample(1, random_state=42) for _, group in train_df.groupby('number')]
).reset_index(drop=True)

# Location of each sample's array relative to the dataset root:
# <filename>/<data_num>.npy
sampled_df["path"] = sampled_df.apply(
    lambda row: os.path.join(row["filename"], f"{row['data_num']}.npy"),
    axis=1,
)
sampled_df.head()

  sampled_df = train_df.groupby('number').apply(lambda x: x.sample(1, random_state=42)).reset_index(drop=True)


Unnamed: 0,filename,data_num,number,path
0,I.6.2a,83,1,I.6.2a/83.npy
1,I.6.2,83,2,I.6.2/83.npy
2,I.6.2b,83,3,I.6.2b/83.npy
3,I.8.14,83,4,I.8.14/83.npy
4,I.9.18,83,5,I.9.18/83.npy


In [9]:
# Fit one model per sampled row. `models` is appended to once per iteration,
# so it stays aligned with the row order of `sampled_df`; a fit that raises
# is stored as None.
models = []

for path in tqdm(sampled_df["path"], total=len(sampled_df)):
    data = np.load(os.path.join("dataset", path))
    # Every column except the last is passed as input, the last one as target.
    X = data[:, :-1]
    y = data[:, -1]

    try:
        regressor = GpLearnRegressor(config)
        model = regressor.predict_single(X, y)
    except Exception as exc:
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop,
        # and report which sample failed instead of hiding the error.
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"[gplearn] fit failed for {path}: {exc!r}")
        model = None

    models.append(model)


100%|██████████| 100/100 [2:11:18<00:00, 78.79s/it] 


In [12]:
# Show the fitted models in the order the rows of `sampled_df` were processed;
# an entry is None when fitting that sample raised an exception.
models

[0.0500000000000000,
 0.112000000000000,
 0.354/X0,
 (X0 - X1)**0.5 + (-X2 + X3)**0.5,
 1/(X5 + 0.371),
 X0 + 0.08,
 X0*X2 + X1*X4 + X3*X5,
 X0*X1,
 2*X2 + 1.43322017675038*(X0**2*X1)**0.5*(X2 + X3) + log(X1 - X2) + log(cos(X1**0.5)) + cos(X2) - 2*tan(sin(X3)) - 1.11730529287029*I,
 0.0310000000000000,
 tan(0.104/(X1*X2)),
 X0*X1,
 X0*X1 + X0*X2*X3*sin(X4),
 X0*X1*X4*(X2 - X3)/(X2*X3),
 X0*X1*X2,
 2.04081632653061*X0*X1*log(X1) - X0*X1 + X0/X1 + log(X0) + cos(log(X1)**0.5),
 X0 - X3 + cos(X1) - 0.658,
 X3,
 X0*X1 + 1/X2,
 X0,
 (X2*X3)**0.5,
 X0*X1*sin(X2),
 X0*X1*X2*sin(X3),
 2.09168530845762*X0*X3*(-X1*X3*(X2 - 0.175) + X2 + sin((-X2*X3*(X2 - 0.175) + X2 + cos(X2))**0.5))**0.5,
 X0/X1,
 sin(X1)*tan(sin(X0)),
 X1/(X2 + X1/X0),
 X0/X1,
 1.76991150442478*((X1 + tan(log(0.648640115686581*X0)))*(-X2 + X3))**0.5,
 2*X0/X1,
 tan(X0/(X1*X2)),
 1/(X2*X3),
 X3**0.5*(X4 + sin(X5))*(X0*tan((X2 + X4 - X5**0.5)**0.5) + sin(X5))*log(X3*X4),
 X0*X1*X2/X3,
 X2 + (X1*X2 + X1**2*(X2 + X1*(X2 + X1**2*X2/

In [13]:
import pickle

# Bundle the sampled rows, the per-row fitted models and the configuration
# that produced them into one object so the run can be reloaded together.
data = dict(df=sampled_df, models=models, config=config)

# Serialize with the newest pickle protocol this interpreter supports.
with open('gplearn_test.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)