Copy the files **spy.2008.2021.csv.gz** and **spy.csv.gz**, located at https://github.com/crapher/medium/tree/main/13.TrendIntraday/data, to your Colab working folder before running the notebook.

In [1]:
!pip install pygad



In [2]:
import os
import math
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score
from tqdm import tqdm

import numpy as np
import pygad

In [3]:
# Constants
BARS = 15                  # one-minute bars, counted from 09:30, used as the features of each day
RESULT_DIR = './result'    # folder where the prediction CSV files are written
DATE_SPLIT = '2019-06-01'  # rows dated up to this value go to train, later rows to validation
SOLUTIONS = 30             # solutions per population handed to the genetic algorithm
GENERATIONS = 50           # generations the genetic algorithm runs (also the progress-bar total)

In [4]:
# Get the data and generate the train, validation, and test datasets
def get_datasets():
    """Load both price files and split them into train, validation and test frames.

    Reads './spy.2008.2021.csv.gz' and './spy.csv.gz' from the working
    directory, keeps the 'date', 'open' and 'close' columns and parses
    'date' as datetimes.

    Returns:
        (train, validation, test): train holds the rows of the first file
        dated on or before DATE_SPLIT; validation holds the rows of the first
        file dated after DATE_SPLIT and before the earliest date in the
        second file; test is the whole second file.
    """

    wanted_columns = ['date', 'open', 'close']

    def read_prices(path):
        # Load one gzip-compressed CSV and normalise it to the three columns used downstream
        prices = pd.read_csv(path, compression='gzip')
        prices = prices[wanted_columns]
        prices['date'] = pd.to_datetime(prices['date'])
        return prices

    train_val = read_prices('./spy.2008.2021.csv.gz')
    test = read_prices('./spy.csv.gz')

    # Validation must end before the test data starts so the two never overlap
    test_start = test['date'].min()
    is_train = train_val['date'] <= DATE_SPLIT
    is_validation = (train_val['date'] > DATE_SPLIT) & (train_val['date'] < test_start)

    return train_val[is_train], train_val[is_validation], test

In [5]:
# Process Dataframe and return features and targets
def get_features_targets(df, scale_obs=True, bars=None):
    """Build per-day opening-window features and up/down labels.

    For every calendar day in `df`, the close prices of the first `bars`
    one-minute bars starting at 09:30 form one feature row. Missing minutes
    inside that window are forward-filled from the previous close; a day is
    skipped when the window still has gaps afterwards (for instance when the
    09:30 bar itself is missing).

    Args:
        df: DataFrame with a datetime 'date' column and numeric 'open' and
            'close' columns. Rows sharing the same 'date' are averaged.
        scale_obs: when True, each feature row is shifted so its minimum is 0
            and divided by its maximum; a completely flat row becomes zeros.
        bars: number of one-minute bars per day. Defaults to the module-level
            BARS constant when omitted.

    Returns:
        (features, targets): `features` is an array with one row of `bars`
        values per kept day; `targets` holds 1 when the first open of the day
        is below the last close of the day and 0 otherwise, in the same order.

    Side effects:
        Prints the number of feature rows and the number of targets.
    """

    if bars is None:
        bars = BARS

    feature_result = []
    dates = []

    # Remove duplicated dates
    df = df.groupby(by='date').mean().reset_index()

    # Get Features based on the bars configuration. Working in minutes since
    # midnight keeps the filter correct when the window runs past 09:59.
    minute_of_day = df['date'].dt.hour * 60 + df['date'].dt.minute
    window_open = 9 * 60 + 30
    in_window = (minute_of_day >= window_open) & (minute_of_day < window_open + bars)
    features = df[in_window]
    features = features.groupby(features['date'].dt.date)

    for dt, feature in features:

        if len(feature) != bars:
            # Rebuild the full minute grid for this day and fill the holes
            window = pd.date_range(pd.Timestamp(str(dt) + ' 09:30:00'), periods=bars, freq='min')
            feature = feature.set_index('date').reindex(window)
            feature['close'] = feature['close'].ffill()
            feature['open'] = feature['open'].fillna(feature['close'])
            feature = feature.dropna()

        if len(feature) == bars:
            # Work on a private float copy: the array handed out by pandas may be read-only
            feature = feature['close'].to_numpy(dtype=float, copy=True)

            if scale_obs:
                feature = feature - np.min(feature)
                # A flat window divides 0 by 0; the resulting NaNs are mapped to 0 just below
                with np.errstate(divide='ignore', invalid='ignore'):
                    feature = feature / np.max(np.abs(feature))
                feature = np.nan_to_num(feature, nan=0.0, posinf=0.0, neginf=0.0)

            feature_result.append(feature)
            dates.append(dt)

    # Get Targets Trend based on first and last value / day (0: DOWN - 1: UP)
    targets = df.set_index('date')
    targets = targets.resample('1D').agg({'open': 'first', 'close': 'last'})
    # Look the kept days up with Timestamps rather than datetime.date objects
    targets = targets.loc[[pd.Timestamp(day) for day in dates]].sort_index()
    trend = np.where(targets['open'] < targets['close'], 1, 0)

    print(len(feature_result), len(targets))
    return np.array(feature_result), trend

In [6]:
# Show the result of the operation
def show_result(target, pred, ds_type='TEST'):
    """Print a banner, the confusion matrix and the classification report.

    Args:
        target: real labels.
        pred: predicted labels.
        ds_type: dataset name shown, upper-cased, in the banner line.
    """

    y_true = np.array(target)
    y_pred = np.array(pred)

    # Banner padded with '*' to a width of 56 characters
    banner = f' RESULT {ds_type.upper()} '
    print(banner.center(56, '*'))

    print('* Confusion Matrix (Top: Predicted - Left: Real)')
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    print(matrix)

    print('* Classification Report')
    report = classification_report(y_true, y_pred)
    print(report)

In [7]:
# Save the results
def save_result(target, pred, name):
    """Write predictions and targets side by side to a CSV file in RESULT_DIR.

    Args:
        target: real labels, stored in the 'target' column.
        pred: predicted labels, stored in the 'pred' column.
        name: file name created inside RESULT_DIR.
    """

    # Create the output folder on first use; an existing folder is fine
    os.makedirs(RESULT_DIR, exist_ok=True)

    columns = {'pred': pred, 'target': target}
    destination = f'{RESULT_DIR}/{name}'
    pd.DataFrame.from_dict(columns).to_csv(destination, index=False)

In [8]:
# Return the predicted values
def get_predicted_values(solution, features):
    """Turn a weight vector into 0/1 predictions for every feature row.

    Each row of `features` is dotted with `solution`, the score is limited to
    the range [0, 1], and a score strictly above 0.5 becomes 1 (otherwise 0).
    """

    scores = np.dot(features, solution)
    bounded = np.clip(scores, 0, 1)
    is_up = bounded > 0.5
    return np.where(is_up, 1, 0)

In [9]:
# Define fitness function to be used by the PyGAD instance
def fitness_func(self, solution, sol_idx):
    """Score a candidate solution on the training data.

    The score is the F1 of class 1 plus the F1 of class 0, computed on the
    module-level train_x / train_y, so it ranges from 0 to 2.

    Args:
        self: first argument supplied by the caller; not used here.
        solution: weight vector to evaluate.
        sol_idx: third argument supplied by the caller; not used here.
    """

    predictions = get_predicted_values(solution, train_x)

    # One binary F1 per class, label 1 first and label 0 second
    per_class_f1 = [
        f1_score(train_y, predictions, average='binary', pos_label=label)
        for label in (1, 0)
    ]

    return per_class_f1[0] + per_class_f1[1]

In [10]:
# Get the datasets to be used in the tests
# get_datasets() reads both CSV files and returns three DataFrames: train, validation and test.
train, validation, test = get_datasets()

In [11]:
# Get Features and Targets
# Each call prints "<number of feature rows> <number of targets>" for its dataset.
# train_x / train_y are read as globals by fitness_func during training.
train_x, train_y = get_features_targets(train)
val_x, val_y = get_features_targets(validation)
test_x, test_y = get_features_targets(test)

2858 2858
487 487
529 529


In [12]:
# Train Model
# The progress bar has one step per generation and is advanced by the on_generation callback.
with tqdm(total=GENERATIONS) as pbar:

    # Create Genetic Algorithm
    # num_genes equals BARS because a solution is dotted with a feature row of BARS values
    # (see get_predicted_values). gene_space is given as a low/high pair of -1 and 1, and
    # random_seed is fixed at 42.
    ga_instance = pygad.GA(num_generations=GENERATIONS,
                           num_parents_mating=5,
                           fitness_func=fitness_func,
                           sol_per_pop=SOLUTIONS,
                           num_genes=BARS,
                           gene_space={'low': -1, 'high': 1},
                           random_seed=42,
                           on_generation=lambda _: pbar.update(1),
                           )

    # Run the Genetic Algorithm
    ga_instance.run()

100%|██████████| 50/50 [00:28<00:00,  1.78it/s]


In [13]:
# Set the best weights
# best_solution() returns three values; only the first is kept and used below as the weight vector.
solution, _, _ = ga_instance.best_solution()

In [14]:
# Predict and save train values
# Writes the 'pred' and 'target' columns to RESULT_DIR/ga.train.csv.gz.
pred_y = get_predicted_values(solution, train_x)
save_result(train_y, pred_y, 'ga.train.csv.gz')

In [15]:
# Predict and save validation values
# Writes the 'pred' and 'target' columns to RESULT_DIR/ga.val.csv.gz; pred_y is overwritten here.
pred_y = get_predicted_values(solution, val_x)
save_result(val_y, pred_y, 'ga.val.csv.gz')

In [16]:
# Predict, show and save test values
# Prints the confusion matrix and classification report for the test set, then writes the
# 'pred' and 'target' columns to RESULT_DIR/ga.test.csv.gz; pred_y is overwritten here.
pred_y = get_predicted_values(solution, test_x)
show_result(test_y, pred_y)
save_result(test_y, pred_y, 'ga.test.csv.gz')

********************* RESULT TEST **********************
* Confusion Matrix (Top: Predicted - Left: Real)
[[149 100]
 [ 97 183]]
* Classification Report
              precision    recall  f1-score   support

           0       0.61      0.60      0.60       249
           1       0.65      0.65      0.65       280

    accuracy                           0.63       529
   macro avg       0.63      0.63      0.63       529
weighted avg       0.63      0.63      0.63       529

