In [None]:
import cv2
import pandas as pd
import re
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from keras.models import load_model

In [None]:
# Experiment grid shared by the cells below
TIMEFRAMES = [7, 14, 30]                  # chart timeframes in days
PREDICTIONS = [1, 5]                      # prediction horizons in days ahead
IMG_TYPES = ['MPA', 'CPA', 'MLC', 'API']  # image type folders

# Mode switch: False evaluates the test set (default),
# True evaluates the transfer-learning set instead.
TRANSFER = False

# Resolve the label file and the ETF to analyse from the mode switch
if not TRANSFER:
    labels_csv, ETF = 'labels/test_labels.csv', 'MSCIWorld'
else:
    # Transfer target; the alternatives are 'Silver' and 'Treasury'
    labels_csv, ETF = 'labels/transfer_labels.csv', 'SmallCap'

labels = pd.read_csv(labels_csv)

In [None]:
# Keep only the rows whose image path mentions the selected ETF.
# - regex=False matches the ETF name literally.
# - .copy() makes `labels` an independent frame, so the columns added to it in
#   later cells do not trigger pandas' SettingWithCopyWarning.
labels = labels[labels['Image'].str.contains(ETF, regex=False)].copy()

In [None]:
# Crop the images using the bounding boxes
def crop_image(img_path):
    """Load an image in grayscale and crop it to the box of its timeframe.

    The timeframe is the number directly in front of the trailing ``.png`` of
    ``img_path`` (e.g. ``..._14.png`` -> 14) and selects a fixed bounding box.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        The cropped grayscale image.

    Raises
    ------
    ValueError
        If the image cannot be read, if the filename does not end in
        ``<number>.png``, or if the number has no bounding box defined.
    """
    # timeframe -> (first row, height, first column, width) of the crop
    crop_boxes = {
        7: (105, 115, 80, 38),
        14: (100, 120, 80, 85),
        30: (100, 120, 80, 132),
        90: (100, 120, 80, 226),
        180: (100, 120, 80, 414),
    }

    # Load the image in grayscale
    img = cv2.imread(img_path, 0)

    # cv2.imread signals an unreadable file by returning None
    if img is None:
        raise ValueError(f"Image at {img_path} not found. Please check the path.")

    # Match the digits that directly precede ".png" at the end of the filename
    match = re.search(r'(\d+)(?=\.png$)', img_path)
    if match is None:
        raise ValueError(f"No timeframe number found before '.png' in {img_path}.")

    number = int(match.group(1))
    if number not in crop_boxes:
        raise ValueError(f"Unsupported timeframe {number} in {img_path}.")

    top, height, left, width = crop_boxes[number]
    return img[top:top+height, left:left+width]

# Crop every image listed in the labels and save the result next to the
# original file with a '_cropped' suffix.
filenames = labels['Image'].values.tolist()

for name in filenames:
    try:
        cropped_image = crop_image(name)
    except ValueError as e:
        # Report the problem and carry on with the remaining images
        print(e)
        continue

    # Construct the new path for the cropped image
    new_path = name.replace('.png', '_cropped.png')
    # cv2.imwrite reports a failed write through its boolean return value,
    # so check it instead of assuming the file was saved.
    if not cv2.imwrite(new_path, cropped_image):
        print(f"Could not write cropped image to {new_path}.")


In [None]:
# Point the 'Image' column at the cropped files ('x.png' -> 'x_cropped.png').
# regex=False makes '.png' a literal match; under regex matching the '.' would
# match any character.
labels['Image'] = labels['Image'].str.replace('.png', '_cropped.png', regex=False)

In [None]:
# Read an image file as a grayscale array
def load_image(image_path):
    """Return the image at ``image_path`` read in grayscale.

    ``image_path`` is converted with ``str()`` before it is handed to OpenCV.
    Raises ``ValueError`` when OpenCV returns ``None`` for the path.
    """
    gray = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        return gray
    raise ValueError(f"Unable to load image at path: {image_path}")

filenames = labels['Image'].values.tolist()

images = []
loaded = []  # one flag per row of `labels`: True when its image was read

for name in filenames:
    try:
        img = load_image(name)
        images.append(img)
        loaded.append(True)
    except ValueError as e:
        print(e)
        loaded.append(False)

# Drop the rows whose image could not be loaded. Without this, a single failed
# load leaves `images` shorter than `labels` and the assignment below fails
# with a length mismatch.
labels = labels.loc[np.array(loaded, dtype=bool)].copy()

# Add a new column to the labels DataFrame to store the image arrays
labels['Image_Array'] = images

In [None]:
# Class balance: how many rows carry each value of 'Label' (1s and 0s)
label_counts = labels['Label'].value_counts()
print(label_counts)

In [None]:
# Parse the YYYY-MM-DD date embedded in each image path, then order the rows by it
date_strings = labels['Image'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
labels['Date'] = pd.to_datetime(date_strings)
labels = labels.sort_values(by='Date')
print(labels.head())

In [None]:
def get_investment_return(y_pred_binary, lastPrice, futurePrice):
    """Average per-trade rate of return, split into long and short trades.

    Every prediction is treated as an independent trade of 100:
    a prediction equal to 1 is a long trade with return
    ``(future - last) / last``; any other value is a short trade with return
    ``(last - future) / last``.

    Parameters
    ----------
    y_pred_binary : array-like with ``.tolist()``
        One prediction per trade.
    lastPrice, futurePrice : indexable by position
        Entry and exit price of each trade.

    Returns
    -------
    tuple
        ``(long_avg_ror, long_investment, short_avg_ror, short_investment)``.
        An average is 0 when there was no trade on that side.
    """
    stake_per_trade = 100
    signals = y_pred_binary.tolist()

    # Per-trade rates of return, collected separately for each side
    long_ror, short_ror = [], []
    for idx, signal in enumerate(signals):
        entry, exit_price = lastPrice[idx], futurePrice[idx]
        if signal == 1:
            long_ror.append((exit_price - entry)/entry)
        else:
            short_ror.append((entry - exit_price)/entry)

    # Each trade commits the same stake, so the invested amount follows from the count
    long_investment = stake_per_trade * len(long_ror)
    short_investment = stake_per_trade * len(short_ror)

    long_avg_ror = np.mean(long_ror) if long_investment > 0 else 0
    short_avg_ror = np.mean(short_ror) if short_investment > 0 else 0

    return long_avg_ror, long_investment, short_avg_ror, short_investment

In [None]:
def get_actual_returns(y_pred_binary, lastPrice, futurePrice, prediction):
    """Compounded value of the trades, tracked in ``prediction`` rotating slots.

    Trades are assigned to slots 0 .. prediction-1 in round-robin order. Each
    trade adds 100 to its slot and multiplies the slot by the trade's gross
    return (``1 + (future - last) / last`` for a prediction equal to 1,
    ``1 + (last - future) / last`` otherwise). Long and short trades update
    their own book; every trade also updates the combined book.

    Parameters
    ----------
    y_pred_binary : array-like with ``.tolist()``
        One prediction per trade.
    lastPrice, futurePrice : indexable by position
        Entry and exit price of each trade.
    prediction : int
        Number of slots.

    Returns
    -------
    tuple
        ``(long_returns, short_returns, both_returns)``: the sum over the
        slots of the long, short and combined book.
    """
    signals = y_pred_binary.tolist()

    # One independent list of slots per book
    long_investment, short_investment, both_investment = (
        [0] * prediction for _ in range(3)
    )

    slot = 0
    for idx, signal in enumerate(signals):
        # Wrap around once every slot has been used
        if slot == prediction:
            slot = 0

        entry, exit_price = lastPrice[idx], futurePrice[idx]
        if signal == 1:
            growth = ((exit_price - entry)/entry) + 1
            book = long_investment
        else:
            growth = ((entry - exit_price)/entry) + 1
            book = short_investment

        book[slot] = (book[slot]+100)*(growth)
        both_investment[slot] = (both_investment[slot]+100)*(growth)
        slot += 1

    return np.sum(long_investment), np.sum(short_investment), np.sum(both_investment)

In [None]:
# Get baseline RoR for each image type, timeframe, and prediction.
# Baseline: buy 100 at every row's LastPrice and sell everything at the last
# FuturePrice of the filtered data.

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the frame on every call.
ror_columns = ['Image_Type', 'Timeframe', 'Prediction', 'Investment', 'Return', 'RoR']
ror_records = []

for img_type in IMG_TYPES:
    for timeframe in TIMEFRAMES:
        for prediction in PREDICTIONS:
            if prediction > timeframe:
                continue

            # Filter your data based on prediction, img_type, and timeframe
            data = labels[(labels['TimePrediction'] == prediction) &
                          (labels['Image'].str.contains(f'/{img_type}/')) &
                          (labels['Image'].str.contains(f'_{timeframe}_'))]
            data = data.reset_index(drop=True)

            # Nothing to sell if the filter matched no rows (.iloc[-1] would raise)
            if data.empty:
                print(f"No data for {img_type} images with {timeframe} days timeframe "
                      f"and {prediction} days prediction; skipping baseline.")
                continue

            lastPrice = data['LastPrice']
            # Future price is the last price in the dataset at which all ETFs will be sold
            futurePrice = data['FuturePrice'].iloc[-1]

            # Calculate Rate of Return RoR per trade
            ror = np.array((futurePrice - lastPrice)/lastPrice)

            # 100 invested per trade
            investment = len(ror)*100

            # Value of all trades at the final sale
            returns = np.sum((1+ror)*100)

            # Value returned per unit invested (1.0 means break-even)
            total_ror = returns/investment

            ror_records.append({
                'Image_Type': img_type,
                'Timeframe': timeframe,
                'Prediction': prediction,
                'Investment': investment,
                'Return': returns,
                'RoR': total_ror
            })

ror_df = pd.DataFrame(ror_records, columns=ror_columns)

# Save the DataFrame to a CSV file
ror_df.to_csv(f'baseline/{ETF}_baseline_ror.csv', index=False)

In [None]:
# Separate the data into 12 windows and evaluate the model on each window.
# The model predicts window 0 as loaded; before predicting every later window
# it is first fitted on the previous window.

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the frame on every call.
evaluation_columns = ['Image_Type', 'Timeframe', 'Prediction', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Hit_Rate',
                      'Long_Average_RoR', 'Long_Investment', 'Long_Returns', 'Long_RoR',
                      'Short_Average_RoR', 'Short_Investment', 'Short_Returns', 'Short_RoR',
                      'Total_Investment', 'Total_Returns', 'Total_RoR']
evaluation_records = []

TIMEFRAMES = [7, 14, 30]
PREDICTIONS = [1, 5]
IMG_TYPES = ['MPA', 'CPA', 'MLC', 'API']

N_WINDOWS = 12

for img_type in IMG_TYPES:
    for timeframe in TIMEFRAMES:
        for prediction in PREDICTIONS:
            if prediction > timeframe:
                continue
            print(f"Evaluating model predicting {prediction} days ahead using {img_type} images with {timeframe} days timeframe.")

            # Filter the data
            data = labels[(labels['TimePrediction'] == prediction) &
                          (labels['Image'].str.contains(f'/{img_type}/')) &
                          (labels['Image'].str.contains(f'_{timeframe}_'))]
            data = data.reset_index(drop=True)

            # With fewer rows than windows some windows would be empty and
            # predict/fit would fail on them
            if len(data) < N_WINDOWS:
                print(f"Only {len(data)} rows available, need at least {N_WINDOWS}; skipping.")
                continue

            # Load the model (the MSCIWorld models are used for every ETF)
            model_filename = f"models/MSCIWorld_{img_type}_{timeframe}_{prediction}.h5"
            model = load_model(model_filename)

            # Scale pixel values to [0, 1]
            X = np.array(data['Image_Array'].tolist()) / 255.0
            lastPrice = data['LastPrice'].tolist()
            futurePrice = data['FuturePrice'].tolist()
            y = data['Label'].values

            # Initialize y_pred_binary as an empty array
            y_pred_binary = np.array([], dtype=int)

            # Split the rows into N_WINDOWS consecutive windows
            window_size = len(X)/N_WINDOWS
            print(int(window_size))

            for window in range(N_WINDOWS):
                # From the second window on: fit on the previous window first
                if window > 0:
                    model.fit(window_X, window_y, epochs=10, batch_size=32, validation_split=0.2)

                start = int(window_size*window)
                # The last window takes all remaining rows
                stop = len(X) if window == N_WINDOWS - 1 else int(window_size*(window+1))
                window_X = X[start:stop]
                window_y = y[start:stop]

                # Predict the current window
                y_pred_window = model.predict(window_X)
                # Convert predictions to binary: if > 0.5 then 1 else 0
                y_pred_binary_window = np.where(y_pred_window > 0.5, 1, 0).flatten()
                y_pred_binary = np.concatenate([y_pred_binary, y_pred_binary_window])

            accuracy = accuracy_score(y, y_pred_binary)
            precision = precision_score(y, y_pred_binary)
            recall = recall_score(y, y_pred_binary)
            f1_score = fbeta_score(y, y_pred_binary, beta=1)

            y_test_array = y.ravel()  # Flatten the labels to 1D
            correct_predictions = np.sum(y_pred_binary.ravel() == y_test_array)
            hit_rate = correct_predictions / len(y_test_array)

            long_avg_ror, long_investment, short_avg_ror, short_investment = get_investment_return(y_pred_binary, lastPrice, futurePrice)
            # Total investment
            total_investment = long_investment + short_investment

            long_returns, short_returns, both_returns = get_actual_returns(y_pred_binary, lastPrice, futurePrice, prediction)

            # A side without any trade has no rate of return
            long_ror = long_returns/long_investment if long_investment else np.nan
            short_ror = short_returns/short_investment if short_investment else np.nan

            # Add the evaluation metrics to the records
            evaluation_records.append({
                'Image_Type': img_type,
                'Timeframe': timeframe,
                'Prediction': prediction,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1_Score': f1_score,
                'Hit_Rate': hit_rate,
                'Long_Average_RoR': long_avg_ror,
                'Long_Investment': long_investment,
                'Long_Returns' : long_returns,
                'Long_RoR': long_ror,
                'Short_Average_RoR': short_avg_ror,
                'Short_Investment': short_investment,
                'Short_Returns' : short_returns,
                'Short_RoR': short_ror,
                'Total_Investment': total_investment,
                'Total_Returns': both_returns,
                'Total_RoR': both_returns/total_investment
            })

evaluation_df = pd.DataFrame(evaluation_records, columns=evaluation_columns)

# Save the evaluation DataFrame to a CSV file
if TRANSFER:
    evaluation_df.to_csv(f'evaluation/separate/{ETF}_rolling_transfer_evaluation_scores.csv', index=False)
    print(f"Evaluation scores saved to 'evaluation/separate/{ETF}_rolling_transfer_evaluation_scores.csv'.")
else:
    evaluation_df.to_csv('evaluation/separate/MSCIWorld_rolling_test_evaluation_scores.csv', index=False)
    print("Evaluation scores saved to 'evaluation/separate/MSCIWorld_rolling_test_evaluation_scores.csv'.")

In [None]:
# Overlay our three new images and evaluate the combined models on the newly
# created dataset using rolling windows.
# The model predicts window 0 as loaded; before predicting every later window
# it is first fitted on the previous window.

# Rows are collected in lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the frame on every call.
evaluation_columns = ['Timeframe', 'Prediction', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Hit_Rate',
                      'Long_Average_RoR', 'Long_Investment', 'Long_Returns', 'Long_RoR',
                      'Short_Average_RoR', 'Short_Investment', 'Short_Returns', 'Short_RoR',
                      'Total_Investment', 'Total_Returns', 'Total_RoR']
evaluation_records = []

TIMEFRAMES = [7, 14, 30]
PREDICTIONS = [1, 5]

# The three image types that are blended into one overlay
IMG_TYPES = ['CPA', 'MLC', 'API']

N_WINDOWS = 12

for timeframe in TIMEFRAMES:
    for prediction in PREDICTIONS:
        if prediction > timeframe:
            continue
        print(f"Evaluating model predicting {prediction} days ahead using overelayed images with {timeframe} days timeframe.")

        # Filter the data
        data = labels[(labels['TimePrediction'] == prediction) &
                        (labels['Image'].str.contains(f'_{timeframe}_'))]
        data = data.reset_index(drop=True)

        # Build df with overlayed images, one row per date
        overlay_records = []

        for date in data['Date'].unique():
            try:
                # Rows of this date
                data_date = data[(data['Date'] == date)]
                # Pick the image of each type; .values[0] raises IndexError
                # when a type is missing for this date
                img1, img2, img3 = (
                    data_date[data_date['Image'].str.contains(f'/{img_type}/')]['Image_Array'].values[0]
                    for img_type in IMG_TYPES
                )
                # Overlay the images.
                # NOTE(review): two successive 50/50 blends weight the images
                # 0.25 / 0.25 / 0.5 rather than equally. Kept as-is because the
                # combined models loaded below were presumably trained on
                # images blended this way - confirm before changing.
                img = cv2.addWeighted(img1, 0.5, img2, 0.5, 0)
                img = cv2.addWeighted(img, 0.5, img3, 0.5, 0)

                # Save image to file
                cv2.imwrite(f'images/overlayed/{ETF}_{date}_{prediction}_{timeframe}.png', img)

                overlay_records.append({
                    'Image_Array': img,
                    'LastPrice': data_date['LastPrice'].values[0],
                    'FuturePrice': data_date['FuturePrice'].values[0],
                    'Label': data_date['Label'].values[0]
                })
            except Exception as exc:
                # Best effort: a date that cannot be overlaid is skipped, but
                # reported instead of being dropped silently
                print(f"Skipping {date}: {exc!r}")

        overlayed_df = pd.DataFrame(overlay_records, columns=['Image_Array', 'LastPrice', 'FuturePrice', 'Label'])

        # With fewer rows than windows some windows would be empty and
        # predict/fit would fail on them
        if len(overlayed_df) < N_WINDOWS:
            print(f"Only {len(overlayed_df)} overlayed images available, need at least {N_WINDOWS}; skipping.")
            continue

        # Load the model (the MSCIWorld models are used for every ETF)
        model_filename = f"models/MSCIWorld_combined_{timeframe}_{prediction}.h5"
        model = load_model(model_filename)

        # Scale pixel values to [0, 1]
        X = np.array(overlayed_df['Image_Array'].tolist()) / 255.0
        lastPrice = overlayed_df['LastPrice'].tolist()
        futurePrice = overlayed_df['FuturePrice'].tolist()
        y = np.array(overlayed_df['Label'].values, dtype=np.int32)

        # Initialize y_pred_binary as an empty array
        y_pred_binary = np.array([], dtype=int)

        # Split the rows into N_WINDOWS consecutive windows
        window_size = len(X)/N_WINDOWS
        print(int(window_size))

        for window in range(N_WINDOWS):
            # From the second window on: fit on the previous window first
            if window > 0:
                model.fit(window_X, window_y, epochs=10, batch_size=32)

            start = int(window_size*window)
            # The last window takes all remaining rows
            stop = len(X) if window == N_WINDOWS - 1 else int(window_size*(window+1))
            window_X = X[start:stop]
            window_y = y[start:stop]

            # Predict the current window
            y_pred_window = model.predict(window_X)
            # Convert predictions to binary: if > 0.5 then 1 else 0
            y_pred_binary_window = np.where(y_pred_window > 0.5, 1, 0).flatten()
            y_pred_binary = np.concatenate([y_pred_binary, y_pred_binary_window])

        accuracy = accuracy_score(y, y_pred_binary)
        precision = precision_score(y, y_pred_binary)
        recall = recall_score(y, y_pred_binary)
        f1_score = fbeta_score(y, y_pred_binary, beta=1)

        y_test_array = y.ravel()  # Flatten the labels to 1D
        correct_predictions = np.sum(y_pred_binary.ravel() == y_test_array)
        hit_rate = correct_predictions / len(y_test_array)

        long_avg_ror, long_investment, short_avg_ror, short_investment = get_investment_return(y_pred_binary, lastPrice, futurePrice)
        # Total investment
        total_investment = long_investment + short_investment

        long_returns, short_returns, both_returns = get_actual_returns(y_pred_binary, lastPrice, futurePrice, prediction)

        # A side without any trade has no rate of return
        long_ror = long_returns/long_investment if long_investment else np.nan
        short_ror = short_returns/short_investment if short_investment else np.nan

        # Add the evaluation metrics to the records
        evaluation_records.append({
            'Timeframe': timeframe,
            'Prediction': prediction,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1_Score': f1_score,
            'Hit_Rate': hit_rate,
            'Long_Average_RoR': long_avg_ror,
            'Long_Investment': long_investment,
            'Long_Returns' : long_returns,
            'Long_RoR': long_ror,
            'Short_Average_RoR': short_avg_ror,
            'Short_Investment': short_investment,
            'Short_Returns' : short_returns,
            'Short_RoR': short_ror,
            'Total_Investment': total_investment,
            'Total_Returns': both_returns,
            'Total_RoR': both_returns/total_investment
        })

evaluation_df = pd.DataFrame(evaluation_records, columns=evaluation_columns)

# Save the evaluation DataFrame to a CSV file
if TRANSFER:
    evaluation_df.to_csv(f'evaluation/overlayed/{ETF}_rolling_transfer_evaluation_scores.csv', index=False)
    print(f"Evaluation scores saved to 'evaluation/overlayed/{ETF}_rolling_transfer_evaluation_scores.csv'.")
else:
    evaluation_df.to_csv('evaluation/overlayed/MSCIWorld_rolling_test_evaluation_scores.csv', index=False)
    print("Evaluation scores saved to 'evaluation/overlayed/MSCIWorld_rolling_test_evaluation_scores.csv'.")