In [3]:
import cv2
import pandas as pd
import re
import numpy as np
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Conv2D, LeakyReLU, MaxPooling2D, Flatten, Dense



In [4]:
TIMEFRAMES = [30, 90, 180, 365]
PREDICTIONS = [5, 30, 90]
IMG_TYPES = ['OHLC', 'ColoredOHLC', 'Line', 'AlgoTrading']

labels = pd.read_csv('labels.csv')

In [5]:
# Find bounding boxes of all 4 image sizes
def find_bounding_box(img_path):
    # Load the image
    img = cv2.imread(img_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Find all non-black pixels
    coords = cv2.findNonZero(gray)
    
    # Find the bounding box of those pixels
    x, y, w, h = cv2.boundingRect(coords)
    print(x, y, w, h)

# Define the paths to your images
image_paths = [
    'images/Line/AI&Robotics_2019-01-03 00:00:00_30.png',
    'images/Line/AI&Robotics_2019-03-01 00:00:00_90.png',
    'images/Line/AI&Robotics_2019-05-31 00:00:00_180.png', 
    'images/Line/AI&Robotics_2020-03-05 00:00:00_365.png'   
]

# Crop each image and save the result
for path in image_paths:
    cropped_image = find_bounding_box(path)

80 100 132 120
80 100 226 120
80 100 414 120
80 100 602 120


In [6]:
# Crop the images using the bounding boxes
def crop_image(img_path):
    # Load the image in grayscale
    img = cv2.imread(img_path, 0)

    # Check if the image was loaded correctly
    if img is None:
        raise ValueError(f"Image at {img_path} not found. Please check the path.")

    # Use regular expression to match numbers followed by ".png" at the end of the filename
    match = re.search(r'(\d+)(?=\.png$)', img_path)
    
    # Check if we found a match
    if match:
        # Extract the number from the matched group
        number = int(match.group(1))
        
        # Check if the number is one of the specified values
        if number == 30:
            # Crop the image using the bounding rectangle
            crop = img[100:100+120, 80:80+132]
        elif number == 90:
            # Crop the image using the bounding rectangle
            crop = img[100:100+120, 80:80+226]
        elif number == 180:
            # Crop the image using the bounding rectangle
            crop = img[100:100+120, 80:80+414]
        elif number == 365:
            # Crop the image using the bounding rectangle
            crop = img[100:100+120, 80:80+602]
    return crop

# Example usage:
filenames = labels['Image'].values.tolist()

# Testing the function with the provided list of filenames
for name in filenames:
    try:
        cropped_image = crop_image(name)
        # Construct the new path for the cropped image
        new_path = name.replace('.png', '_cropped.png')
        # Save the cropped image
        cv2.imwrite(new_path, cropped_image)
    except ValueError as e:
        print(e)


In [49]:
# Create a new column called 'Image' that contains the path to the cropped image but only if they 
labels['Image'] = labels['Image'].str.replace('.png', '_cropped.png')

  labels['Image'] = labels['Image'].str.replace('.png', '_cropped.png')


In [7]:
# Function to load and convert an image to grayscale
def load_image(image_path):
    # Load image in grayscale
    image = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise ValueError(f"Unable to load image at path: {image_path}")
    return image

filenames = labels['Image'].values.tolist()

images = []

for name in filenames:
    try:
        img = load_image(name)
        images.append(img)
    except ValueError as e:
        print(e)

# Add a new column to the labels DataFrame to store the image arrays
labels['Image_Array'] = images

In [8]:
# Divide data based on timeframes
labels_dict = {
"30" : labels[labels['Image'].str.contains('_30_')],
"90" : labels[labels['Image'].str.contains('_90_')],
"180" : labels[labels['Image'].str.contains('_180_')],
"365" : labels[labels['Image'].str.contains('_365_')]
}

In [9]:
for key in labels_dict:
    # Convert the Image_Array column to a numpy array
    labels_dict[key]['Image_Array'] = np.array(labels_dict[key]['Image_Array']) / 255.0
    # Shuffle the DataFrame
    #labels_dict[key] = shuffle(labels_dict[key], random_state=42).reset_index(drop=True)


In [10]:
def create_cnn_model(num_blocks, initial_filters=32, input_shape=()):
    model = Sequential()

    for i in range(num_blocks):
        # Adjust the number of filters
        filters = initial_filters * (2 ** i)

        # Set dilation rate and strides
        dilation_rate = (2, 1) if i == 0 else (1, 1)
        strides = (1, 1)

        # Add Convolutional layer
        model.add(Conv2D(filters, kernel_size=(5, 3), strides=strides, padding='same',
                         input_shape=input_shape if i == 0 else model.output_shape[1:],
                         dilation_rate=dilation_rate))

        # Add LeakyReLU layer
        model.add(LeakyReLU(alpha=0.01))

        # Add MaxPooling layer only if dimensions allow
        if model.output_shape[1] >= 2 and model.output_shape[2] >= 2:
            model.add(MaxPooling2D(pool_size=(2, 2)))

    # Add Flatten and Dense layers for final prediction
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))  # Example Dense layer, adjust as needed
    model.add(Dense(1, activation='sigmoid'))  # Adjust the final layer as per your classification/regression needs

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Adjust these parameters as needed

    return model

# Create models with different numbers of blocks
model_30 = create_cnn_model(3, input_shape=(120, 132, 1))  # Adjust input_shape as per your image size
model_90 = create_cnn_model(4, input_shape=(120, 226, 1))
model_180 = create_cnn_model(5, input_shape=(120, 414, 1))
model_365 = create_cnn_model(6, input_shape=(120, 602, 1))

# Example: Print summary of one of the models
model_365.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 120, 602, 32)      512       
                                                                 
 leaky_re_lu_12 (LeakyReLU)  (None, 120, 602, 32)      0         
                                                                 
 max_pooling2d_12 (MaxPooli  (None, 60, 301, 32)       0         
 ng2D)                                                           
                                                                 
 conv2d_13 (Conv2D)          (None, 60, 301, 64)       30784     
                                                                 
 leaky_re_lu_13 (LeakyReLU)  (None, 60, 301, 64)       0         
                                                                 
 max_pooling2d_13 (MaxPooli  (None, 30, 150, 64)       0         
 ng2D)                                                

In [28]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical


# Split the data
X = np.array(labels_dict["30"]['Image_Array'].tolist())  # Assuming 'Image_Array' column contains image data
y = labels_dict["30"]['Label']  # Assuming 'Label' column contains labels

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [36]:
# Train the model
model_30.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

# Evaluate the model
model_30.evaluate(X_val, y_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6965636014938354, 0.5333333611488342]

In [37]:
# Split the data
X = np.array(labels_dict["365"]['Image_Array'].tolist())  # Assuming 'Image_Array' column contains image data
y = labels_dict["365"]['Label']  # Assuming 'Label' column contains labels

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [38]:
# Train the model
model_365.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

# Evaluate the model
model_365.evaluate(X_val, y_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6999983191490173, 0.5359477400779724]

In [39]:
# Split the data
X = np.array(labels_dict["180"]['Image_Array'].tolist())  # Assuming 'Image_Array' column contains image data
y = labels_dict["180"]['Label']  # Assuming 'Label' column contains labels

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model_180.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

# Evaluate the model
model_180.evaluate(X_val, y_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6922556757926941, 0.5233917832374573]

In [40]:
# Split the data
X = np.array(labels_dict["90"]['Image_Array'].tolist())  # Assuming 'Image_Array' column contains image data
y = labels_dict["90"]['Label']  # Assuming 'Label' column contains labels

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model_90.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

# Evaluate the model
model_90.evaluate(X_val, y_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6896867156028748, 0.5416666865348816]

In [43]:
for type in IMG_TYPES:
    for timeframe in TIMEFRAMES:
        for prediction in PREDICTIONS:
            if prediction < timeframe:
                print(f"Predicting {prediction} days ahead using {type} images with {timeframe} days timeframe.")
                data = labels[labels['TimePrediction'] == prediction]
                data = labels[labels['Image'].str.contains(type)]
                data = labels[labels['Image'].str.contains(f'_{timeframe}_')]
                data['Image_Array'] = np.array(data['Image_Array']) / 255.0
                data = shuffle(data, random_state=42).reset_index(drop=True)

                if timeframe == 30:
                    model = create_cnn_model(3, input_shape=(120, 132, 1))
                elif timeframe == 90:
                    model = create_cnn_model(4, input_shape=(120, 226, 1))
                elif timeframe == 180:
                    model = create_cnn_model(5, input_shape=(120, 414, 1))
                elif timeframe == 365:
                    model = create_cnn_model(6, input_shape=(120, 602, 1))
                model.summary()
                
                X = np.array(data['Image_Array'].tolist())
                y = data['Label']
                X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
                model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))
                model.evaluate(X_val, y_val)
                model.save(f"models/{type}_{timeframe}_{prediction}.h5")
                print(f"Model saved as {type}_{timeframe}_{prediction}.h5")
                print("--------------------------------------------------")

Predicting 5 days ahead using OHLC images with 30 days timeframe.
Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_166 (Conv2D)         (None, 120, 132, 32)      512       
                                                                 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Image_Array'] = np.array(data['Image_Array']) / 255.0


 leaky_re_lu_164 (LeakyReLU  (None, 120, 132, 32)      0         
 )                                                               
                                                                 
 max_pooling2d_128 (MaxPool  (None, 60, 66, 32)        0         
 ing2D)                                                          
                                                                 
 conv2d_167 (Conv2D)         (None, 60, 66, 64)        30784     
                                                                 
 leaky_re_lu_165 (LeakyReLU  (None, 60, 66, 64)        0         
 )                                                               
                                                                 
 max_pooling2d_129 (MaxPool  (None, 30, 33, 64)        0         
 ing2D)                                                          
                                                                 
 conv2d_168 (Conv2D)         (None, 30, 33, 128)       123008    
          

  saving_api.save_model(


Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_169 (Conv2D)         (None, 120, 226, 32)      512       
                                                                 
 leaky_re_lu_167 (LeakyReLU  (None, 120, 226, 32)      0         
 )                                                               
                                                                 
 max_pooling2d_131 (MaxPool  (None, 60, 113, 32)       0         
 ing2D)                                                          
                                                                 
 conv2d_170 (Conv2D)         (None, 60, 113, 64)       30784     
                                                                 
 leaky_re_lu_168 (LeakyReLU  (None, 60, 113, 64)       0         
 )                                                               
                                                     