In [19]:
############################### Import Libraries ###########################
import pandas as pd
import matplotlib.pyplot as plt
import pandas_ta as ta
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

# Load Data

In [15]:
class dataPreProcessing:
    """Load a price CSV, derive technical-indicator columns and min-max scale them.

    The instance only stores the CSV path; every other method receives the
    DataFrame to work on as an argument and returns it.
    """

    def __init__(self, path) -> None:
        """Store the location of the CSV file; nothing is read yet.

        Args:
            path: Path of the CSV file that ``dataLoader`` will read.
        """
        self.path = path

    def dataLoader(self):
        """Read the CSV at ``self.path`` and index it by its ``Date`` column.

        The first CSV column is consumed as a temporary index, then the
        ``Date`` column is parsed to datetimes and promoted to the index.

        Returns:
            The loaded DataFrame, indexed by ``Date``.
        """
        data = pd.read_csv(self.path, index_col=0)
        data['Date'] = pd.to_datetime(data['Date'])
        data.set_index('Date', inplace=True, drop=True)
        return data

    def featureExtraction(self, data):
        """Add indicator columns derived from ``data['Close']``.

        Columns written: ``MA60``, ``EMA20``, ``RSI``, ``MACD``,
        ``MACD_signal``, ``BB_Middle``, ``BB_Upper`` and ``BB_Lower``.

        Args:
            data: DataFrame containing a ``Close`` column. It is modified
                in place.

        Returns:
            The same DataFrame with the indicator columns added.
        """
        close = data['Close']

        # Rolling 60-row mean and exponentially weighted mean (span 20).
        data['MA60'] = close.rolling(window=60).mean()
        data['EMA20'] = close.ewm(span=20, adjust=False).mean()

        # Indicators provided by pandas_ta.
        data['RSI'] = ta.rsi(close, length=14)
        macd = ta.macd(close, fast=12, slow=26, signal=9)
        data['MACD'] = macd['MACD_12_26_9']
        data['MACD_signal'] = macd['MACDs_12_26_9']

        # Use the frame passed in: the instance has no ``data`` attribute, so
        # reading ``self.data`` here raised AttributeError.
        bbands = ta.bbands(close, length=20, std=2)
        data['BB_Middle'] = bbands['BBM_20_2.0']  # Middle Band
        data['BB_Upper'] = bbands['BBU_20_2.0']   # Upper Band
        data['BB_Lower'] = bbands['BBL_20_2.0']   # Lower Band

        return data

    def normlalization(self, data):
        """Min-max scale every column of ``data`` to the range [0, 1].

        Each column is replaced by ``(value - column_min) / (column_max - column_min)``.
        A column whose minimum equals its maximum has a zero range, so the
        division is 0/0 for every row.

        Args:
            data: DataFrame of numeric columns. It is modified in place.

        Returns:
            The same DataFrame, scaled, so the call can be chained like the
            other methods of this class.
        """
        for column in list(data.columns):
            # Local names chosen so the built-in min()/max() are not shadowed.
            col_min = data[column].min()
            col_max = data[column].max()
            data[column] = (data[column] - col_min) / (col_max - col_min)

        return data

    # Correctly spelled alias; the original misspelled name is kept for
    # existing callers.
    normalization = normlalization
            


In [17]:
# Load the daily price CSV through the dataPreProcessing helper class.
path = r'data/test5_1d.csv'
# The constructor only stores the path; dataLoader() performs the actual read.
dataFunction = dataPreProcessing(path) 
data = dataFunction.dataLoader()
# Preview the first five rows of the Date-indexed frame.
print(data.head())



            Volume  Open  High       Low  Close
Date                                           
2018-01-02    8944  8.05  8.13  7.730000   7.79
2018-01-03    7528  7.75  7.97  7.730000   7.82
2018-01-04    7649  7.89  7.92  7.740000   7.76
2018-01-05    5342  7.75  7.84  7.740000   7.78
2018-01-08   12789  7.76  7.92  7.609231   7.67


In [9]:
# Read the daily price CSV, parse its Date column and promote it to the index.
path = r'data/test5_1d.csv'
data = (
    pd.read_csv(path, index_col=0)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date', drop=True)
)

# Preview the first five rows.
print(data.head())

            Volume  Open  High       Low  Close
Date                                           
2018-01-02    8944  8.05  8.13  7.730000   7.79
2018-01-03    7528  7.75  7.97  7.730000   7.82
2018-01-04    7649  7.89  7.92  7.740000   7.76
2018-01-05    5342  7.75  7.84  7.740000   7.78
2018-01-08   12789  7.76  7.92  7.609231   7.67


# Add Features

In [10]:
#################### Add Features to data ###################
# Multi-column indicators are computed first; `bbands` is only stored here
# and its columns are attached to `data` by a later cell.
macd = ta.macd(data['Close'], fast=12, slow=26, signal=9)
bbands = ta.bbands(data['Close'], length=20, std=2)

# Append the indicator columns in one step, all derived from Close.
data = data.assign(
    MA60=data['Close'].rolling(window=60).mean(),
    EMA20=data['Close'].ewm(span=20, adjust=False).mean(),
    RSI=ta.rsi(data['Close'], length=14),
    MACD=macd['MACD_12_26_9'],
    MACD_signal=macd['MACDs_12_26_9'],
)

print(data)

            Volume   Open       High        Low     Close       MA60  \
Date                                                                   
2018-01-02    8944   8.05   8.130000   7.730000   7.79000        NaN   
2018-01-03    7528   7.75   7.970000   7.730000   7.82000        NaN   
2018-01-04    7649   7.89   7.920000   7.740000   7.76000        NaN   
2018-01-05    5342   7.75   7.840000   7.740000   7.78000        NaN   
2018-01-08   12789   7.76   7.920000   7.609231   7.67000        NaN   
...            ...    ...        ...        ...       ...        ...   
2023-06-05   23218  79.05  83.410000  78.010000  82.86000  88.995514   
2023-06-06   17637  83.00  83.480000  80.280000  81.64000  88.686681   
2023-06-07   17777  81.64  84.092308  80.080000  83.86000  88.492972   
2023-06-08   15161  83.99  84.950000  82.569259  83.35000  88.334306   
2023-06-09    7723  83.44  85.630000  83.440000  85.12625  88.265910   

                EMA20        RSI      MACD  MACD_signal  
Date 

# Eliminate columns with high correlations 

In [11]:
# Attach the Bollinger-band columns from the `bbands` frame computed earlier.
data['BB_Middle'] = bbands['BBM_20_2.0']  # Middle Band
data['BB_Upper'] = bbands['BBU_20_2.0']   # Upper Band
data['BB_Lower'] = bbands['BBL_20_2.0']   # Lower Band

# Remove the columns this section treats as highly correlated.
# drop(..., inplace=True) returns None, so the result is deliberately not
# bound to a name (the old `dada = ...` assignment only ever held None).
data.drop(columns=['Open', 'High', 'Low', 'Close', 'MACD_signal', 'BB_Upper', 'BB_Lower'], inplace=True)
# Discard rows that still contain NaN in any remaining column.
data.dropna(inplace=True)


print(data)

            Volume       MA60      EMA20        RSI      MACD  BB_Middle
Date                                                                    
2018-03-26   13135   9.622167  11.323217  79.800600  0.776014  11.162500
2018-03-27   19922   9.719841  11.544861  83.348003  0.865848  11.337024
2018-03-28   34621   9.806508  11.685350  71.223704  0.876068  11.483024
2018-03-29   13901   9.899174  11.841031  73.219933  0.898024  11.649524
2018-04-03   17698   9.991174  11.979981  72.857072  0.903396  11.808024
...            ...        ...        ...        ...       ...        ...
2023-06-05   23218  88.995514  84.153658  44.078094 -2.334675  85.268500
2023-06-06   17637  88.686681  83.914262  41.450316 -2.244482  84.975000
2023-06-07   17777  88.492972  83.909094  47.574981 -1.971145  84.715333
2023-06-08   15161  88.334306  83.855847  46.374813 -1.775214  84.454333
2023-06-09    7723  88.265910  83.976838  51.010214 -1.459780  84.264146

[1296 rows x 6 columns]


# Target calculation

In [13]:
# Target = this row's MA60 minus the next row's MA60 (diff against the
# following row). The final row has no successor, so its target is NaN.
data['target'] = data['MA60'].diff(periods=-1)
# Drop the rows left without a target value.
data.dropna(inplace=True)

print(data)

            Volume       MA60      EMA20        RSI      MACD  BB_Middle  \
Date                                                                       
2018-03-26   13135   9.622167  11.323217  79.800600  0.776014  11.162500   
2018-03-27   19922   9.719841  11.544861  83.348003  0.865848  11.337024   
2018-03-28   34621   9.806508  11.685350  71.223704  0.876068  11.483024   
2018-03-29   13901   9.899174  11.841031  73.219933  0.898024  11.649524   
2018-04-03   17698   9.991174  11.979981  72.857072  0.903396  11.808024   
...            ...        ...        ...        ...       ...        ...   
2023-06-01   21375  89.591097  84.895078  30.048842 -2.325390  85.726000   
2023-06-02   15627  89.269514  84.289832  29.351471 -2.540053  85.405000   
2023-06-05   23218  88.995514  84.153658  44.078094 -2.334675  85.268500   
2023-06-06   17637  88.686681  83.914262  41.450316 -2.244482  84.975000   
2023-06-07   17777  88.492972  83.909094  47.574981 -1.971145  84.715333   

           

# data normalization

In [14]:
# Min-max scale every column of `data` (the `target` column included) to [0, 1].
# NOTE(review): the extrema are taken over the whole dataset, before the
# train/test split further down — confirm this leakage is acceptable.
for column in list(data.columns):

    # Named col_min/col_max so the built-in min()/max() are not overwritten
    # for the remainder of the notebook session.
    col_min = data[column].min()
    col_max = data[column].max()
    data[column] = (data[column] - col_min) / (col_max - col_min)

print(data)

              Volume      MA60     EMA20       RSI      MACD  BB_Middle  \
Date                                                                      
2018-03-26  0.126773  0.000000  0.000000  0.882223  0.573366   0.000000   
2018-03-27  0.195540  0.001163  0.002623  0.929866  0.580917   0.002040   
2018-03-28  0.344472  0.002196  0.004285  0.767030  0.581776   0.003746   
2018-03-29  0.134534  0.003299  0.006127  0.793841  0.583621   0.005692   
2018-04-03  0.173006  0.004395  0.007771  0.788967  0.584073   0.007544   
...              ...       ...       ...       ...       ...        ...   
2023-06-01  0.210262  0.952429  0.870559  0.214030  0.312671   0.871435   
2023-06-02  0.152022  0.948599  0.863397  0.204664  0.294627   0.867683   
2023-06-05  0.228935  0.945336  0.861786  0.402451  0.311891   0.866088   
2023-06-06  0.172388  0.941657  0.858953  0.367158  0.319472   0.862658   
2023-06-07  0.173806  0.939350  0.858892  0.449416  0.342448   0.859623   

              target  
D

# Train and test separation

In [None]:
# Split the frame into the feature matrix and the label column.
x = data.drop(columns=['target'])  # Features: every column except 'target'
y = data['target']  # Target
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
# NOTE(review): train_test_split presumably shuffles rows by default, which
# would interleave test dates with training dates in this date-indexed data —
# confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)