In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import talib
import yfinance as yf

from joblib import dump, load
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

## Download data from yfinance

In [4]:
# Download BBRI historical stock data in the last 5 years 
df = yf.download('BBRI.JK', start='2018-06-04', end='2023-06-08')
df.tail()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-05-30,5500.0,5600.0,5500.0,5500.0,5500.0,67363200
2023-05-31,5575.0,5575.0,5375.0,5575.0,5575.0,898453700
2023-06-05,5600.0,5600.0,5425.0,5425.0,5425.0,255472600
2023-06-06,5400.0,5500.0,5350.0,5450.0,5450.0,92019300
2023-06-07,5475.0,5475.0,5350.0,5400.0,5400.0,150377000


In [6]:
df.index = pd.to_datetime(df.index)

## Feature Extraction

In [5]:
def generate_TAs_features(df):
    """
    Function to generate Technical Analysis features:
    - MA (Moving Average)
    - RSI (Relative Strength Index)
    - MFI (Money Flow Index)
    """
    res = df.copy()
    for i in [7, 14, 21]:
        # Moving Average
        res[f'{i} DAYS MA'] = talib.MA(res['Close'], timeperiod=i)
        res[f'{i} DAYS MA'] = res[f'{i} DAYS MA'].shift(1)
        
        # RSI (Relative Strength Index)
        res[f'RSI {i}'] = talib.RSI(res['Close'], timeperiod=i)
        
        # MFI (Money Flow Index)
        res[f'MFI {i}'] = talib.MFI(res['High'], res['Low'], res['Close'], res['Volume'], timeperiod=i)
        
        # Standard Deviation
        if i == 7:
            res[f'{i} DAYS STD DEV'] = res['Close'].rolling(i).std()
            res[f'{i} DAYS STD DEV'] = res[f'{i} DAYS STD DEV'].shift(1)
    
    print(res.isnull().sum())
    res = res[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',            
               '7 DAYS MA', '14 DAYS MA', '21 DAYS MA', '7 DAYS STD DEV',
               'RSI 7', 'RSI 14', 'RSI 21',            
               'MFI 7', 'MFI 14', 'MFI 21']]
    res = res.dropna()
    return res

def generate_date_related_features(df):
    """
    Function to generate date related features
    """
    res = df.copy()
    res['dayofweek'] = df.index.dayofweek
    res['quarter'] = df.index.quarter
    res['month'] = df.index.month
    res['year'] = df.index.year
    res['dayofyear'] = df.index.dayofyear
    res['dayofmonth'] = df.index.day
    res['weekofyear'] = df.index.isocalendar().week
    return res

In [8]:
df2 = generate_TAs_features(df)
df2.head()

Open               0
High               0
Low                0
Close              0
Adj Close          0
Volume             0
7 DAYS MA          7
RSI 7              7
MFI 7              7
7 DAYS STD DEV     7
14 DAYS MA        14
RSI 14            14
MFI 14            14
21 DAYS MA        21
RSI 21            21
MFI 21            21
dtype: int64


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-07-03,2850.0,2860.0,2770.0,2830.0,2345.487061,88835200,2852.857143,2967.142857,3029.52381,70.169862,36.571029,36.001035,35.0,45.016324,39.047756,39.874572
2018-07-04,2830.0,2950.0,2790.0,2930.0,2428.366211,128906900,2831.428571,2945.0,3015.238095,42.201332,52.449496,44.944537,41.176471,57.56565,43.082041,45.211221
2018-07-05,2890.0,2920.0,2860.0,2910.0,2411.790283,63494300,2837.142857,2930.0,3006.666667,52.824958,49.554923,43.631287,40.37088,71.298886,44.883072,44.24344
2018-07-06,2910.0,2910.0,2840.0,2840.0,2353.774902,76368500,2850.0,2913.571429,2995.238095,58.594653,40.441436,39.302705,37.662952,61.189487,43.255768,38.007864
2018-07-09,2860.0,3030.0,2860.0,3010.0,2494.669678,127889400,2850.0,2892.142857,2974.761905,58.594653,60.844278,51.807182,46.767993,80.879319,46.614146,46.614146


In [9]:
df3 = generate_date_related_features(df2)
df3.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,...,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-03,2850.0,2860.0,2770.0,2830.0,2345.487061,88835200,2852.857143,2967.142857,3029.52381,70.169862,...,45.016324,39.047756,39.874572,1,3,7,2018,184,3,27
2018-07-04,2830.0,2950.0,2790.0,2930.0,2428.366211,128906900,2831.428571,2945.0,3015.238095,42.201332,...,57.56565,43.082041,45.211221,2,3,7,2018,185,4,27
2018-07-05,2890.0,2920.0,2860.0,2910.0,2411.790283,63494300,2837.142857,2930.0,3006.666667,52.824958,...,71.298886,44.883072,44.24344,3,3,7,2018,186,5,27
2018-07-06,2910.0,2910.0,2840.0,2840.0,2353.774902,76368500,2850.0,2913.571429,2995.238095,58.594653,...,61.189487,43.255768,38.007864,4,3,7,2018,187,6,27
2018-07-09,2860.0,3030.0,2860.0,3010.0,2494.669678,127889400,2850.0,2892.142857,2974.761905,58.594653,...,80.879319,46.614146,46.614146,0,3,7,2018,190,9,28


In [11]:
df4 = df3.drop(['Open', 'High', 'Low', 'Adj Close', 'Volume'], axis=1)
df4.head()

Unnamed: 0_level_0,Close,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-07-03,2830.0,2852.857143,2967.142857,3029.52381,70.169862,36.571029,36.001035,35.0,45.016324,39.047756,39.874572,1,3,7,2018,184,3,27
2018-07-04,2930.0,2831.428571,2945.0,3015.238095,42.201332,52.449496,44.944537,41.176471,57.56565,43.082041,45.211221,2,3,7,2018,185,4,27
2018-07-05,2910.0,2837.142857,2930.0,3006.666667,52.824958,49.554923,43.631287,40.37088,71.298886,44.883072,44.24344,3,3,7,2018,186,5,27
2018-07-06,2840.0,2850.0,2913.571429,2995.238095,58.594653,40.441436,39.302705,37.662952,61.189487,43.255768,38.007864,4,3,7,2018,187,6,27
2018-07-09,3010.0,2850.0,2892.142857,2974.761905,58.594653,60.844278,51.807182,46.767993,80.879319,46.614146,46.614146,0,3,7,2018,190,9,28


In [12]:
# Move Close column to the back
df4['Close_'] = df4['Close']
df4 = df4.drop(['Close'], axis=1)
df4 = df4.rename(columns={'Close_': 'Close'})
df4.head()

Unnamed: 0_level_0,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-07-03,2852.857143,2967.142857,3029.52381,70.169862,36.571029,36.001035,35.0,45.016324,39.047756,39.874572,1,3,7,2018,184,3,27,2830.0
2018-07-04,2831.428571,2945.0,3015.238095,42.201332,52.449496,44.944537,41.176471,57.56565,43.082041,45.211221,2,3,7,2018,185,4,27,2930.0
2018-07-05,2837.142857,2930.0,3006.666667,52.824958,49.554923,43.631287,40.37088,71.298886,44.883072,44.24344,3,3,7,2018,186,5,27,2910.0
2018-07-06,2850.0,2913.571429,2995.238095,58.594653,40.441436,39.302705,37.662952,61.189487,43.255768,38.007864,4,3,7,2018,187,6,27,2840.0
2018-07-09,2850.0,2892.142857,2974.761905,58.594653,60.844278,51.807182,46.767993,80.879319,46.614146,46.614146,0,3,7,2018,190,9,28,3010.0


## Data Standardization

### Standard Scaler

In [13]:
standard_df = df4.copy()
standard_df.head()

Unnamed: 0_level_0,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-07-03,2852.857143,2967.142857,3029.52381,70.169862,36.571029,36.001035,35.0,45.016324,39.047756,39.874572,1,3,7,2018,184,3,27,2830.0
2018-07-04,2831.428571,2945.0,3015.238095,42.201332,52.449496,44.944537,41.176471,57.56565,43.082041,45.211221,2,3,7,2018,185,4,27,2930.0
2018-07-05,2837.142857,2930.0,3006.666667,52.824958,49.554923,43.631287,40.37088,71.298886,44.883072,44.24344,3,3,7,2018,186,5,27,2910.0
2018-07-06,2850.0,2913.571429,2995.238095,58.594653,40.441436,39.302705,37.662952,61.189487,43.255768,38.007864,4,3,7,2018,187,6,27,2840.0
2018-07-09,2850.0,2892.142857,2974.761905,58.594653,60.844278,51.807182,46.767993,80.879319,46.614146,46.614146,0,3,7,2018,190,9,28,3010.0


In [14]:
scaler1 = StandardScaler()
features = standard_df.columns[:-1]
standard_df[features] = scaler1.fit_transform(standard_df[features])

In [15]:
close_scaler1 = StandardScaler()
standard_df['Close'] = close_scaler1.fit_transform(np.array(standard_df['Close']).reshape(-1,1))

In [16]:
standard_df.head()

Unnamed: 0_level_0,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-07-03,-1.974889,-1.797427,-1.703206,-0.068369,-1.029261,-1.500896,-1.93848,-0.32104,-0.809956,-0.918281,-0.692708,0.419265,0.124549,-1.637427,-0.003251,-1.450117,0.016076,-2.004093
2018-07-04,-2.010393,-1.834608,-1.727495,-0.646393,-0.046983,-0.709149,-1.258889,0.233188,-0.566193,-0.528693,0.015059,0.419265,0.124549,-1.637427,0.006141,-1.335784,0.016076,-1.840288
2018-07-05,-2.000925,-1.859794,-1.742069,-0.426835,-0.226048,-0.825408,-1.347528,0.839702,-0.45737,-0.599344,0.722825,0.419265,0.124549,-1.637427,0.015532,-1.221452,0.016076,-1.873049
2018-07-06,-1.979623,-1.88738,-1.761501,-0.307593,-0.789829,-1.208607,-1.645478,0.393231,-0.555696,-1.054555,1.430592,0.419265,0.124549,-1.637427,0.024924,-1.107119,0.016076,-1.987713
2018-07-09,-1.979623,-1.92336,-1.796316,-0.307593,0.472338,-0.101616,-0.64366,1.262812,-0.352773,-0.426277,-1.400474,0.419265,0.124549,-1.637427,0.053099,-0.764121,0.081779,-1.709244


### Robust Scaler

In [17]:
robust_df = df4.copy()
robust_df.head()

Unnamed: 0_level_0,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-07-03,2852.857143,2967.142857,3029.52381,70.169862,36.571029,36.001035,35.0,45.016324,39.047756,39.874572,1,3,7,2018,184,3,27,2830.0
2018-07-04,2831.428571,2945.0,3015.238095,42.201332,52.449496,44.944537,41.176471,57.56565,43.082041,45.211221,2,3,7,2018,185,4,27,2930.0
2018-07-05,2837.142857,2930.0,3006.666667,52.824958,49.554923,43.631287,40.37088,71.298886,44.883072,44.24344,3,3,7,2018,186,5,27,2910.0
2018-07-06,2850.0,2913.571429,2995.238095,58.594653,40.441436,39.302705,37.662952,61.189487,43.255768,38.007864,4,3,7,2018,187,6,27,2840.0
2018-07-09,2850.0,2892.142857,2974.761905,58.594653,60.844278,51.807182,46.767993,80.879319,46.614146,46.614146,0,3,7,2018,190,9,28,3010.0


In [18]:
scaler2 = RobustScaler()
features = robust_df.columns[:-1]
robust_df[features] = scaler2.fit_transform(robust_df[features])

In [19]:
close_scaler2 = RobustScaler()
robust_df['Close'] = close_scaler2.fit_transform(np.array(robust_df['Close']).reshape(-1,1))

In [20]:
robust_df.head()

Unnamed: 0_level_0,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV,RSI 7,RSI 14,RSI 21,MFI 7,MFI 14,MFI 21,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-07-03,-1.623797,-1.470178,-1.405119,0.210469,-0.697101,-1.069752,-1.394933,-0.215294,-0.499868,-0.556399,-0.5,0.0,0.0,-0.666667,-0.026631,-0.866667,0.0,-1.730769
2018-07-04,-1.650044,-1.49717,-1.422764,-0.397901,-0.006714,-0.514807,-0.909048,0.147305,-0.344051,-0.305758,0.0,0.0,0.0,-0.666667,-0.021305,-0.8,0.0,-1.602564
2018-07-05,-1.643045,-1.515455,-1.433351,-0.166816,-0.132568,-0.596295,-0.972422,0.544112,-0.27449,-0.351211,0.5,0.0,0.0,-0.666667,-0.015979,-0.733333,0.0,-1.628205
2018-07-06,-1.627297,-1.535481,-1.447466,-0.041314,-0.528818,-0.864883,-1.185446,0.252011,-0.337341,-0.644071,1.0,0.0,0.0,-0.666667,-0.010652,-0.666667,0.0,-1.717949
2018-07-09,-1.627297,-1.561602,-1.472757,-0.041314,0.358287,-0.088981,-0.469179,0.820928,-0.207631,-0.239868,-1.0,0.0,0.0,-0.666667,0.005326,-0.466667,0.037037,-1.5


## Save data

In [22]:
dump(scaler1, 'standard_scaler.bin', compress=True)
dump(close_scaler1, 'close_standard_scaler.bin', compress=True)

dump(scaler2, 'robust_scaler.bin', compress=True)
dump(close_scaler2, 'close_robust_scaler.bin', compress=True)

['close_robust_scaler.bin']

In [24]:
test_loaded_sc = load('close_standard_scaler.bin')
test_loaded_sc.inverse_transform(np.array(standard_df['Close']).reshape(-1,1))[0]

array([2830.])

In [25]:
test_loaded_sc = load('close_robust_scaler.bin')
test_loaded_sc.inverse_transform(np.array(robust_df['Close']).reshape(-1,1))[0]

array([2830.])

In [26]:
standard_df.to_csv('standard.csv')
robust_df.to_csv('robust.csv')