In [1]:

# imports
import os
import sys
import types
import json

# figure geometry, output format and resolution used by the setup steps below
fig_width, fig_height = 5.5, 3.5
fig_format, fig_dpi = 'pdf', 300

# matplotlib: apply the figure defaults and select the inline figure format.
# Best effort: any failure (e.g. matplotlib or IPython missing) is ignored.
try:
  import matplotlib.pyplot as plt
  plt.rcParams.update({
    'figure.figsize': (fig_width, fig_height),
    'figure.dpi': fig_dpi,
    'savefig.dpi': fig_dpi,
  })
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: switch the default renderer to "notebook_connected" when plotly is
# importable; otherwise leave everything as it is
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# pandas: turn on the LaTeX repr of DataFrames, but only for pdf output
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies: map every loaded module's source file to its
# last-modified time and print the mapping as JSON
kernel_deps = {}
for module in list(sys.modules.values()):
  # sys.modules can contain objects that are not ordinary modules (e.g.
  # email/__init__.py in the standard library plays games with it), and
  # getattr on those occasionally fails in strange ways -- skip them.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  if path:
    # compiled files (.pyc / .pyo) are reported under their .py source name
    if path.endswith((".pyc", ".pyo")):
      path = path[:-1]
    if os.path.exists(path):
      kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested (an empty string would leave the working
# directory unchanged)
run_path = r'/Users/francoisderyckel/Documents/quant-sandbox/python'
if run_path:
  os.chdir(run_path)

# reset state
# IPython line magic: removes the names defined in the interactive namespace
# so far, so the setup variables above are not left behind for later cells.
%reset

def ojs_define(**kwargs):
  """Publish Python values to Observable JS through an ``ojs-define`` script tag.

  Every keyword argument becomes one ``{"name": ..., "value": ...}`` entry of
  the JSON payload. pandas Series and DataFrames (subclasses included) are
  first turned into a ``{column label: [values, ...]}`` mapping; any other
  value is handed to ``json.dumps`` as is and must be JSON-serializable.

  Raises:
    TypeError: if a value cannot be serialized by ``json.dumps``.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # fallback location of the same two names
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    """Return ``value`` unchanged unless it is a pandas Series/DataFrame."""
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return value
    # isinstance (not type equality) so subclasses of Series/DataFrame are
    # converted too instead of reaching json.dumps as raw pandas objects
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # after transposing, "index" holds the column labels and each row of
      # "data" holds the values of one column
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


  set_matplotlib_formats(fig_format)




In [2]:
#| label: loading-data

# Loading the required library 
import pandas as pd
import numpy as np
import yfinance as yf

# Download the price history yfinance returns for the ticker 'RIO' (no date
# range is passed) and wrap it in a DataFrame; the following cells read and
# extend `df`.
rio = yf.download('RIO')
df = pd.DataFrame(rio)

[*********************100%%**********************]  1 of 1 completed




In [3]:
#| label: initial-eda

# first look at the data: its dimensions, then the first five rows
print('The shape of dataframe is:', df.shape)
print(df.head(5))

# number of missing values in each column
df.isna().sum()

The shape of dataframe is: (8539, 6)
                Open      High       Low     Close  Adj Close  Volume
Date                                                                 
1990-06-28  10.09375  10.09375   9.96875   9.96875   1.916215  176400
1990-06-29  10.03125  10.06250  10.00000  10.06250   1.934235   69200
1990-07-02  10.00000  10.03125  10.00000  10.03125   1.928228   62000
1990-07-03  10.03125  10.06250  10.03125  10.06250   1.934235   29600
1990-07-05   9.71875   9.71875   9.65625   9.68750   1.862153   31200


Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [4]:
#| label: plot01

import matplotlib.pyplot as plt
from datetime import date
from dateutil.relativedelta import relativedelta

# Plot only the most recent five calendar years of adjusted closes.
# date.today() is read once so both bounds refer to the same day, and
# relativedelta(years = 5) steps back exactly five years (5 * 365 days falls
# short by the leap days inside the window).
end_date = date.today()
start_date = end_date - relativedelta(years = 5)

# first visualization: explicit figure/axes so the plot can be labelled
fig, ax = plt.subplots()
ax.plot(df['Adj Close'].loc[start_date:end_date])
ax.set_title('RIO adjusted close, last 5 years')
ax.set_xlabel('Date')
ax.set_ylabel('Adj Close')
plt.show()

<Figure size 1650x1050 with 1 Axes>

In [5]:
#| label: candlestick-plot01

import mplfinance as mpl

# Candlestick chart of the most recent half year: the window starts
# int(np.round(0.5 * 365)) days before today and ends at `end_date`, which was
# set in the previous plotting cell.
start_date = date.today() + relativedelta(days = int(np.round(-0.5 * 365)))

# NOTE(review): despite its name this frame spans about 6 months, not 18; the
# name is kept so that any other cell referring to it keeps working.
df_last18months = df.loc[start_date:end_date]

# returnfig = True hands back the figure and its axes instead of only drawing
fig, axlist = mpl.plot(
  df_last18months, 
  type = 'candle', 
  style='binance', 
  figratio = (15, 7), 
  returnfig = True
)

# suptitle() returns the title's Text artist, so its result is deliberately
# not assigned back to `fig`, which must keep referring to the Figure
fig.suptitle(
  f"$RIO stock between {start_date} and {end_date}", 
  y=.95, 
  fontsize=15, 
  weight = 'semibold', 
  style = 'normal'
)

mpl.show()

<Figure size 1232.14x575 with 2 Axes>

In [6]:
# one-day log return: first difference of the log adjusted close
df['ret_1d'] = np.log(df['Adj Close']).diff()

# intraday features: open minus close, and the high-low range
df['o-c'] = df['Open'] - df['Close']
df['h-l'] = df['High'] - df['Low']


# rolling sum (cumulative log return) and rolling standard deviation of the
# one-day return for windows of 2, 5, 8, ..., 59 rows
for days in range(2, 62, 3):
  window = df['ret_1d'].rolling(days)
  df['ret_' + str(days)] = window.sum()
  df['sd_' + str(days)] = window.std()


In [7]:
#| label: create-target
# keep a copy of the full feature frame before any rows are dropped
df2 = df.copy()

# Drop every row that contains a NaN (the rolling features are NaN until their
# window is filled) and only use the last 5 years of data, as suggested by the
# exam instructions. The trailing .copy() makes `df` an independent frame, so
# the column assignments below do not write into a slice of another frame
# (the situation pandas reports as SettingWithCopyWarning).
df = df.dropna().loc['2019-01-01':].copy()

# We use the median return as the threshold so the two classes are balanced.
# NOTE(review): the median is taken over the whole sample, including the rows
# that later form the test set -- confirm this is acceptable.
median_return = df['ret_1d'].median()
df['target'] = np.where(df['ret_1d'] > median_return, 1, 0)
# shift(-1) moves every label one row up, so a row's target describes the NEXT
# row's return; the last row has no successor and becomes NaN
df['target'] = df['target'].shift(-1)

# checking we have the right ratio. 
df['target'].value_counts(normalize = True).round(4) * 100

target
1.0    50.04
0.0    49.96
Name: proportion, dtype: float64

In [8]:
# removing NaN values (the last row's target is NaN after the shift(-1) above)
# and only using the last 5 years of data as suggested by exam instructions.
# Reassigning the result, instead of dropna(inplace = True), avoids mutating a
# frame that was itself obtained by slicing and keeps the cell safe to re-run.
df = df.dropna().loc['2019-01-01':]

# checking we have the right ratio. 
df['target'].value_counts(normalize = True).round(4) * 100

target
1.0    50.04
0.0    49.96
Name: proportion, dtype: float64

In [9]:
#| label: training-testing


df.info()

# features: every engineered column, i.e. everything except the raw price /
# volume fields and the label itself
non_feature_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'target']
X = df.drop(columns = non_feature_cols).values
y = df['target'].values

from sklearn.model_selection import train_test_split
# shuffle = False keeps the rows in their original order: the first 80% are
# used for training and the last 20% for testing
x_train, x_test, y_train, y_test = train_test_split(
  X, y, test_size = 0.2, shuffle = False
)

print(f"Size of training set is {len(x_train)} and size of testing set is {len(x_test)}")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1355 entries, 2019-01-02 to 2024-05-20
Data columns (total 50 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1355 non-null   float64
 1   High       1355 non-null   float64
 2   Low        1355 non-null   float64
 3   Close      1355 non-null   float64
 4   Adj Close  1355 non-null   float64
 5   Volume     1355 non-null   int64  
 6   ret_1d     1355 non-null   float64
 7   o-c        1355 non-null   float64
 8   h-l        1355 non-null   float64
 9   ret_2      1355 non-null   float64
 10  sd_2       1355 non-null   float64
 11  ret_5      1355 non-null   float64
 12  sd_5       1355 non-null   float64
 13  ret_8      1355 non-null   float64
 14  sd_8       1355 non-null   float64
 15  ret_11     1355 non-null   float64
 16  sd_11      1355 non-null   float64
 17  ret_14     1355 non-null   float64
 18  sd_14      1355 non-null   float64
 19  ret_17     1355 non-null   flo

Size of training set is 1084 and size of testing set is 271


In [10]:
#| label: base-model

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC

# Baseline: a linear SVM fitted on rescaled features.
# NOTE(review): MinMaxScaler followed by StandardScaler looks redundant --
# standardising min-max-scaled data should give the same values as
# standardising the raw data; confirm before dropping either step.
model_svm_lin = Pipeline(steps = [
  ('Scaler', MinMaxScaler()),
  ('std_scaler', StandardScaler()),
  ('classifier', LinearSVC(dual = False)),
])

# fit() returns the fitted pipeline, so the test-set prediction is chained
y_pred = model_svm_lin.fit(x_train, y_train).predict(x_test)

In [11]:
from sklearn.metrics import roc_curve, ConfusionMatrixDisplay, accuracy_score, auc

# accuracy of the baseline on the split it was fitted on and on the test split
y_pred_train = model_svm_lin.predict(x_train)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred)

print(acc_train, acc_test, sep = '\n')

0.5599630996309963
0.5202952029520295


In [12]:
# Confusion matrix of the baseline pipeline.
# NOTE(review): this is drawn from the TRAINING split (x_train / y_train),
# whereas the classification report is computed on the test split -- confirm
# which one is intended.
conf_mat = ConfusionMatrixDisplay.from_estimator(
  estimator = model_svm_lin,
  X = x_train,
  y = y_train,
  cmap = plt.cm.Blues,
)
plt.title('Confusion Matrix on Base Model')
plt.show()

<Figure size 640x480 with 2 Axes>

In [13]:
from sklearn.metrics import classification_report
# text report comparing the test labels with the baseline's test-set
# predictions (y_pred comes from the base-model cell)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.53      0.68      0.59       140
         1.0       0.51      0.35      0.41       131

    accuracy                           0.52       271
   macro avg       0.52      0.51      0.50       271
weighted avg       0.52      0.52      0.51       271



In [14]:
#| label: hyper-param

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score

# Time-ordered cross-validation with 5 splits; gap = 1 leaves one observation
# out between each training window and the fold it is validated on.
tscv = TimeSeriesSplit(n_splits = 5, gap = 1)

# 8 values of C x 8 values of gamma x 3 kernels = 192 candidates.
# NOTE(review): gamma is not used by the linear kernel, so those candidates
# repeat the same fit; and unlike the baseline pipeline, SVC is given the
# unscaled features here -- confirm both are intended.
param_grid = {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5], 
              'gamma': [0.001, 0.0075, 0.01, 0.05, 0.075, 0.1, 0.5, 1], 
              'kernel': ['rbf', 'poly', 'linear']
}

grid = GridSearchCV(SVC(), param_grid, verbose = 1, scoring = 'accuracy', cv = tscv)
grid.fit(x_train, y_train)

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the winning parameters are printed explicitly; the best score remains the
# value displayed by the cell.
print('Best parameters:', grid.best_params_)
grid.best_score_

Fitting 5 folds for each of 192 candidates, totalling 960 fits


0.53

In [15]:
# SVC built from the best grid-search parameters and fitted on the whole
# training split (fit() returns the estimator itself)
svm_best_param = SVC(**grid.best_params_).fit(x_train, y_train)

# one score per time-series fold for that configuration
x_val_score = cross_val_score(
  svm_best_param, x_train, y_train, cv = tscv
)

x_val_score 

array([0.57222222, 0.55555556, 0.53888889, 0.51111111, 0.47222222])

In [16]:
# Same baseline pipeline, this time with the one-day return also removed from
# the features.
# NOTE(review): this refits model_svm_lin in place and overwrites acc_train /
# acc_test from the earlier cell.
excluded_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'target', 'ret_1d']
yo = df.drop(columns = excluded_cols).values

xx_train, xx_test, yy_train, yy_test = train_test_split(
  yo, y, test_size = 0.2, shuffle = False
)

yy_pred = model_svm_lin.fit(xx_train, yy_train).predict(xx_test)
acc_train = accuracy_score(yy_train, model_svm_lin.predict(xx_train))
acc_test = accuracy_score(yy_test, yy_pred)

print(acc_train, acc_test, sep = '\n')

0.5701107011070111
0.5092250922509225
