In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import scorer, accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from subprocess import call
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
%tensorflow_version 2.x

# Data Creation
- `wml` = returns of the Winner-Minus-Loser portfolio
- `iml` = returns of the Illiquidity-Minus-Liquidity portfolio
- `market_value` = market return (taken from the `Adj Close` column of `market_value.csv`)

In [None]:
# Offset of the first row taken from the factor and return files
# (per the original note, 213 is the index of 01.01.2009).
FIRST_ROW = 213
CSV_OPTIONS = {'delimiter': ',', 'header': 0}

wml_data = pd.read_csv('wml.csv', **CSV_OPTIONS)
iml_data = pd.read_csv('iml.csv', **CSV_OPTIONS)
target = pd.read_csv('returns.csv', **CSV_OPTIONS)
market_value = pd.read_csv('market_value.csv', **CSV_OPTIONS)

# Keep the return rows from FIRST_ROW up to (but excluding) the last row,
# renumbered from 0.
target = target[FIRST_ROW:-1].reset_index(drop=True)

# The two factor series are joined on their original row labels.
factor_frame = iml_data['iml'][FIRST_ROW:].to_frame('iml').join(
    wml_data['wml'][FIRST_ROW:].to_frame('wml')
)

mv_data = market_value['Adj Close']

# reset_index() (without drop=True) renumbers the rows from 0 and keeps the old
# labels as an 'index' column; the market series is then joined on the new
# labels, and rows with any missing value are discarded.
data = (
    factor_frame
    .reset_index()
    .join(mv_data.to_frame('market_value'))
    .dropna()
    .reset_index(drop=True)
)

# Column names of the return table; later cells skip the first one.
stocks = target.columns.values

In [None]:
# Display the assembled feature frame: 'iml', 'wml', 'market_value', plus the
# 'index' column left behind by reset_index() in the data-creation cell.
data

# LINEAR REGRESSION

In [None]:
# Feature matrix shared by every per-stock regression.
X = data[['iml', 'wml', 'market_value']].values

# One ordinary-least-squares fit per stock column; the first column of
# `target` is skipped (presumably a date/label column - confirm against returns.csv).
for Y in stocks[1:]:
    # The first target row is dropped before splitting.
    stock_returns = target[Y].values[1:]
    X_train, X_test, y_train, y_test = train_test_split(
        X, stock_returns, test_size=0.2, random_state=0
    )

    regressor = LinearRegression().fit(X_train, y_train)
    # Intercept and slopes of the fitted model, kept for interactive inspection.
    coeffs = (regressor.intercept_, regressor.coef_)
    y_pred = regressor.predict(X_test)

    # Hold-out error metrics for this stock.
    print('-----' + Y + '------')
    print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
    print('R2_score:', r2_score(y_test, y_pred))

# DECISION TREE


In [None]:
X = data[['iml', 'wml', 'market_value']].values

# Candidate values for max_depth.
max_depth_range = list(range(1, 6))
# Test-set R^2 of the selected tree, one entry per stock (same order as stocks[1:]).
accuracy_total = []
for Y in stocks[1:]:
    X_train, X_test, y_train, y_test = train_test_split(X, target[Y].values[1:], test_size=0.2, random_state=0, shuffle=False)
    # Hold out the tail of the training data for model selection, so that
    # max_depth is not tuned on the same test set that is used for reporting.
    X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0, shuffle=False)

    # Validation R^2 (what DecisionTreeRegressor.score returns) for each candidate depth.
    accuracy = []
    for depth in max_depth_range:
        clf = DecisionTreeRegressor(max_depth=depth, random_state=0)
        clf.fit(X_fit, y_fit)
        accuracy.append(clf.score(X_val, y_val))

    # Look the depth up in the candidate list instead of relying on
    # "position + 1", which only holds while the range starts at 1.
    best_depth = max_depth_range[accuracy.index(max(accuracy))]

    # Refit on the full training split with the selected depth and score once
    # on the untouched test split.
    dt = DecisionTreeRegressor(max_depth=best_depth, random_state=0)
    dt.fit(X_train, y_train)
    score = dt.score(X_test, y_test)
    accuracy_total.append(score)

    # The tree is visualized for one stock only. Without Graphviz, paste the
    # contents of tree.dot into http://webgraphviz.com/ instead.
    if Y == 'WTM':
        export_graphviz(dt, out_file='tree.dot', feature_names=['iml', 'wml', 'market_value'])

        # Render with Graphviz; a missing `dot` executable raises
        # FileNotFoundError and a failed render returns a non-zero status.
        try:
            dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
        except FileNotFoundError:
            dot_status = None

        if dot_status == 0:
            # Display in python
            plt.figure(figsize=(14, 18))
            plt.imshow(plt.imread('tree.png'))
            plt.axis('off')
            plt.show()
        else:
            print('Graphviz "dot" is unavailable or failed; tree.dot was written but not rendered.')

# Test-set R^2 of the selected tree for every stock.
plt.plot(stocks[1:], accuracy_total)
plt.xlabel('Stock')
plt.ylabel('Test R2 score')
plt.show()

# SVM

In [None]:
X = data[['iml', 'wml', 'market_value']].values

# One support-vector regressor per stock column (the first column of `target`
# is skipped).
for Y in stocks[1:]:
    # Identify which stock the following figure belongs to, as the MLP and
    # LSTM cells do.
    print('-----' + Y + '------')
    X_train, X_test, Y_train, Y_test = train_test_split(X, target[Y].values[1:], test_size=0.2, random_state=0, shuffle=False)

    # Standardize the features with statistics from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # The target is left unscaled, so the predictions are directly comparable
    # to Y_test and need no inverse transform.
    clf = SVR().fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    plt.plot(Y_test, color = 'black', label = 'Test data return')
    plt.plot(y_pred, color = 'green', label = 'Predicted data return')
    plt.title('Prediction of returns with SVM')
    plt.xlabel('Time')
    plt.ylabel('Data return')
    plt.legend()
    plt.show()

# MULTILAYER PERCEPTRON

In [None]:
# Features are rescaled to [0, 1]; the scaler is re-fitted on every stock's
# training split inside the loop.
scaler = MinMaxScaler(feature_range = (0, 1))

X = data[['iml', 'wml', 'market_value']].values
for Y in stocks[1:]:
    print('-----' + Y + '------')
    # shuffle=False keeps the rows in their original order (assumed
    # chronological), so the validation plot's "Time" axis is meaningful and
    # the split matches the decision-tree, SVM and LSTM cells.
    X_train, X_test, Y_train, Y_test = train_test_split(X, target[Y].values[1:], test_size=0.2, random_state=0, shuffle=False)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=0, shuffle=False)

    # Scale the features only; the scaler is fitted on the training split.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)

    # Two hidden ReLU layers followed by a single linear output unit.
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
    # Linear output: a ReLU here would clamp every prediction to >= 0, so the
    # network could never predict a negative return.
    model.add(tf.keras.layers.Dense(1))
    model.compile(optimizer="adam", loss="mean_squared_error")
    # Fit the model.
    model.fit(X_train, Y_train, epochs=100)
    # Evaluate the model on the test data (the loss is shown by Keras' progress output).
    model.evaluate(X_test, Y_test)

    # Backtest on the validation split. The target was never scaled, so the
    # predictions are already in return units; they must NOT go through
    # scaler.inverse_transform, which only knows the ranges of the features.
    y_pred = model.predict(X_val)[:, 0]
    plt.plot(Y_val, color = 'black', label = 'Validation data return')
    plt.plot(y_pred, color = 'green', label = 'Predicted data return')
    plt.title('Prediction of returns with MLP')
    plt.xlabel('Time')
    plt.ylabel('Data return')
    plt.legend()
    plt.show()

# LSTM

In [None]:
X = data[['iml', 'wml', 'market_value']].values

# Defined here so the cell does not depend on a `scaler` left over from
# whichever other cell happened to run last.
scaler = MinMaxScaler(feature_range=(0, 1))

for Y in stocks[1:]:
    print('-----' + Y + '------')
    X_train, X_test, Y_train, Y_test = train_test_split(X, target[Y].values[1:], test_size=0.2, random_state=0, shuffle=False)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state = 0, shuffle=False)

    # Scale the features only; the scaler is fitted on the training split.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)

    # Reshape to (samples, 3, 1): each of the three features becomes one
    # "timestep" with a single channel.
    # NOTE(review): the LSTM therefore iterates over features, not over time - confirm this is intended.
    X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],1)
    X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1)
    X_val = X_val.reshape(X_val.shape[0],X_val.shape[1],1)

    # Two stacked LSTM layers followed by a single linear output unit.
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.LSTM(20, input_shape=(X_train.shape[1], 1), return_sequences=True))
    model.add(tf.keras.layers.LSTM(20))
    # Linear output: a ReLU here would clamp every prediction to >= 0, so the
    # network could never predict a negative return.
    model.add(tf.keras.layers.Dense(1))
    model.compile(optimizer="adam", loss="mean_squared_error")
    # Fit the model.
    model.fit(X_train, Y_train, epochs = 100, batch_size = 32)
    # Evaluate the model on the test data (the loss is shown by Keras' progress output).
    model.evaluate(X_test, Y_test)

    # Backtest on the validation split. The target was never scaled, so the
    # predictions are already in return units; they must NOT go through
    # scaler.inverse_transform, which only knows the ranges of the features.
    y_pred = model.predict(X_val)[:, 0]
    plt.plot(Y_val, color = 'black', label = 'Validation data return')
    plt.plot(y_pred, color = 'green', label = 'Predicted data return')
    plt.title('Prediction of returns with LSTM')
    plt.xlabel('Time')
    plt.ylabel('Data return')
    plt.legend()
    plt.show()