In [24]:
from IPython.core.debugger import set_trace

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

plt.style.use(style="seaborn")
%matplotlib inline

In [25]:
# Load the TSLA price history from a path relative to the notebook's working directory.
df = pd.read_csv("data/TSLA-1Y.csv")

In [26]:
# Preview the first rows to check which columns were loaded.
df.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-04-06,102.239998,104.199997,99.592003,103.248001,103.248001,74509000
1,2020-04-07,109.0,113.0,106.468002,109.089996,109.089996,89599000
2,2020-04-08,110.839996,111.442001,106.666,109.767998,109.767998,63280000
3,2020-04-09,112.417999,115.036003,111.421997,114.599998,114.599998,68250000
4,2020-04-13,118.031998,130.399994,116.106003,130.190002,130.190002,112377000


In [27]:
# Keep only the closing price; .copy() gives an independent frame to add columns to.
df = df.loc[:, ["Close"]].copy()

In [28]:
# Confirm that only the Close column remains.
df.head(5)

Unnamed: 0,Close
0,103.248001
1,109.089996
2,109.767998
3,114.599998
4,130.190002


In [29]:
# The target for each row is the following row's close (shift by one row upwards).
df["target"] = df["Close"].shift(periods=-1)

In [30]:
# shift(-1) leaves the final row without a target (NaN), so drop it.
# NOTE: this mutates df in place; re-running the shift/dropna cells trims another row each time.
df.dropna(inplace = True)

In [31]:
df.head(5) # each row now pairs a close with the next row's close, which is what we try to predict

Unnamed: 0,Close,target
0,103.248001,109.089996
1,109.089996,109.767998
2,109.767998,114.599998
3,114.599998,130.190002
4,130.190002,141.977997


In [32]:
def train_test_split(data, perc): #perc is %of test
    """Split ``data`` in row order into a leading train part and a trailing test part.

    No shuffling is done, so the test rows are always the last rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Any object exposing ``.values`` (a 2-D array of rows) and ``len()``.
    perc : float
        Fraction of the rows, between 0 and 1 inclusive, reserved for the test set.

    Returns
    -------
    tuple
        ``(train, test)`` arrays: the first ``floor(len(data) * (1 - perc))`` rows
        and the remaining rows.

    Raises
    ------
    ValueError
        If ``perc`` is not within ``[0, 1]`` (this also rejects NaN).
    """
    if not 0 <= perc <= 1:
        raise ValueError(f"perc must be between 0 and 1, got {perc!r}")

    values = data.values
    # 1 - perc is inexact in binary floating point (1 - 0.9 == 0.09999999999999998),
    # so truncating the raw product can lose a whole row (10 rows, perc=0.9 -> 0 train
    # rows). A tiny nudge before truncation absorbs that representation error without
    # changing any result that is not within 1e-9 of an integer.
    n = int(len(values) * (1 - perc) + 1e-9)
    return values[:n], values[n:] #return train, test

In [33]:
# Hold out the last 20% of the rows as the test set (no shuffling).
train, test = train_test_split(df, 0.2)

In [34]:
# Row counts: full frame, train part, test part (one per line).
print(len(df), len(train), len(test), sep="\n")

249
199
50


In [35]:
# The last column is the target; every column before it is a feature.
X, y = train[:, :-1], train[:, -1]

from xgboost import XGBRegressor

In [36]:
# Gradient-boosted regressor with a squared-error objective and 1000 estimators.
model = XGBRegressor(objective="reg:squarederror", n_estimators=1000)
# Fit on the training features/targets; as the cell's last expression, the
# fitted model's repr is displayed below.
model.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
# First test row: [Close, target].
test[0]

array([850.450012, 844.98999 ])

In [38]:
# predict() expects a 2-D input, so wrap the single Close value as one row.
val = np.reshape(test[0, 0], (1, -1))
pred = model.predict(val)

print(pred[0])

854.4084


In [40]:
def xgb_predict(train, val):
    """Fit a fresh XGBoost regressor on ``train`` and forecast a single sample.

    ``train`` is converted to a 2-D array whose last column is the target and
    whose remaining columns are the features. ``val`` holds the feature
    value(s) of the one sample to predict and is reshaped into a single row.
    Returns the first (and only) element of the model's prediction.
    """
    history = np.array(train)
    features = history[:, :-1]
    targets = history[:, -1]

    regressor = XGBRegressor(n_estimators=1000, objective="reg:squarederror")
    regressor.fit(features, targets)

    sample = np.array(val).reshape(1, -1)
    return regressor.predict(sample)[0]

In [41]:
# Sanity check: the helper, given the same training data and input, should
# reproduce the prediction printed by the manual fit/predict cells above.
xgb_predict(train, test[0, 0])

854.4084

In [47]:
#### Walk-forward validation

#Because we forecast one step (one trading day) ahead, we start by predicting
#the first record in the test dataset.

#Afterwards we append the real observation from the test set to the training set, refit the model, and then predict the next record in the test dataset.

#We'll evaluate the model with the RMSE (Root Mean Squared Error) metric.

In [48]:
from sklearn.metrics import mean_squared_error


def validate(data, perc):
    """Walk-forward validation of the one-step-ahead XGBoost forecaster.

    ``data`` is split with ``train_test_split(data, perc)``. For each test row,
    in order, the model is refitted on all rows seen so far (via
    ``xgb_predict``) and asked to predict that row's target; the row, including
    its true target, is then appended to the history before the next step.

    Parameters
    ----------
    data :
        Passed unchanged to ``train_test_split``. In the resulting arrays the
        last column is the target and the preceding columns are the features.
    perc : float
        Fraction of the rows held out as the test set.

    Returns
    -------
    tuple
        ``(error, actual, predictions)``: the RMSE over the test set, the true
        test targets (``test[:, -1]``), and the list of per-row predictions.
    """
    train, test = train_test_split(data, perc)
    actual = test[:, -1]

    history = list(train)
    predictions = []
    for row in test:
        # Forward the complete feature vector rather than only its first
        # element, so this keeps working if more feature columns are added;
        # xgb_predict reshapes its input into a single sample either way.
        predictions.append(xgb_predict(history, row[:-1]))
        # Reveal the true observation only after it has been predicted.
        history.append(row)

    # mean_squared_error(..., squared=False) is deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root explicitly works on every version.
    error = np.sqrt(mean_squared_error(actual, predictions))

    return error, actual, predictions #error, original results, predictions

In [49]:
%%time
# Walk-forward validation refits the model once per test row, so this cell is slow.
# NOTE: this rebinds `y` and `pred`, which earlier cells used for the training
# targets and the single-sample prediction.
rmse, y, pred = validate(df, 0.2) #20% for test set

print(rmse)



45.32960725092994
Wall time: 15.7 s


In [50]:
# Next step: reduce the RMSE, either by tuning the model's hyperparameters or by trying a different model.