# Personalized Stock Prediction App

In [6]:
# yfinance may need updating to the latest release for the data download to succeed.
# The packages listed in requirements.txt are installed with the interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt

Collecting scikeras (from -r requirements.txt (line 9))
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
# Import relevant packages

import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import tensorflow as tf
import sklearn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasRegressor


ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'

# Getting Data through API 

In [None]:
# Daily price history for Apple, from the start of 2011 up to the moment this cell runs.
start = dt.datetime(2011, 1, 1)
end = dt.datetime.now()
ticker = yf.Ticker('AAPL')

data = ticker.history(interval="1d", start=start, end=end)
print('The shape of the data is ', data.shape)
data.tail()

# Data Preprocessing and Visualization

In [None]:
# Select the daily closing price and normalize it to the range (0, 1).
# NOTE(review): the scaler is fitted on the full history, so the min/max used for scaling
# include rows that are later held out as the LSTM test set. Consider fitting on the
# training portion only.

data_close = data.Close.values.reshape(-1, 1)   # column vector: one row per trading day
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data_close)
scaled_data[:5]   # preview only; displaying the whole array floods the cell output

In [None]:
# Plot the full closing-price history.

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(data.Close)
ax.set_title('History Price of Apple')
ax.set_xlabel('Date')
ax.set_ylabel('Price $')
plt.show()

# Prepare Data for Different Training Models

In [26]:

def train_test_split_lstm(data, prediction_days, train_size=0.9):
    '''Split a series into chronological train/test windows for an RNN model.

    The first ``int(len(data) * train_size)`` rows form the training range and the
    remaining rows form the test range. Every sample is a window of the
    ``prediction_days`` values that precede its label, so the first test windows
    reach back into the end of the training range.

    Args:
        data: 2-D array of preprocessed values; only column 0 is used.
        prediction_days: number of consecutive past values used as one input window.
        train_size: fraction of the rows assigned to the training range.

    Returns:
        x_train, y_train, x_test, y_test. The inputs have shape
        (n_samples, prediction_days, 1) and the labels have shape (n_samples,).
        The test arrays are empty (but correctly shaped) when no rows are left
        after the training range.

    Raises:
        ValueError: if ``prediction_days`` is smaller than 1, or if the training
            range is not longer than one window, so no training sample can be built.
    '''
    if prediction_days < 1:
        raise ValueError('prediction_days must be a positive integer')

    n_samples = len(data)
    n_train = int(n_samples * train_size)
    if n_train <= prediction_days:
        raise ValueError(
            'training range has {} rows, which is not more than prediction_days={}'.format(
                n_train, prediction_days))

    def make_windows(first, last):
        # One sample per label position t in [first, last): inputs are data[t-prediction_days:t].
        positions = range(first, last)
        inputs = np.array([data[t - prediction_days:t, 0] for t in positions])
        labels = np.array([data[t, 0] for t in positions])
        # Explicit target shape keeps an empty split at (0, prediction_days, 1) instead of failing.
        return inputs.reshape(-1, prediction_days, 1), labels

    x_train, y_train = make_windows(prediction_days, n_train)
    x_test, y_test = make_windows(n_train, n_samples)

    return x_train, y_train, x_test, y_test
    

In [None]:
# Build the train/test windows for the RNN model

prediction_days = 60  # length of each input window, in days

x_train, y_train, x_test, y_test = train_test_split_lstm(scaled_data, prediction_days)

print('x_train has {} samples, x_test has {} samples'.format(len(x_train), len(x_test)))

In [28]:
# Train/test split for the regression models

num_days = 252     # number of most recent closing prices to use
future_days = 1    # how many days ahead the label lies from its input

# Pair each closing price (input) with the price `future_days` later (label).
data_reg = data_close[-num_days:]
X, y = data_reg[:-future_days], data_reg[future_days:]

# Chronological split: shuffle stays off so the test rows are the most recent ones.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.2, shuffle=False
)
x_future = X[-future_days:]

# Regression Model

Build regression models using only the most recent `num_days` (252) daily closing prices, i.e. roughly the past year of trading.

In [None]:
# Fit DecisionTree, LinearRegression and SVM regressors on the training split.
# The tree is bound to `dt_model`, not `dt`: this notebook imports `datetime as dt`,
# and rebinding `dt` here would make the data-download cell fail on any re-run.
dt_model=DecisionTreeRegressor().fit(x_train1, y_train1)
lr=LinearRegression().fit(x_train1, y_train1)
svm=SVR(kernel='poly').fit(x_train1, y_train1.ravel())   # SVR is given a 1-D target

# Labels are padded to equal width so the printed report lines up.
report='{}: mean square error is {}, r2_score is {}'

# evaluate the performance of the models on the training set
print('For training set: ')
for label, estimator in (('Linear      ', lr), ('DecisionTree', dt_model), ('SVM         ', svm)):
    train_pred=estimator.predict(x_train1)
    print(report.format(label, mean_squared_error(y_train1, train_pred), r2_score(y_train1, train_pred)))

# make predictions on the test set
dt_pred=dt_model.predict(x_test1)
lr_pred=lr.predict(x_test1)
svm_pred=svm.predict(x_test1)

# evaluate the performance of the models on the test set
print('For test set: ')
for label, test_pred in (('Linear      ', lr_pred), ('DecisionTree', dt_pred), ('SVM         ', svm_pred)):
    print(report.format(label, mean_squared_error(y_test1, test_pred), r2_score(y_test1, test_pred)))


**Display the test prediction (recent stock price)**

In [None]:
# Put the actual close next to each model's test-set prediction.
# Selecting ['Close'] and calling .assign() produces a new DataFrame, so nothing is written
# into a slice of `data` (the original column assignments did that and triggered
# pandas' SettingWithCopyWarning).
data_future=data.iloc[-len(x_test1):][['Close']].assign(**{
    'dt prediction': np.ravel(dt_pred),
    'lr prediction': np.ravel(lr_pred),   # flattened: the linear model was fitted on a 2-D target
    'svm prediction': np.ravel(svm_pred),
})
data_future.tail()

In [None]:
# Two stacked panels: the whole regression window, then only the test period.
# Both panels draw the same three prediction curves over the actual closing price.
legend_labels=['Actual Price', 'DecisionTree Prediction', 'Linear Prediction', 'SVM Prediction']
prediction_columns=['dt prediction', 'lr prediction', 'svm prediction']
panels=(
    ('Regression Model', data.Close[-num_days:]),
    ('Recent 2 months', data.Close[-len(y_test1):]),
)

plt.figure(figsize=(16, 12))
for position, (panel_title, actual) in enumerate(panels, start=1):
    plt.subplot(2, 1, position)
    plt.title(panel_title)
    plt.plot(actual)
    for column in prediction_columns:
        plt.plot(data_future[column])
    plt.xlabel('Date')
    plt.ylabel('Closing Price $')
    plt.legend(legend_labels)
plt.show()

### Grid search to optimize parameters

In [None]:
# Optimize parameters for DecisionTree Regression
#
# The criterion names 'mse' and 'mae' were removed from scikit-learn; their replacements are
# 'squared_error' and 'absolute_error'. With the old names every candidate fails validation.

model=DecisionTreeRegressor()
# Trailing comments presumably record the best value found in an earlier run (TODO confirm).
parameters={'max_depth': (None, 2, 3, 5, 8),                    #8
            'criterion': ('squared_error', 'absolute_error'),   #squared_error (formerly 'mse')
            'splitter': ('best', 'random')                      #random
           }

cv_dt=GridSearchCV(model, param_grid=parameters,verbose=3)
cv_dt.fit(x_train1, y_train1)
cv_dt_pred=cv_dt.predict(x_test1)

print('The best parameter set: ', cv_dt.best_params_)
print('Trainset: mean square error is {}, r2_score is {}'.format(mean_squared_error(y_train1, cv_dt.predict(x_train1)),\
                                                                     r2_score(y_train1, cv_dt.predict(x_train1))))
print('Testset: mean square error is {}, r2_score is {}'.format(mean_squared_error(y_test1, cv_dt_pred),\
                                                                     r2_score(y_test1, cv_dt_pred)))

In [None]:
# Grid search over kernel and regularization strength for the SVM regressor
model=SVR()
print(model.get_params())

# Trailing comments presumably record the best value found in an earlier run (TODO confirm).
parameters={
    'kernel': ('poly', 'rbf', 'sigmoid'),      #poly
    'C': (0.0001, 0.001, 0.01, 0.1, 1.0),      #C=1.0
}

cv_svm=GridSearchCV(model, param_grid=parameters, verbose=3)
cv_svm.fit(x_train1, y_train1.ravel())   # SVR is given a 1-D target
cv_svm_pred=cv_svm.predict(x_test1)
cv_svm_fit=cv_svm.predict(x_train1)

print('The best parameter set: ', cv_svm.best_params_)
print('Trainset: mean square error is {}, r2_score is {}'.format(
    mean_squared_error(y_train1, cv_svm_fit), r2_score(y_train1, cv_svm_fit)))
print('Testset: mean square error is {}, r2_score is {}'.format(
    mean_squared_error(y_test1, cv_svm_pred), r2_score(y_test1, cv_svm_pred)))

### Final prediction for the next day stock price

In [None]:
# Predict the next closing price from the closing price `future_days` rows before the end
# of the regression window (the latest close when future_days is 1).

x_next_day = data_reg[-future_days].reshape(-1, 1)

lr_next_day = lr.predict(x_next_day)
dt_next_day = cv_dt.predict(x_next_day)      # tuned tree from the grid search
svm_next_day = cv_svm.predict(x_next_day)    # tuned SVR from the grid search

print('Next day prediction by Linear model is ', lr_next_day)
print('Next day prediction by DecisionTree model is ', dt_next_day)
print('Next day prediction by SVM model is', svm_next_day)


### Final model assessment

In [None]:
# Express each model's test RMSE as a percentage of the mean test-set price.
average_price = np.mean(y_test1)

lr_rmse = np.sqrt(mean_squared_error(y_test1, lr_pred))
lr_percent = np.round(lr_rmse / average_price * 100.0, 2)

dt_rmse = np.sqrt(mean_squared_error(y_test1, cv_dt_pred))
dt_percent = np.round(dt_rmse / average_price * 100.0, 2)

svm_rmse = np.sqrt(mean_squared_error(y_test1, cv_svm_pred))
svm_percent = np.round(svm_rmse / average_price * 100.0, 2)

print('Predicted value is within {}% of the actual value on average for Linear Regression.'.format(lr_percent))
print('Predicted value is within {}% of the actual value on average for DecisionTree Regression.'.format(dt_percent))
print('Predicted value is within {}% of the actual value on average for SVM Regression.'.format(svm_percent))

# RNN model

In [36]:
# Hyper-parameters for the LSTM network

input_size = x_train.shape[1:3]   # (window length, features per step), taken from the training inputs
lstm_size = 64                    # units in each LSTM layer
dense_size = 10                   # units in the hidden Dense layer
dropout = 0.2                     # rate passed to each Dropout layer

In [None]:
# build LSTM model and train the model for 20 epochs

def create_model(lstm_size=64, dropout=0.2, dense_size=10):
    '''Build and compile the stacked-LSTM regression network.

    The layers are: LSTM (returning sequences) -> Dropout -> LSTM -> Dropout ->
    Dense(dense_size) -> Dense(1). The input shape is read from the notebook-level
    variable ``input_size``, which must be defined before this function is called.

    Args:
        lstm_size: number of units in each of the two LSTM layers.
        dropout: rate passed to each of the two Dropout layers.
        dense_size: number of units in the hidden Dense layer.

    Returns:
        A Sequential model compiled with the 'adam' optimizer and
        'mean_squared_error' loss.
    '''
    layers=[
        LSTM(units=lstm_size, return_sequences=True, input_shape=input_size),
        Dropout(dropout),
        LSTM(units=lstm_size, return_sequences=False),
        Dropout(dropout),
        Dense(units=dense_size),
        Dense(units=1),   # single output: the next scaled value
    ]
    network=Sequential(layers)
    network.compile(optimizer='adam', loss='mean_squared_error')

    return network

model = create_model(lstm_size=lstm_size, dropout=dropout, dense_size=dense_size)
model.summary()
# The held-out test windows are passed as validation data, so their loss is reported after each epoch.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=20, batch_size=64)

In [None]:
# Predict on both splits, then map everything back from the (0, 1) scale to prices.
y_pred = model.predict(x_test)
y_fit = model.predict(x_train)

y_fit_price = scaler.inverse_transform(y_fit)
y_pred_price = scaler.inverse_transform(y_pred)
y_train_price = scaler.inverse_transform(y_train.reshape(-1, 1))
y_test_price = scaler.inverse_transform(y_test.reshape(-1, 1))

summary = 'mean square error is {}, r2_score is {}'

print('For training set: ')
print(summary.format(mean_squared_error(y_train_price, y_fit_price),
                     r2_score(y_train_price, y_fit_price)))
print('For test set: ')
print(summary.format(mean_squared_error(y_test_price, y_pred_price),
                     r2_score(y_test_price, y_pred_price)))


In [None]:
# Final assessment: test RMSE as a percentage of the mean test-set price
average_price = np.mean(y_test_price)
lstm_rmse = np.sqrt(mean_squared_error(y_test_price, y_pred_price))
lstm_percent = np.round(lstm_rmse / average_price * 100.0, 2)

print('Predicted value is within {}% of the actual value on average for LSTM.'.format(lstm_percent))


#### The cell below takes a long time to run. To speed it up, reduce the number of candidate values in the `parameters` dict.

In [None]:
# Optimize parameters for LSTM model
#
# SciKeras takes the build function through `model=` (`build_fn` is the deprecated name from
# the old Keras wrappers). Arguments meant for the build function must be routed with the
# `model__` prefix; un-prefixed keys that were not declared in the KerasRegressor constructor
# are rejected by `set_params` during the grid search.
lstm=KerasRegressor(model=create_model, epochs=20, batch_size=64, verbose=0)
# Trailing comments presumably record the best value found in an earlier run (TODO confirm).
parameters = {'model__lstm_size': [20, 32, 64],    #64
              'model__dropout': [0.2, 0.3, 0.5],   #0.2
              'model__dense_size': [50, 20, 10]}   #10

cv_lstm=GridSearchCV(lstm, parameters, verbose=3)
cv_lstm.fit(x_train, y_train)

# Predictions are reshaped to column vectors because the scaler was fitted on a single column.
cv_lstm_pred=cv_lstm.predict(x_test).reshape(-1,1)
y_pred_price=scaler.inverse_transform(cv_lstm_pred)

y_fit=cv_lstm.predict(x_train).reshape(-1,1)
y_fit_price=scaler.inverse_transform(y_fit)
y_train_price=scaler.inverse_transform(y_train.reshape(-1,1))
y_test_price=scaler.inverse_transform(y_test.reshape(-1,1))


print('The best parameter set: ', cv_lstm.best_params_)
print('For training set: ')
print('mean square error is {}, r2_score is {}'.format(mean_squared_error(y_train_price, y_fit_price),\
                                                       r2_score(y_train_price, y_fit_price)))
print('For test set: ')
print('mean square error is {}, r2_score is {}'.format(mean_squared_error(y_test_price, y_pred_price),\
                                                       r2_score(y_test_price, y_pred_price)))



**LSTM performance**

In [None]:
# Visualize predicted price and actual price
# Rows before n_train are the training range; the remaining rows are the test range.
n_train=x_train.shape[0]+prediction_days
train=data.iloc[:n_train, :]
# Copy before adding a column: writing into a slice of `data` is chained assignment
# and raises pandas' SettingWithCopyWarning.
valid=data.iloc[n_train:, :].copy()
valid['Prediction']=np.ravel(y_pred_price)   # flatten the (n, 1) prediction array to one column

plt.figure(figsize=(16, 12))
plt.subplot(2,1,1)
plt.title('LSTM model for Apple stock Prediction')
plt.plot(train['Close'], color='blue')
plt.plot(valid[['Close', 'Prediction']])
plt.xlabel('Date')
plt.ylabel('Closing Price $')
plt.legend(['Train', 'Val', 'Prediction'])

plt.subplot(2,1,2)
plt.title('Recent Stock Validation')
plt.plot(valid[['Close', 'Prediction']])
plt.xlabel('Date')
plt.ylabel('Closing Price $')
plt.legend([ 'Val', 'Prediction'])
plt.show()

**Plot recent stock prediction and compare with actual price**

In [None]:
# Compare all models on the test set
# One panel per model, each drawn over the same recent actual closing prices.
# The LSTM panel now has a title like the other three (it had none before).
actual_recent=data.Close[-len(y_test1):]
panels=(
    ('Apple Stock Predicted by Linear Regression', data_future['lr prediction'], 'Linear Prediction'),
    ('Apple Stock Predicted by DecisionTree', data_future['dt prediction'], 'DecisionTree Prediction'),
    ('Apple Stock Predicted by SVM', data_future['svm prediction'], 'SVM Prediction'),
    # The LSTM test range is cut to its last len(y_test1) rows before plotting.
    ('Apple Stock Predicted by LSTM', valid['Prediction'][-len(y_test1):], 'LSTM Prediction'),
)

plt.figure(figsize=(16, 12))
for position, (panel_title, predicted, predicted_label) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    plt.plot(actual_recent)
    plt.plot(predicted)
    plt.xlabel('Date')
    plt.ylabel('Closing Price $')
    plt.legend(['Actual Price', predicted_label])

plt.show()


**LSTM next day prediction**

In [42]:
# The last `prediction_days` scaled closes, shaped as a batch of one window: (1, window, 1).
latest_window=scaled_data[len(scaled_data)-prediction_days:, 0]
next_day=np.array(latest_window).reshape(1, -1, 1)

In [None]:
# Predict from the latest window and convert the result from the (0, 1) scale back to a price.
pred_next_day = scaler.inverse_transform(model.predict(next_day))
print('The prediction for next day is ', pred_next_day[0][0])

# Time series cross validation for LSTM

In [60]:
# split data for cross validation

def rolling_forecast(x_train, y_train, min_train_size=1500, horizon=200):
    '''
    Generate expanding-window (rolling-origin) splits for time-series validation.

    The first fold trains on the first ``min_train_size`` samples and validates on
    the next ``horizon`` samples. Each later fold extends the training range by
    ``horizon`` samples and validates on the block that follows it. Only folds
    with a complete validation block are produced.

    Args:
        x_train (numpy array): training inputs, samples along the first axis
        y_train (numpy array): training labels, aligned with ``x_train``
        min_train_size (int): number of training samples in the first fold
        horizon (int): number of validation samples in every fold

    Yields:
        (split_x_train, split_y_train, split_x_val, split_y_val) for each fold

    Raises:
        ValueError: when iteration starts, if ``min_train_size`` or ``horizon``
            is smaller than 1.
    '''
    if min_train_size < 1 or horizon < 1:
        raise ValueError('min_train_size and horizon must be positive integers')

    # Size the folds from the array that was passed in, not from a notebook-level variable.
    n_samples = x_train.shape[0]
    # Largest offset whose validation block still ends inside the data; +1 below keeps
    # the fold that fits exactly.
    last_offset = n_samples - min_train_size - horizon

    for offset in range(0, last_offset + 1, horizon):
        cut = min_train_size + offset
        stop = cut + horizon
        yield x_train[:cut], y_train[:cut], x_train[cut:stop], y_train[cut:stop]

In [None]:
# cross validation
#
# A fresh model is built for every fold so each score reflects training on that fold's data
# only; reusing one model would carry weights over from earlier folds.
# Fold predictions go into `cv_pred_price` so the test-set `y_pred_price` used by the
# plotting cells is not overwritten.

cv=rolling_forecast(x_train, y_train)
cv_mse, cv_percentage=[], []

for cv_x_train, cv_y_train, cv_x_val, cv_y_val in cv:
    model_cv=create_model(lstm_size, dropout, dense_size)
    model_cv.fit(cv_x_train, cv_y_train, epochs=20, batch_size=64, verbose=0)
    preds=model_cv.predict(cv_x_val)

    # Convert from the (0, 1) scale back to prices before scoring.
    y_val_price=scaler.inverse_transform(cv_y_val.reshape(-1,1))
    cv_pred_price=scaler.inverse_transform(preds.reshape(-1,1))

    mse=mean_squared_error(y_val_price, cv_pred_price)
    percentage=np.sqrt(mse)/np.mean(y_val_price)*100.0   # RMSE relative to the mean validation price
    print('mse is {}, prediction within {}%'.format(mse, percentage))

    cv_mse.append(mse)
    cv_percentage.append(percentage)
