In [56]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
# Load the AAPL price history from the relative path 'AAPL.csv' into a DataFrame
df = pd.read_csv(filepath_or_buffer='AAPL.csv')

In [58]:
# Display the loaded DataFrame to eyeball its columns and a sample of its rows
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,07/12/2020,122.309998,124.570000,122.250000,123.750000,122.991783,86712000
1,08/12/2020,124.370003,124.980003,123.089996,124.379997,123.617920,82225500
2,09/12/2020,124.529999,125.949997,121.000000,121.779999,121.033844,115089200
3,10/12/2020,120.500000,123.870003,120.150002,123.239998,122.484901,81312200
4,11/12/2020,122.430000,122.760002,120.550003,122.410004,121.659996,86939800
...,...,...,...,...,...,...,...
247,30/11/2021,159.990005,165.520004,159.919998,165.300003,165.300003,174048100
248,01/12/2021,167.479996,170.300003,164.529999,164.770004,164.770004,152052500
249,02/12/2021,158.740005,164.199997,157.800003,163.759995,163.759995,136739200
250,03/12/2021,164.020004,164.960007,159.720001,161.839996,161.839996,117938300


In [59]:
# Exploratory data analysis
### Data type of each column (in the output, note that Date is loaded as a plain object column, not as a datetime)
df.dtypes


Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [60]:
### Shape of the data, as (number of rows, number of columns)
df.shape

(252, 7)

In [61]:
### Missing values
# Count the NaN entries of every column: the sum runs down the rows (axis=0),
# giving one total per variable rather than one per observation
df.isnull().sum(axis=0)

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## Create the labels column
To create the labels column, we take the variable "Close" (the market closing price of AAPL) and shift it 5 rows up. The result is a series whose last 5 rows are NaN and whose rows 0 to 4 hold the values of rows 5 to 9 of our data set.

In [62]:
### Build the label column

## Forecast horizon: how many rows ahead each label lies
steps = 5 # Can be changed

## Pair every row with the closing price found `steps` rows later;
## the final `steps` rows have no such value and become NaN
labels = df['Close'].shift(periods=-steps)


labels

0      121.779999
1      127.879997
2      127.809998
3      128.699997
4      126.660004
          ...    
247           NaN
248           NaN
249           NaN
250           NaN
251           NaN
Name: Close, Length: 252, dtype: float64

## Create the feature array (X)
The feature array is the same "Close" variable without the last 5 rows.

In [63]:
### Build the feature matrix
# The double brackets select a one-column DataFrame, so X is 2-D (one column, one row per observation)
X = df[['Close']].to_numpy(copy=True)

In [64]:
### Standardize the X array
from sklearn.preprocessing import scale
# Column-wise standardization: subtract the mean and divide by the standard deviation
X = scale(X, axis=0, with_mean=True, with_std=True)


In [65]:
# Hold back the last `steps` rows of the scaled X; these are the inputs later passed to lr.predict.
# This cell must run before the next one, which removes these same rows from X.
X_lately = X[-steps:]

In [66]:
# The final X: drop the last `steps` rows, i.e. the rows whose label is NaN.
# NOTE: this rebinding is not idempotent - re-running the cell trims another `steps` rows from X.
X = X[:-steps]


## Create the y vector from the labels vector
It is the same vector as the labels, but without the NaN values.

In [67]:
# Label vector

# Use a non-mutating dropna so that `labels` itself keeps its NaN tail and stays row-aligned
# with df; the cell can then be re-run without depending on (or altering) earlier state.
y = np.array(labels.dropna())

# Sanity check: every feature row must be paired with exactly one label
assert len(y) == len(X), f"X has {len(X)} rows but y has {len(y)} labels"


## Summary 
So what we have now is:
* A **y** vector (not standardized) that contains all of Apple's closing prices except those of the first 5 days.
* An **X** array (standardized) that contains all of Apple's closing prices except those of the last 5 days.
* The **y** and **X** arrays have the same length.
* The idea is that the closing price on day n is used to predict the closing price on day n+5 (i.e. 5 rows, or trading days, later).  
  
* EX: original_vector = 122;124;121;125;123;122;123;125;127.
    * If steps = 3
    * y = 125;123;122;123;125;127
    * X = 122;124;121;125;123;122;
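    * Inputs held back for forecasting (`X_lately`): the last 3 values, 123;125;127. The model uses them to predict the 3 not-yet-observed prices that follow the data set.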

## Training and testing split

In [68]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so that the split - and therefore the test score and the forecast -
# is identical on every Restart & Run All.
# NOTE(review): the rows are consecutive trading days and are shuffled before splitting, so test
# rows are interleaved in time with training rows; consider shuffle=False for a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training the LR model

In [69]:
from sklearn.linear_model import LinearRegression

In [70]:
# Create the linear regression estimator, then fit it on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

## Quality of the model

In [71]:
# Evaluate the fitted model on the held-out test split (LinearRegression.score returns R^2)
score = lr.score(X=X_test, y=y_test)

In [72]:
# Predict from the held-back last `steps` scaled rows (X_lately)
forecast = lr.predict(X=X_lately)

In [73]:
# Bundle the test score and the forecast into a single dict and print it
response = {'test_score': score, 'forcast_set': forecast}
print(response)

{'test_score': 0.8575041745586153, 'forcast_set': array([164.73467862, 164.23403236, 163.2799603 , 161.46629588,
       165.2022595 ])}


In [74]:
# Reference: https://thecleverprogrammer.com/2020/11/14/stock-price-prediction-using-machine-learning/