In [1]:
import pandas as pd

# Columns kept for modelling; `label` is the regression target.
columns = [
    'Prev Close', 'Open', 'High', 'Low', 'Last',
    'Close', 'VWAP', 'Volume', 'Month',
]
label = 'High'

# Read the raw CSV once and derive the frame used downstream from it, so the
# cell can be re-run without mutating an already-filtered frame.
stock_raw = pd.read_csv('DRREDDY_train.csv')
trade_month = pd.DatetimeIndex(stock_raw['Date']).month
stock = stock_raw.assign(Month=trade_month)[columns]

# Quick sanity checks: dtypes / null counts, then summary statistics.
stock.info()
stock.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5163 entries, 0 to 5162
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Prev Close  5163 non-null   float64
 1   Open        5163 non-null   float64
 2   High        5163 non-null   float64
 3   Low         5163 non-null   float64
 4   Last        5163 non-null   float64
 5   Close       5163 non-null   float64
 6   VWAP        5163 non-null   float64
 7   Volume      5163 non-null   int64  
 8   Month       5163 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 363.1 KB


Unnamed: 0,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Month
count,5163.0,5163.0,5163.0,5163.0,5163.0,5163.0,5163.0,5163.0,5163.0
mean,1743.353322,1745.428975,1769.765214,1719.800475,1743.919194,1744.087643,1745.193287,414316.6,6.455355
std,980.627609,982.404452,992.503187,972.041822,981.633537,981.786655,982.209638,576181.4,3.424858
min,374.0,365.65,384.95,355.25,375.15,374.0,372.55,707.0,1.0
25%,899.25,899.0,910.0,883.1,900.0,899.25,899.045,163534.5,3.0
50%,1496.0,1500.0,1523.85,1472.95,1496.0,1496.0,1497.99,288141.0,6.0
75%,2550.65,2550.0,2582.775,2518.25,2548.675,2551.2,2550.07,497045.0,9.0
max,5333.35,5333.35,5512.65,5111.25,5306.0,5333.35,5287.28,21987150.0,12.0


In [2]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# stock.hist(bins=50, figsize=(20, 15))

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(stock, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

# Separate the target column from the feature columns for each split.
stock_train_labels = train_set[label].copy()
stock_train_set = train_set.drop(columns=label)

stock_test_labels = test_set[label].copy()
stock_test_set = test_set.drop(columns=label)

Rows in train set: 4130
Rows in test set: 1033



## Correlation

In [4]:
# Pairwise correlations on the training split only, then rank every column
# by its correlation with the target.
corr_matrix = train_set.corr()
label_correlations = corr_matrix[label]
label_correlations.sort_values(ascending=False)

High          1.000000
VWAP          0.999791
Close         0.999648
Last          0.999629
Open          0.999561
Low           0.999288
Prev Close    0.999272
Volume        0.283956
Month         0.014808
Name: High, dtype: float64

In [5]:
# stock.plot(kind="scatter", x=label, y="Last", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="VWAP", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="Close", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="Low", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="Open", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="Prev Close", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="Volume", alpha=0.8)
# stock.plot(kind="scatter", x=label, y="Month", alpha=0.8)

## Create Pipeline

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Preprocessing applied to every feature column: fill missing values with the
# column median, then standardise.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scalar', StandardScaler()),
]
my_pipeline = Pipeline(steps=preprocessing_steps)

In [7]:
# Fit the preprocessing pipeline on the training features only and keep the
# transformed array; later cells reuse the fitted pipeline via `transform`.
stock_num_train = my_pipeline.fit_transform(X=stock_train_set)
stock_num_train

array([[ 0.76803952,  0.74331219,  0.66597298, ...,  0.69894463,
         0.65302548,  1.05331505],
       [ 0.95102364,  0.96755164,  0.99888461, ...,  0.98514258,
         0.84006167,  1.63919938],
       [ 1.33828103,  1.3341166 ,  1.37595324, ...,  1.34937216,
        -0.33880151,  1.05331505],
       ...,
       [-0.08275712, -0.07992308, -0.06937295, ..., -0.08783592,
        -0.29552084, -0.41139578],
       [ 1.65212931,  1.61160013,  1.6514556 , ...,  1.63872538,
        -0.34659139, -1.29022227],
       [-0.80720391, -0.81202907, -0.80125052, ..., -0.80826346,
        -0.61763657, -0.11845361]])

## Selecting a desired model

In [8]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regression trained on the pipeline-transformed features.
model = LinearRegression()
model.fit(X=stock_num_train, y=stock_train_labels)

LinearRegression()

In [9]:
# Spot-check: compare predictions against the true target for the first five
# held-out rows.
some_data = stock_test_set.head(5)
some_labels = stock_test_labels.head(5)

# Reuse the already-fitted pipeline (transform only, no refit).
prepared_data = my_pipeline.transform(some_data)

print(some_data)
print('=====PREDICTED=====', model.predict(prepared_data), sep='\n')
print('=====ACTUAL=====', list(some_labels), sep='\n')

import numpy as np
from sklearn.metrics import mean_squared_error
# Root-mean-squared error over the whole held-out test split.
stock_test_prepared = my_pipeline.transform(stock_test_set)
stock_predictions = model.predict(stock_test_prepared)
mse = mean_squared_error(y_true=stock_test_labels, y_pred=stock_predictions)
rmse = np.sqrt(mse)
print('=====RMSE=====', rmse, sep='\n')

      Prev Close     Open      Low     Last    Close     VWAP  Volume  Month
1726      790.80   790.00   775.10   781.05   779.90   786.79  289516     11
1666     1495.70  1502.90  1462.55  1474.00  1474.20  1476.95  193231      8
4230     2998.55  2990.00  2979.10  2983.95  2986.00  3002.75  106615     12
1181      756.10   758.00   746.00   749.20   750.35   757.32  415561      9
3129     1657.95  1661.25  1633.85  1646.80  1644.75  1648.31  293174      7
=====PREDICTED=====
[ 800.70121138 1502.85181181 3023.50747987  770.43935113 1671.14380494]
=====ACTUAL=====
[796.9, 1502.9, 3028.0, 765.0, 1663.9]
=====RMSE=====
10.59015830606504


## Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training split. scikit-learn reports the
# *negated* MSE for this scorer, so flip the sign before taking the root.
scores = cross_val_score(
    estimator=model,
    X=stock_num_train,
    y=stock_train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(np.negative(scores))

In [11]:
def print_scores(scores):
    """Print a collection of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores
        Object exposing ``mean()`` and ``std()`` (e.g. a NumPy array of
        per-fold scores). It is printed as-is on the first line.
    """
    report_rows = (
        ("Scores:", scores),
        ("Mean: ", scores.mean()),
        ("Standard deviation: ", scores.std()),
    )
    for caption, value in report_rows:
        print(caption, value)

In [12]:
# Summarise the per-fold cross-validation RMSE values (scores, mean, std).
print_scores(rmse_scores)

Scores: [17.3696461  12.01794196 22.69148155 11.58301361 11.43475673 11.36863468
 11.89515773  9.91984593 17.17113606 11.50240711]
Mean:  13.695402148139985
Standard deviation:  3.829041576764112


## Saving Model

In [13]:
from joblib import dump, load
# The regressor was trained on features produced by `my_pipeline`, so it is
# only usable together with that fitted pipeline. Persist the pipeline as well;
# otherwise a fresh session can load the model but has no way to prepare inputs.
dump(my_pipeline, 'LinerReg_pipeline.joblib')

# Model artifact (file name unchanged so existing loaders keep working). Kept
# as the last expression so the cell output is still the model's file list.
dump(model, 'LinerReg.joblib')

['LinerReg.joblib']

## Using Model

In [14]:
from joblib import dump, load
import numpy as np
# Reload the persisted regressor from disk.
model = load('LinerReg.joblib')

# The pipeline was fitted on a DataFrame, so inputs should carry the same
# column names in the same order. Building a named one-row frame makes the
# feature order explicit and avoids passing an unlabeled list to `transform`.
feature_names = ['Prev Close', 'Open', 'Low', 'Last', 'Close', 'VWAP', 'Volume', 'Month']
sample = pd.DataFrame(
    [[2489.55, 2469.0, 2360.25, 2365.05, 2372.45, 2424.61, 804214, 10]],
    columns=feature_names,
)

# Apply the same preprocessing used at training time, then predict the target.
features = my_pipeline.transform(sample)
model.predict(features)

array([2489.761597])