In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_linear_models"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
df= pd.read_csv("WIKI-PRICES.csv")

In [3]:
print(df)

     ticker        date   open   high      low  close    volume  ex-dividend  \
0      ZUMZ  2018-03-27  24.65  24.65  23.3500  23.60  403884.0          0.0   
1      ZUMZ  2018-03-26  23.75  24.80  23.7000  24.65  375320.0          0.0   
2      ZUMZ  2018-03-23  23.55  24.20  23.4500  23.55  301584.0          0.0   
3      ZUMZ  2018-03-22  23.90  24.35  23.3000  23.35  269607.0          0.0   
4      ZUMZ  2018-03-21  23.80  24.60  23.6058  23.95  354092.0          0.0   
...     ...         ...    ...    ...      ...    ...       ...          ...   
9995    ZQK  1994-01-10  14.62  14.62  13.7500  13.75   78000.0          0.0   
9996    ZQK  1994-01-07  14.88  14.88  14.2500  14.37   19600.0          0.0   
9997    ZQK  1994-01-06  15.75  15.75  14.5000  14.50   97000.0          0.0   
9998    ZQK  1994-01-05  15.13  15.75  14.7500  15.37  139900.0          0.0   
9999    ZQK  1994-01-04  14.88  15.13  14.5000  15.00  146000.0          0.0   

      split_ratio   adj_open   adj_high

In [4]:
df.head(5000)

Unnamed: 0,ticker,date,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume
0,ZUMZ,2018-03-27,24.65,24.65,23.3500,23.60,403884.0,0.0,1.0,24.65,24.65,23.3500,23.60,403884.0
1,ZUMZ,2018-03-26,23.75,24.80,23.7000,24.65,375320.0,0.0,1.0,23.75,24.80,23.7000,24.65,375320.0
2,ZUMZ,2018-03-23,23.55,24.20,23.4500,23.55,301584.0,0.0,1.0,23.55,24.20,23.4500,23.55,301584.0
3,ZUMZ,2018-03-22,23.90,24.35,23.3000,23.35,269607.0,0.0,1.0,23.90,24.35,23.3000,23.35,269607.0
4,ZUMZ,2018-03-21,23.80,24.60,23.6058,23.95,354092.0,0.0,1.0,23.80,24.60,23.6058,23.95,354092.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,ZQK,2013-11-15,8.98,9.07,8.8800,9.01,1137300.0,0.0,1.0,8.98,9.07,8.8800,9.01,1137300.0
4996,ZQK,2013-11-14,9.12,9.12,8.8900,8.99,1608800.0,0.0,1.0,9.12,9.12,8.8900,8.99,1608800.0
4997,ZQK,2013-11-13,8.64,9.29,8.6300,9.16,2747300.0,0.0,1.0,8.64,9.29,8.6300,9.16,2747300.0
4998,ZQK,2013-11-12,8.74,8.75,8.5150,8.67,878700.0,0.0,1.0,8.74,8.75,8.5150,8.67,878700.0


In [5]:
df = df[['adj_open' , 'adj_high' , 'adj_low' , 'adj_close' , 'adj_volume']]

In [6]:
df.head()

Unnamed: 0,adj_open,adj_high,adj_low,adj_close,adj_volume
0,24.65,24.65,23.35,23.6,403884.0
1,23.75,24.8,23.7,24.65,375320.0
2,23.55,24.2,23.45,23.55,301584.0
3,23.9,24.35,23.3,23.35,269607.0
4,23.8,24.6,23.6058,23.95,354092.0


In [7]:
df['HL_PCT'] = (df['adj_high'] - df['adj_close'] )/df['adj_close']*100   #high low percentage change

In [8]:
df['PCT_change'] = (df['adj_close'] - df['adj_open'])/df['adj_open'] * 100   #percentage change

In [9]:
df.head()

Unnamed: 0,adj_open,adj_high,adj_low,adj_close,adj_volume,HL_PCT,PCT_change
0,24.65,24.65,23.35,23.6,403884.0,4.449153,-4.259635
1,23.75,24.8,23.7,24.65,375320.0,0.608519,3.789474
2,23.55,24.2,23.45,23.55,301584.0,2.760085,0.0
3,23.9,24.35,23.3,23.35,269607.0,4.282655,-2.301255
4,23.8,24.6,23.6058,23.95,354092.0,2.713987,0.630252


In [10]:
df= df[['adj_close','HL_PCT','PCT_change','adj_volume']]    #actually we care about these column in our dataframe
                                                            #feature

In [11]:
df.head()

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume
0,23.6,4.449153,-4.259635,403884.0
1,24.65,0.608519,3.789474,375320.0
2,23.55,2.760085,0.0,301584.0
3,23.35,4.282655,-2.301255,269607.0
4,23.95,2.713987,0.630252,354092.0


In [12]:
forecast_col = 'adj_close'

In [13]:
forecast_col

'adj_close'

In [14]:
df.fillna(0,inplace=True)

In [15]:
import math

In [16]:
forecast_out = int(math.ceil(0.0001*len(df)))    #

In [17]:
forecast_out

1

In [18]:
df['label'] = df[forecast_col].shift(-forecast_out,axis = 0)  #label :10days in future approx may be greater or less

In [19]:
df.head()

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume,label
0,23.6,4.449153,-4.259635,403884.0,24.65
1,24.65,0.608519,3.789474,375320.0,23.55
2,23.55,2.760085,0.0,301584.0,23.35
3,23.35,4.282655,-2.301255,269607.0,23.95
4,23.95,2.713987,0.630252,354092.0,23.8


In [20]:
df.dropna(inplace=True)

In [21]:
df.tail()

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume,label
9994,1.135,3.671072,-3.541076,726000.0,1.145833
9995,1.145833,6.327273,-5.950752,936000.0,1.1975
9996,1.1975,3.549061,-3.427419,235200.0,1.208333
9997,1.208333,8.62069,-7.936508,1164000.0,1.280833
9998,1.280833,2.472349,1.586252,1678800.0,1.25
