## Importing Required Libraries

In [1]:
from nsepy import get_history
from datetime import date
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## Getting TCS stocks from NSE API

In [2]:
tcs_data = get_history(symbol='TCS',start =date(2015,1,1),end =date(2015,12,31))
tcs_data.head()

Unnamed: 0_level_0,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-01-01,TCS,EQ,2558.25,2567.0,2567.0,2541.0,2550.0,2545.55,2548.51,183415,46743450000000.0,8002,52870,0.2883
2015-01-02,TCS,EQ,2545.55,2551.0,2590.95,2550.6,2588.4,2579.45,2568.19,462870,118874000000000.0,27585,309350,0.6683
2015-01-05,TCS,EQ,2579.45,2581.0,2599.9,2524.65,2538.1,2540.25,2563.94,877121,224888600000000.0,43234,456728,0.5207
2015-01-06,TCS,EQ,2540.25,2529.1,2529.1,2440.0,2450.05,2446.6,2466.9,1211892,298961500000000.0,84503,714306,0.5894
2015-01-07,TCS,EQ,2446.6,2470.0,2479.15,2407.45,2426.9,2417.7,2433.96,1318166,320836200000000.0,101741,886368,0.6724


In [3]:
tcs_data = tcs_data[['Close','Volume']]

tcs_data.reset_index(level = 0,inplace=True)
tcs_data['Date'] = pd.to_datetime(tcs_data['Date']) 

tcs_data.to_csv("stock_tcs.csv")

tcs_data.describe(include='all')

Unnamed: 0,Date,Close,Volume
count,248,248.0,248.0
unique,248,,
top,2015-08-19 00:00:00,,
freq,1,,
first,2015-01-01 00:00:00,,
last,2015-12-31 00:00:00,,
mean,,2537.717944,1172296.0
std,,87.057814,622063.5
min,,2319.8,67582.0
25%,,2495.15,782135.2


## Data Visualization through Bokeh

In [4]:
from bokeh.plotting import figure
from bokeh.io import show, output_notebook

In [5]:
fig = figure(plot_width= 900,plot_height=600, title = "TCS Stock", x_axis_label='X',y_axis_label='Y',x_axis_type = 'datetime')

fig.line( tcs_data['Date'],tcs_data['Close'], color='blue', line_width= 2, alpha=1)

output_notebook()
show(fig)

## Getting working days and Assign it to our dataframe

In [6]:
weekdays = tcs_data['Date'].dt.weekday
tcs_data = tcs_data.assign(weekdays =weekdays)
tcs_data.head()

Unnamed: 0,Date,Close,Volume,weekdays
0,2015-01-01,2545.55,183415,3
1,2015-01-02,2579.45,462870,4
2,2015-01-05,2540.25,877121,0
3,2015-01-06,2446.6,1211892,1
4,2015-01-07,2417.7,1318166,2


## Work done on different weekdays

In [7]:
weekday_counts = tcs_data.groupby('weekdays').count()
weekday_counts

Unnamed: 0_level_0,Date,Close,Volume
weekdays,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,51,51,51
1,50,50,50
2,51,51,51
3,49,49,49
4,46,46,46
5,1,1,1


This data clearly shows that Monday to Friday are the working days with a few holidays. An exception is there where work has been done on Saturday. So, we would like to handle unequal time series due to the weekends.

## Handling weekends in timeseries

In [8]:
tcs_data = tcs_data.set_index('Date')
tcs_data.head()

Unnamed: 0_level_0,Close,Volume,weekdays
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-01,2545.55,183415,3
2015-01-02,2579.45,462870,4
2015-01-05,2540.25,877121,0
2015-01-06,2446.6,1211892,1
2015-01-07,2417.7,1318166,2


In [9]:
all_days = pd.date_range(tcs_data.index[0],tcs_data.index[-1],freq='D')
all_days

DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
               '2015-01-09', '2015-01-10',
               ...
               '2015-12-22', '2015-12-23', '2015-12-24', '2015-12-25',
               '2015-12-26', '2015-12-27', '2015-12-28', '2015-12-29',
               '2015-12-30', '2015-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [10]:
new_index = all_days.union(tcs_data.index)
new_index

DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
               '2015-01-09', '2015-01-10',
               ...
               '2015-12-22', '2015-12-23', '2015-12-24', '2015-12-25',
               '2015-12-26', '2015-12-27', '2015-12-28', '2015-12-29',
               '2015-12-30', '2015-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [11]:
tcs_data = tcs_data.reindex(new_index)
tcs_data.head()

Unnamed: 0,Close,Volume,weekdays
2015-01-01,2545.55,183415.0,3.0
2015-01-02,2579.45,462870.0,4.0
2015-01-03,,,
2015-01-04,,,
2015-01-05,2540.25,877121.0,0.0


## Autofilling NaNs using Interpolation

In [12]:
tcs_data[['Close','Volume']] = tcs_data[['Close','Volume']].interpolate(method = "linear")
tcs_data.drop('weekdays',axis=1,inplace=True)
tcs_data.head(10)

Unnamed: 0,Close,Volume
2015-01-01,2545.55,183415.0
2015-01-02,2579.45,462870.0
2015-01-03,2566.383333,600953.7
2015-01-04,2553.316667,739037.3
2015-01-05,2540.25,877121.0
2015-01-06,2446.6,1211892.0
2015-01-07,2417.7,1318166.0
2015-01-08,2443.8,782704.0
2015-01-09,2512.3,1598821.0
2015-01-10,2511.433333,1331882.0


## Calculating moving Average on Weekly basis

In [13]:
weekly_data = tcs_data.resample("W").mean()
weekly_data.head()

Unnamed: 0,Close,Volume
2015-01-04,2561.175,496569.0
2015-01-11,2483.235714,1169361.0
2015-01-18,2520.564286,1541728.0
2015-01-25,2506.919643,1197124.0
2015-02-01,2509.576786,1946669.0


Moving Average for TCS stock at the end of every 4 weeks, 16 weeks, 28 weeks, 40 weeks and 52 weeks

In [14]:
def weekly_moving_average():
    for x in range(4,53,12):
        weekly_data[str(x) + 'weeks'] =weekly_data['Close'].rolling(x).mean()
    return weekly_data

weekly_data = weekly_moving_average()
weekly_data.head()

Unnamed: 0,Close,Volume,4weeks,16weeks,28weeks,40weeks,52weeks
2015-01-04,2561.175,496569.0,,,,,
2015-01-11,2483.235714,1169361.0,,,,,
2015-01-18,2520.564286,1541728.0,,,,,
2015-01-25,2506.919643,1197124.0,2517.973661,,,,
2015-02-01,2509.576786,1946669.0,2505.074107,,,,


## Calculating Volume Shocks 

In [15]:
tcs_data['Volume_Shock'] = (abs(tcs_data['Volume'].pct_change()) > 0.1).astype(int)

tcs_data['Volume_Shock_Dir'] = np.nan
tcs_data.loc[(tcs_data['Volume'].pct_change() > 0.1),'Volume_Shock_Dir'] = 1
tcs_data.loc[(tcs_data['Volume'].pct_change() < -0.1),'Volume_Shock_Dir'] = 0

Here, In **Volume_Shock** column, 1 is used to show that shock has occured and 0 for no occurence.<br />
In **Volume_Shock_Dir** column, 1 is used to show that direction of shock is positive, i.e, volume has increased by 10% or higher and 0 for negative direction, i.e, volume has decreased by 10% or higher and **NaN** for no shock

## Calculating Price Shocks

In [16]:
tcs_data['Price_Shock'] = (abs(tcs_data['Close'].pct_change(-1)) > 0.02).astype(int)

tcs_data['Price_Shock_Dir'] = np.nan
tcs_data.loc[(tcs_data['Close'].pct_change() > 0.02),'Price_Shock_Dir'] = 1
tcs_data.loc[(tcs_data['Close'].pct_change() < -0.02),'Price_Shock_Dir'] = 0

Here, In **Price_Shock** column, 1 is used to show that shock has occured and 0 for no occurence.<br />
In **Price_Shock_Dir** column, 1 is used to show that direction of shock is positive, i.e,price has increased by 2% or higher and 0 for negative direction, i.e,price has decreased by 2% or higher and **NaN** for no shock

## Calculating Price Shock without Volume Shock

In [17]:
tcs_data['Price_shock_without_volume_shock'] = 0
tcs_data.loc[(tcs_data['Price_Shock']==1) & (tcs_data['Volume_Shock']==0),'Price_shock_without_volume_shock'] = 1

tcs_data.head(10)

Unnamed: 0,Close,Volume,Volume_Shock,Volume_Shock_Dir,Price_Shock,Price_Shock_Dir,Price_shock_without_volume_shock
2015-01-01,2545.55,183415.0,0,,0,,0
2015-01-02,2579.45,462870.0,1,1.0,0,,0
2015-01-03,2566.383333,600953.7,1,1.0,0,,0
2015-01-04,2553.316667,739037.3,1,1.0,0,,0
2015-01-05,2540.25,877121.0,1,1.0,1,,0
2015-01-06,2446.6,1211892.0,1,1.0,0,0.0,0
2015-01-07,2417.7,1318166.0,0,,0,,0
2015-01-08,2443.8,782704.0,1,0.0,1,,0
2015-01-09,2512.3,1598821.0,1,1.0,0,1.0,0
2015-01-10,2511.433333,1331882.0,1,0.0,0,,0


In [18]:
fg = tcs_data.loc[tcs_data['Price_shock_without_volume_shock'] == 1]
fg.index[-1]

Timestamp('2015-09-30 00:00:00')

## Bokeh Plot after interpolating data for weekends

In [19]:
fig = figure(plot_width= 900,plot_height=600, title = "TCS Stock", x_axis_label='X',y_axis_label='Y',x_axis_type = 'datetime')

fig.line(tcs_data.index,tcs_data['Close'], color='blue', line_width= 2, alpha=1)

output_notebook()
show(fig)

## Calculating rolling window average for different window sizes on daily basis

In [20]:
def daily_rolling_average():
    
    
    for x in [10,30,50,75]:
        roll_avg[str(x) + 'days'] =tcs_data['Close'].rolling(x).mean()
        
        fig = figure(plot_width= 500,plot_height=500, title = "Rolling average for "+ str(x) +"days", 
                     x_axis_label='X',y_axis_label='Y',x_axis_type = 'datetime')
        fig.line(tcs_data.index,roll_avg[str(x) + 'days'], color='blue', line_width= 2, alpha=1)  
        
        output_notebook()
        show(fig)

    return roll_avg

roll_avg = pd.DataFrame()
roll_avg = daily_rolling_average()
#roll_avg[45:80]

This clearly shows that as the rolling window size increses, the sensitivity decreses.And for small window sizes, there is a large amount of noise contained in it.So, our task is to select the window size that maximizes predictive accuracy, which is the predictive value minus the predictive error.