## `IMPORTING LIBRARIES, READING DATA & PLOTTING TIME SERIES`

In [9]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
import plotly.express as px
import os 

# NOTE(review): installs a blanket 'ignore' filter (no category/module restriction),
# so every warning raised after this point is hidden, including deprecation notices
# from the libraries used below. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [52]:
# Load the daily counts from the CSV (path is relative to the notebook's working
# directory) and preview the first five rows.

data = pd.read_csv(filepath_or_buffer='train_data_complete.csv')
data.head(n=5)

Unnamed: 0,Date,count
0,2012-08-25,3
1,2012-08-26,3
2,2012-08-27,2
3,2012-08-28,2
4,2012-08-29,2


In [53]:
# Renaming columns: only `count` needs a new label (`Count`); `Date` already has
# its final name, so it is not listed in the mapping.
# The result is reassigned rather than mutated with inplace=True, so the cell reads
# as a plain "old frame -> new frame" step and can be re-run safely (a missing
# `count` column is ignored by rename's default errors='ignore').

data = data.rename(columns={'count': 'Count'})

In [54]:
# Checking the datatypes: prints each column's non-null count and dtype.
# At this point `Date` is still a string (object) column; it is converted to
# datetime in the following cell.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    762 non-null    object
 1   Count   762 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 12.0+ KB


In [55]:
# Convert the `Date` strings (YYYY-MM-DD) into datetime64 values, then list the
# column dtypes to confirm the conversion.

date_strings = data['Date']
data['Date'] = pd.to_datetime(date_strings, format='%Y-%m-%d')
data.dtypes

Date     datetime64[ns]
Count             int64
dtype: object

In [56]:
# Plotting the time series using plotly: an interactive line chart of the full
# daily series, with a range slider under the x-axis for zooming into a date window.

fig = px.line(data_frame = data,
              x = 'Date',
              y = 'Count',
              title = 'Time Series with Range Slider')

fig.update_xaxes(rangeslider_visible=True)

# `family` is a comma-separated list of font names tried in order. Without the
# comma, "Courier New monospace" is read as one font name that does not exist and
# the browser falls back to its default font.
fig.update_layout(xaxis_title = 'Date',
                  yaxis_title = 'Count',
                  font=dict(family="Courier New, monospace", size=14, color="RebeccaPurple"))

fig.show()

## `HOLD-OUT VALIDATION`

In [57]:
# Hold-out split by row position: the first 534 rows are used for training and
# every remaining row for validation. The rows are already in date order, so the
# validation period follows the training period.

train_data, valid_data = data.iloc[:534], data.iloc[534:]

In [58]:
# Checking the shape of the data
# Each shape is printed by its own statement. Joining the two print() calls with a
# comma builds a tuple of their return values, which the notebook then echoes as a
# meaningless `(None, None)` cell result.

print(train_data.shape)
print(valid_data.shape)

(534, 2)
(228, 2)


(None, None)

In [135]:
import plotly.graph_objects as go
from plotly.offline import iplot

# Hold-out split drawn as two line traces on a shared date axis: the training
# period first, then the validation period.
fig = go.Figure(data=[
    go.Scatter(x=train_data['Date'], y=train_data['Count'], name="Training Data"),
    go.Scatter(x=valid_data['Date'], y=valid_data['Count'], name="Validation Data"),
])

# Titles, legend heading and the font shared by all text in the figure.
fig.update_layout(
    title="Hold Out Validation",
    xaxis_title="Date",
    yaxis_title="Count",
    legend_title="Legend Title",
    font={"family": "Courier New, monospace", "size": 18, "color": "RebeccaPurple"},
)

# Range slider under the x-axis for zooming into a date window.
fig.update_xaxes(rangeslider_visible=True)

iplot(fig)

## `TIME-SERIES CROSS VALIDATION`

In [116]:
# Importing the library and configuring a 5-fold time-series splitter.
# When applied to `data` in the cells below, every fold's training rows start at
# row 0 and end immediately before that fold's test rows (see the printed ranges),
# so a fold never trains on observations that come after its test block.

from sklearn.model_selection import TimeSeriesSplit
time_series_cv = TimeSeriesSplit(n_splits=5)

In [117]:
# Report, for every fold, the first and last row position of the training set and
# of the test set (both bounds are inclusive).

for train_index, test_index in time_series_cv.split(data):
    print('----- ----- ----')
    print(f"TRAIN: {train_index[0]} to {train_index[-1]}")
    print(f"TEST: {test_index[0]} to {test_index[-1]}")

----- ----- ----
TRAIN: 0 to 126
TEST: 127 to 253
----- ----- ----
TRAIN: 0 to 253
TEST: 254 to 380
----- ----- ----
TRAIN: 0 to 380
TEST: 381 to 507
----- ----- ----
TRAIN: 0 to 507
TEST: 508 to 634
----- ----- ----
TRAIN: 0 to 634
TEST: 635 to 761


In [136]:
# Plotting the time series cross validation data: one figure per fold, with the
# fold's training rows and validation rows drawn on top of the complete series.

import plotly.graph_objects as go
from plotly.offline import iplot

for train_index, test_index in time_series_cv.split(data):
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=data['Date'],
        y=data['Count'],
        name="Complete Data"
    ))

    # Select rows with the fold's positional index arrays directly. Slicing with
    # [first:last] stops one row early because the slice end is exclusive, which
    # silently drops the final observation of the fold from the plot.
    fig.add_trace(go.Scatter(
        x=data['Date'].iloc[train_index],
        y=data['Count'].iloc[train_index],
        name="Training Data"
    ))

    fig.add_trace(go.Scatter(
        x=data['Date'].iloc[test_index],
        y=data['Count'].iloc[test_index],
        name="Validation Data"
    ))

    fig.update_layout(
        title="Time Series Cross Validation",
        xaxis_title="Date",
        yaxis_title="Count",
        legend_title="Legend Title",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple"
        )
    )

    # Range slider under the x-axis for zooming into a date window.
    fig.update_xaxes(rangeslider_visible=True)

    iplot(fig)

## `WALK FORWARD CROSS VALIDATION`

In [131]:
# Splitting the data (walk-forward): unlike the expanding window above, each fold
# trains only on the most recent block of rows before its test block.
# The training window is sized to match the test fold instead of using a
# hard-coded row offset, so the cell stays correct if the dataset length or the
# number of splits changes. For the current data the printed ranges are unchanged.

for train_index, test_index in time_series_cv.split(data):
    # Keep the last len(test_index) training rows; bounds printed are inclusive.
    window_index = train_index[-len(test_index):]
    print('----- ----- ----')
    print("TRAIN:", window_index[0], 'to', window_index[-1])
    print("TEST:",  test_index[0], 'to',  test_index[-1])

----- ----- ----
TRAIN: 0 to 126
TEST: 127 to 253
----- ----- ----
TRAIN: 127 to 253
TEST: 254 to 380
----- ----- ----
TRAIN: 254 to 380
TEST: 381 to 507
----- ----- ----
TRAIN: 381 to 507
TEST: 508 to 634
----- ----- ----
TRAIN: 508 to 634
TEST: 635 to 761


In [137]:
# Plotting the walk forward cross validation data: one figure per fold, with the
# fold's sliding training window and its validation rows drawn on top of the
# complete series.

import plotly.graph_objects as go
from plotly.offline import iplot

for train_index, test_index in time_series_cv.split(data):
    # Walk-forward keeps only the most recent block of training rows. The block is
    # sized to match the validation fold rather than a hard-coded row offset, so it
    # adapts to a different dataset length or number of splits.
    window_index = train_index[-len(test_index):]

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=data['Date'],
        y=data['Count'],
        name="Complete Data"
    ))

    # Rows are selected with positional index arrays. Slicing with [first:last]
    # stops one row early because the slice end is exclusive, which drops the final
    # observation of both the training window and the validation fold.
    fig.add_trace(go.Scatter(
        x=data['Date'].iloc[window_index],
        y=data['Count'].iloc[window_index],
        name="Training Data"
    ))

    fig.add_trace(go.Scatter(
        x=data['Date'].iloc[test_index],
        y=data['Count'].iloc[test_index],
        name="Validation Data"
    ))

    # The title names the walk-forward scheme so these figures can be told apart
    # from the expanding-window figures in the previous section.
    fig.update_layout(
        title="Walk Forward Cross Validation",
        xaxis_title="Date",
        yaxis_title="Count",
        legend_title="Legend Title",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple"
        )
    )

    # Range slider under the x-axis for zooming into a date window.
    fig.update_xaxes(rangeslider_visible=True)

    iplot(fig)