# In this notebook the test and training sets will be defined.

In [None]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# `%pylab inline` is what brings the `pylab` name used on the next line into scope.
# NOTE(review): it also bulk-imports numpy/matplotlib names into the interactive
# namespace; `plt.rcParams[...]` would set the same option without relying on it.
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

# Automatically reload imported modules before running each cell.
%load_ext autoreload
%autoreload 2

# Make the parent directory importable from this notebook.
sys.path.append('../')

## Let's test the scikit learn example for TimeSeriesSplit (with some modifications)

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Toy dataset: random features and a monotonically increasing target, so that
# the printed index arrays are easy to follow.
num_samples = 30
dims = 2

X = np.random.random((num_samples, dims))
y = np.arange(num_samples)

# TimeSeriesSplit yields one (train_index, test_index) pair per split; see the
# scikit-learn documentation for how the folds are laid out in time.
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)
for train_index, test_index in tscv.split(X):
    print("TRAIN_indexes:", train_index, "TEST_indexes:", test_index)
    # The last split's arrays stay in the namespace after the loop.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

### This may be useful for validation purposes. The test set will be separated beforehand anyway. The criterion to follow is to always preserve causality: a model must never be trained on data that comes later than the data it is evaluated on.

## Let's get the data and preserve one part as the test set.

Note: How the test set will be used is still not defined. Also, the definition of X and y may depend on the length of the base time interval used for training. In any case, it is good practice to set aside a fraction of the data for testing, which will remain untouched regardless of all those decisions.

In [None]:
# Load the complete dataset from its pickle file.
# NOTE(review): unpickling executes arbitrary code; only load files you produced yourself.
data_df = pd.read_pickle('../../data/data_df.pkl')

# Report its dimensions, then display the first ten rows as the cell output.
print(data_df.shape)
data_df.head(n=10)

### I will save about two years' worth of data for the test set (it wouldn't be correct to save a fixed fraction of the total set, because the size of the "optimal" training set is still to be defined; I may end up using much less than the total dataset).

In [None]:
# Number of trailing rows held out as the test set ("about two years" of data).
# Presumably 252 rows correspond to one year of trading days -- confirm against the dataset.
num_test_samples = 252 * 2

# Unstack once; both splits are slices of the same frame.
# Assumes the unstacked rows are in chronological order, so that the last
# `num_test_samples` rows are the most recent ones -- TODO confirm.
unstacked_df = data_df.unstack()
data_train_val_df = unstacked_df.iloc[:-num_test_samples]
data_test_df = unstacked_df.iloc[-num_test_samples:]

In [None]:
def show_df_basic(df):
    """Print a short summary of a DataFrame.

    Prints, in order: the shape, the first and last values of the outermost
    index level, and the first rows (``df.head()``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Its index may be a plain Index or a MultiIndex;
        only level 0 is inspected.

    Returns
    -------
    None
        Everything is written to standard output. For a frame with no rows
        the starting and ending values are reported as ``None``.
    """
    print(df.shape)
    outer_level = df.index.get_level_values(0)
    if len(outer_level) > 0:
        start_value, end_value = outer_level[0], outer_level[-1]
    else:
        # An empty split (e.g. more test samples requested than rows available)
        # has no first/last label; report that instead of raising IndexError.
        start_value = end_value = None
    print('Starting value: %s\nEnding value: %s' % (start_value, end_value))
    print(df.head())

In [None]:
# Summarise the training/validation split: shape, first and last index value, head.
show_df_basic(data_train_val_df)

In [None]:
# Summarise the held-out test split: shape, first and last index value, head.
show_df_basic(data_test_df)

### I could select the Close values, for example, like below...

In [None]:
# All rows; on the columns, every label of the outer level and 'Close' on the inner level.
data_test_df.loc[:, pd.IndexSlice[:, 'Close']].head()

### Or like this...

In [None]:
# Cross-section of the columns: keep the entries whose level-1 label is 'Close'.
data_test_df.xs(key='Close', axis=1, level=1).head()

### But I think it will be clearer if I swap the levels in the columns

In [None]:
def _feature_level_first(df, feature='Close'):
    """Return `df` with its two column levels swapped so the feature level is outermost.

    The swap is skipped when `feature` is already a label of column level 0,
    which makes this cell safe to re-run: a second execution would otherwise
    swap the levels back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns have two levels, with feature names such as
        'Close' on level 1 before the swap.
    feature : str, default 'Close'
        A known feature name, used only to detect an already-swapped frame.

    Returns
    -------
    pandas.DataFrame
        The swapped frame, or `df` itself when the swap was already applied.
    """
    if feature in df.columns.get_level_values(0):
        return df
    # stack()/unstack() round-trips the inner column level through the index,
    # exactly as the cell did before being wrapped in this helper.
    return df.swaplevel(0, 1, axis=1).stack().unstack()


data_train_val_df = _feature_level_first(data_train_val_df)
show_df_basic(data_train_val_df)
data_test_df = _feature_level_first(data_test_df)
show_df_basic(data_test_df)

## Now it's very easy to select one of the features:

In [None]:
# Preview only the first rows rather than rendering the whole training frame.
data_train_val_df['Close'].head()

## Let's pickle the data

In [None]:
# Persist both splits next to the source dataset.
pd.to_pickle(data_train_val_df, '../../data/data_train_val_df.pkl')
pd.to_pickle(data_test_df, '../../data/data_test_df.pkl')

## No validation set will be needed as I will use "time" cross-validation for that.