# Adjusted Price and Cross-Validation

In [60]:
import pandas as pd
import numpy as np
import datetime
import time

from itertools import islice

from decimal import ROUND_HALF_UP, Decimal

In [2]:
TRAIN_DIR = "data/train_files"

In [None]:
# reading stock prices data

stock_prices = pd.read_csv(f"{TRAIN_DIR}/stock_prices.csv", parse_dates=['Date'])

## Adjusted Prices

In [3]:
def adjust_price(price):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose
    """
    # transform Date column into datetime
    price.loc[: ,"Date"] = pd.to_datetime(price.loc[: ,"Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # sort data to generate CumulativeAdjustmentFactor
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df.loc[:, "CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose
        df.loc[:, "AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # reverse order
        df = df.sort_values("Date")
        # to fill AdjustedClose, replace 0 into np.nan
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df.loc[:, "AdjustedClose"] = df.loc[:, "AdjustedClose"].ffill()
        return df

    # generate AdjustedClose
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(generate_adjusted_close).reset_index(drop=True)

    return price

In [5]:
# generate AdjustedClose

stock_prices = adjust_price(stock_prices)

In [6]:
stock_prices.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CumulativeAdjustmentFactor,AdjustedClose
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073,1.0,2742.0
1,20170105_1301,2017-01-05,1301,2743.0,2747.0,2735.0,2738.0,17900,1.0,,False,0.00292,1.0,2738.0
2,20170106_1301,2017-01-06,1301,2734.0,2744.0,2720.0,2740.0,19900,1.0,,False,-0.001092,1.0,2740.0
3,20170110_1301,2017-01-10,1301,2745.0,2754.0,2735.0,2748.0,24200,1.0,,False,-0.0051,1.0,2748.0
4,20170111_1301,2017-01-11,1301,2748.0,2752.0,2737.0,2745.0,9300,1.0,,False,-0.003295,1.0,2745.0


In [7]:
stock_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2332531 entries, 0 to 2332530
Data columns (total 14 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   RowId                       object        
 1   Date                        datetime64[ns]
 2   SecuritiesCode              int64         
 3   Open                        float64       
 4   High                        float64       
 5   Low                         float64       
 6   Close                       float64       
 7   Volume                      int64         
 8   AdjustmentFactor            float64       
 9   ExpectedDividend            float64       
 10  SupervisionFlag             bool          
 11  Target                      float64       
 12  CumulativeAdjustmentFactor  float64       
 13  AdjustedClose               float64       
dtypes: bool(1), datetime64[ns](1), float64(9), int64(2), object(1)
memory usage: 233.6+ MB


## Cross-validation

In [36]:
class CombinatorialPurgedGroupKFold():
    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo
        
    def split(self, X, y = None, groups = None):
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None")
        
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)
        group_dict = {}
        for idx in range(len(X)):
            if groups[idx] in group_dict:
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
                
        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(("Cannot have number of folds={0} greater than"" the number of groups={1}").format(n_folds, n_groups))
            
        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError("The number of 'embargoed' groups should not be negative")
        
        split_dict = {}
        group_test_size = n_groups // self.n_splits
        for split in range(self.n_splits):
            if split == self.n_splits - 1:
                split_dict[split] = unique_groups[int(split * group_test_size):].tolist()
            else:
                split_dict[split] = unique_groups[int(split * group_test_size):int((split + 1) * group_test_size)].tolist()
        
        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_groups = []
            banned_groups = []
            for split in test_splits:
                test_groups += split_dict[split]
                banned_groups += unique_groups[int(split_dict[split][0]) - self.purge:int(split_dict[split][0])].tolist()
                banned_groups += unique_groups[int(split_dict[split][-1]) + 1:int(split_dict[split][-1]) + self.purge + mbrg + 1].tolist()
            train_groups = [i for i in unique_groups if (i not in banned_groups) and (i not in test_groups)]

            train_idx = []
            test_idx = []
            for train_group in train_groups:
                train_idx += group_dict[train_group]
            for test_group in test_groups:
                test_idx += group_dict[test_group]
            yield train_idx, test_idx

            
    def grouping(self, elements, no_groups):
        k, m = divmod(len(elements), no_groups)
        splitted = list(elements[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(no_groups))
        for index, value in enumerate(splitted):
            for j in range(len(value)):
                splitted[index][j] = index
        labels = []
        for arr in splitted:
            labels = np.concatenate((labels, arr))
        return labels

## Evaluation

In [87]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        print(df.shape, " ", len(weights))
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [80]:
# split data into TRAIN and TEST by date
TRAIN_END = datetime.datetime(2019, 12, 31)
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = datetime.datetime(2020, 1, 6)

train_data = stock_prices[stock_prices["Date"] < TRAIN_END]

# handling datatime info
train_data["Year"] = train_data["Date"].dt.year
train_data["Month"] = train_data["Date"].dt.month
train_data["Day"] = train_data["Date"].dt.day

train_data = train_data.drop(['RowId', 'Date'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Year"] = train_data["Date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Month"] = train_data["Date"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Day"] = train_data["Date"].dt.day


In [82]:
# we are gonna use DecisionTree Model as an example
from sklearn import tree

clf = tree.DecisionTreeRegressor()

In [88]:
# cv hyperparameters
n_splits = 6
n_test_splits = 1
no_groups = 12

kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
elements = train_data.index.tolist()
groups = kfold.grouping(elements, no_groups)
data = pd.DataFrame({"group": groups, "element": elements})

sharpe_ratios = []
start = time.time()
for index, (train_indices, test_indices) in enumerate(kfold.split(data, groups=data["group"])):
    train = train_data.iloc[train_indices]
    test = train_data.iloc[test_indices]
    train = train.dropna(); test = test.dropna()
    train_X, train_y = train.drop(['Target'],axis=1), train["Target"].to_numpy()
    test_X, test_y = test.drop(['Target'],axis=1), test["Target"].to_numpy()
    clf.fit(train_X, train_y)
    pred = clf.predict(test_X)
    test_X["Target"] = pred.tolist()
    
    test_X["Date"] = test_X[["Year", "Month", "Day"]].apply(lambda x: "_".join(str(x)), axis=1)
    test_X["Rank"] = test_X.groupby("Date")["Target"].rank(ascending=False,method="first") -1 
    test_X["Rank"] =test_X["Rank"].astype("int")
    pred_sharpe = calc_spread_return_sharpe(test_X, portfolio_size=200, toprank_weight_ratio=2)
    sharpe_ratios.append(pred_sharpe)
    
stop = time.time()

print()

(1, 17)   200


ValueError: Length of values (200) does not match length of index (1)

In [62]:
train_data.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CumulativeAdjustmentFactor,AdjustedClose
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073,1.0,2742.0
1,20170105_1301,2017-01-05,1301,2743.0,2747.0,2735.0,2738.0,17900,1.0,,False,0.00292,1.0,2738.0
2,20170106_1301,2017-01-06,1301,2734.0,2744.0,2720.0,2740.0,19900,1.0,,False,-0.001092,1.0,2740.0
3,20170110_1301,2017-01-10,1301,2745.0,2754.0,2735.0,2748.0,24200,1.0,,False,-0.0051,1.0,2748.0
4,20170111_1301,2017-01-11,1301,2748.0,2752.0,2737.0,2745.0,9300,1.0,,False,-0.003295,1.0,2745.0


In [63]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1399689 entries, 0 to 2332061
Data columns (total 14 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   RowId                       1399689 non-null  object        
 1   Date                        1399689 non-null  datetime64[ns]
 2   SecuritiesCode              1399689 non-null  int64         
 3   Open                        1396317 non-null  float64       
 4   High                        1396317 non-null  float64       
 5   Low                         1396317 non-null  float64       
 6   Close                       1396317 non-null  float64       
 7   Volume                      1399689 non-null  int64         
 8   AdjustmentFactor            1399689 non-null  float64       
 9   ExpectedDividend            11322 non-null    float64       
 10  SupervisionFlag             1399689 non-null  bool          
 11  Target                  

In [52]:
train_now = []
test_now = []

for index, (train_indices, test_indices) in enumerate(kfold.split(data, groups=data["group"])):
#     print("=" * 100)
#     print(f"Fold {index}")
#     print("=" * 100)
#     print("Train indices:", train_indices, "Length:", len(train_indices))
#     print("Test Indices:", test_indices, "Length:", len(test_indices))
    train_now = train.iloc[train_indices]
    test_now = train.iloc[test_indices]
    