In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import combinations, repeat
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Synthetic learning-to-rank data: `nb_query` queries, each owning a random
# number of documents with 10 random features and an integer relevance in 0..4.
SEED = 42  # fixed so that shapes and values are identical on every rerun
rng = np.random.default_rng(SEED)

nb_query = 20

# Documents per query: ceil(|N(0, 15)|) + 2, so every query has at least 2.
docs_per_query = np.ceil(np.abs(rng.normal(0, 15, size=nb_query))).astype(int) + 2

# Query ids are 1-based and repeated once per document, grouped by query.
query = np.repeat(np.arange(1, nb_query + 1), docs_per_query)
doc_features = rng.random((len(query), 10))
doc_scores = rng.integers(5, size=len(query)).astype(np.float32)

In [5]:
query.shape

(278,)

In [6]:
doc_scores.shape

(278,)

In [8]:
doc_features.shape

(278, 10)

In [13]:
doc_scores.shape

(284,)

In [4]:
# Build every within-query document pair (i, j) and its target probability
# P_ij that document i should be ranked above document j:
#   1.0 if score_i > score_j, 0.0 if score_i < score_j, 0.5 on a tie.
SPLIT_SEED = 0  # pins the train/test partition so reruns are comparable

pair_id = []
pair_query_id = []
for q in np.unique(query):
    query_idx = np.where(query == q)[0]
    # combinations() yields each unordered pair once, lower position first.
    for pair_idx in combinations(query_idx, 2):
        pair_id.append(pair_idx)
        pair_query_id.append(q)
pair_query_id = np.array(pair_query_id)

# Gather both sides of every pair in one indexing pass instead of appending
# feature rows one at a time.
pair_index = np.array(pair_id, dtype=np.intp).reshape(-1, 2)
first, second = pair_index[:, 0], pair_index[:, 1]
xi = doc_features[first]
xj = doc_features[second]
pij = np.where(
    doc_scores[first] == doc_scores[second],
    0.5,
    np.where(doc_scores[first] > doc_scores[second], 1.0, 0.0),
)

# The split is stratified on the query id of each pair; random_state makes the
# resulting partition reproducible.
xi_train, xi_test, xj_train, xj_test, pij_train, pij_test, pair_id_train, pair_id_test = train_test_split(
    xi, xj, pij, pair_id, test_size=0.2, stratify=pair_query_id,
    random_state=SPLIT_SEED)

In [10]:
xi.shape

(3289, 10)

In [20]:
# Row positions of the documents whose query id equals `query_test`.
# NOTE(review): `query_test` is not assigned in any visible cell — confirm it
# is defined before this cell runs on a fresh kernel.
(pair_idx_test,) = np.where(query == query_test)

In [22]:
list(combinations(pair_idx_test, 2))

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 3),
 (2, 4),
 (3, 4)]

## 1 Queryfying The Stock Picking Problem: A Large Dataset Problem

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import combinations, repeat
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the stock dataset; the "Date" column is parsed to datetime on read.
stock_dataset_path = "data/config/20220831_last_enhanced_dataset_monthly.csv"
stock_dataset = pd.read_csv(stock_dataset_path, parse_dates=["Date"])

### 1.1 Features

In [3]:
stock_dataset

Unnamed: 0,Date,symbols,RRET3M,RRET6M,RRET12M,SRET3M,SRET6M,SRET12M,ISR3M,ISR6M,ISR12M,EWVOL3M,ISR12M_nexmonth
0,2007-10-31,A,-0.045831,0.010974,0.120625,-0.286889,0.048575,0.377537,-0.043293,0.218709,0.592724,0.252168,0.648748
1,2007-10-31,AA,0.049675,0.139174,0.442392,0.261090,0.517240,1.162594,0.694973,0.873066,1.274447,0.347691,0.747572
2,2007-10-31,AAPL,0.391677,0.891933,1.361976,1.943524,3.129527,3.379097,3.461684,3.534918,2.782748,0.371120,2.032981
3,2007-10-31,ABC,0.027289,-0.055854,0.041181,0.187520,-0.271396,0.141491,0.510378,-0.021059,0.413389,0.266703,0.164794
4,2007-10-31,ABT,0.073018,-0.036949,0.187770,0.665704,-0.238196,0.855951,1.383667,-0.011155,0.947518,0.191821,1.151533
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83477,2022-07-29,XRX,-0.007128,-0.137710,-0.257794,-0.031321,-0.427877,-0.566385,0.162517,-0.085528,-0.090846,0.475500,-0.069290
83478,2022-07-29,Y,-0.005002,0.303045,0.271664,-0.343658,14.722666,9.332482,-0.000619,1.505580,0.904912,0.188267,0.910676
83479,2022-07-29,YUM,0.030349,0.032540,-0.043256,0.242966,0.184209,-0.173151,0.597081,0.375240,-0.009783,0.232148,-0.027119
83480,2022-07-29,ZBH,-0.112113,-0.058252,-0.295957,-0.759136,-0.278908,-1.001985,-0.129716,-0.021664,-0.075548,0.281217,-0.073511


In [4]:
stock_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83482 entries, 0 to 83481
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             83482 non-null  datetime64[ns]
 1   symbols          83482 non-null  object        
 2   RRET3M           83482 non-null  float64       
 3   RRET6M           83482 non-null  float64       
 4   RRET12M          83482 non-null  float64       
 5   SRET3M           83482 non-null  float64       
 6   SRET6M           83482 non-null  float64       
 7   SRET12M          83482 non-null  float64       
 8   ISR3M            83482 non-null  float64       
 9   ISR6M            83482 non-null  float64       
 10  ISR12M           83482 non-null  float64       
 11  EWVOL3M          83482 non-null  float64       
 12  ISR12M_nexmonth  83482 non-null  float64       
dtypes: datetime64[ns](1), float64(11), object(1)
memory usage: 8.3+ MB


In [5]:
import datetime as dt

### 1.2 Rebalance Date As a `date_query_id`

In [6]:
# Encode each row's date as the integer YYYYMMDD; rows sharing a date share a
# query id.
stock_dataset["date_query_id"] = (
    stock_dataset["Date"].dt.year * 10000
    + stock_dataset["Date"].dt.month * 100
    + stock_dataset["Date"].dt.day
).astype(int)

In [7]:
stock_dataset.head()

Unnamed: 0,Date,symbols,RRET3M,RRET6M,RRET12M,SRET3M,SRET6M,SRET12M,ISR3M,ISR6M,ISR12M,EWVOL3M,ISR12M_nexmonth,date_query_id
0,2007-10-31,A,-0.045831,0.010974,0.120625,-0.286889,0.048575,0.377537,-0.043293,0.218709,0.592724,0.252168,0.648748,20071031
1,2007-10-31,AA,0.049675,0.139174,0.442392,0.26109,0.51724,1.162594,0.694973,0.873066,1.274447,0.347691,0.747572,20071031
2,2007-10-31,AAPL,0.391677,0.891933,1.361976,1.943524,3.129527,3.379097,3.461684,3.534918,2.782748,0.37112,2.032981,20071031
3,2007-10-31,ABC,0.027289,-0.055854,0.041181,0.18752,-0.271396,0.141491,0.510378,-0.021059,0.413389,0.266703,0.164794,20071031
4,2007-10-31,ABT,0.073018,-0.036949,0.18777,0.665704,-0.238196,0.855951,1.383667,-0.011155,0.947518,0.191821,1.151533,20071031


In [8]:
# The ten numeric columns that are used as model inputs.
feature_columns = [
    'RRET3M', 'RRET6M', 'RRET12M',
    'SRET3M', 'SRET6M', 'SRET12M',
    'ISR3M', 'ISR6M', 'ISR12M',
    'EWVOL3M',
]
stock_features = stock_dataset.loc[:, feature_columns]
stock_features

Unnamed: 0,RRET3M,RRET6M,RRET12M,SRET3M,SRET6M,SRET12M,ISR3M,ISR6M,ISR12M,EWVOL3M
0,-0.045831,0.010974,0.120625,-0.286889,0.048575,0.377537,-0.043293,0.218709,0.592724,0.252168
1,0.049675,0.139174,0.442392,0.261090,0.517240,1.162594,0.694973,0.873066,1.274447,0.347691
2,0.391677,0.891933,1.361976,1.943524,3.129527,3.379097,3.461684,3.534918,2.782748,0.371120
3,0.027289,-0.055854,0.041181,0.187520,-0.271396,0.141491,0.510378,-0.021059,0.413389,0.266703
4,0.073018,-0.036949,0.187770,0.665704,-0.238196,0.855951,1.383667,-0.011155,0.947518,0.191821
...,...,...,...,...,...,...,...,...,...,...
83477,-0.007128,-0.137710,-0.257794,-0.031321,-0.427877,-0.566385,0.162517,-0.085528,-0.090846,0.475500
83478,-0.005002,0.303045,0.271664,-0.343658,14.722666,9.332482,-0.000619,1.505580,0.904912,0.188267
83479,0.030349,0.032540,-0.043256,0.242966,0.184209,-0.173151,0.597081,0.375240,-0.009783,0.232148
83480,-0.112113,-0.058252,-0.295957,-0.759136,-0.278908,-1.001985,-0.129716,-0.021664,-0.075548,0.281217


### 1.3 Ventile-Based Scoring

In [9]:
# Load only the key columns and the ISC12M score; "Date" is parsed to datetime.
score_columns = ["Date", "symbols", "ISC12M"]
scores = pd.read_csv(
    "data/config/20220831_lagged_isharpe_target_monthly.csv",
    usecols=score_columns,
    parse_dates=["Date"],
)
scores

Unnamed: 0,Date,symbols,ISC12M
0,2007-09-28,A,11.0
1,2007-09-28,AA,15.0
2,2007-09-28,AAPL,19.0
3,2007-09-28,ABC,9.0
4,2007-09-28,ABT,13.0
...,...,...,...
83946,2022-07-29,XRX,5.0
83947,2022-07-29,Y,18.0
83948,2022-07-29,YUM,8.0
83949,2022-07-29,ZBH,4.0


In [10]:
stock_dataset["Date"].iloc[0]

Timestamp('2007-10-31 00:00:00')

In [11]:
# Crop the scores to the date range covered by the return-based features.
# The cutoff is read from the feature frame instead of being hard-coded.
first_feature_date = stock_dataset["Date"].min()
scores = scores[scores["Date"] >= first_feature_date]

# Features and scores are later combined purely by row position, so fail
# loudly here if the two tables do not line up row for row.
assert len(scores) == len(stock_dataset), (
    f"scores has {len(scores)} rows but stock_dataset has {len(stock_dataset)}"
)
assert (
    (scores["Date"].values == stock_dataset["Date"].values).all()
    and (scores["symbols"].values == stock_dataset["symbols"].values).all()
), "scores and stock_dataset rows are not aligned on (Date, symbols)"

In [12]:
stock_features.values[0]

array([-0.04583108,  0.01097434,  0.12062476, -0.28688855,  0.0485754 ,
        0.3775371 , -0.04329277,  0.21870868,  0.59272382,  0.25216777])

In [13]:
scores["ISC12M"].values

array([14., 15., 19., ...,  8.,  4., 12.])

In [1]:
%whos

Variable   Type      Data/Info
------------------------------
os         module    <module 'os' from '/home/<...>da3/lib/python3.9/os.py'>
sys        module    <module 'sys' (built-in)>


In [14]:
# Accumulators for the pairwise dataset; the pair-building loop appends to them.
xi, xj, pij = [], [], []
pair_id, pair_query_id = [], []

# Pull plain NumPy arrays out of the DataFrames so rows can be indexed by
# position inside the loop.
features = stock_features.to_numpy()
date_query = stock_dataset["date_query_id"].astype(int).to_numpy()
stock_scores = scores["ISC12M"].astype(int).to_numpy()

In [17]:
# Drop the DataFrame names now that `features`, `date_query` and `stock_scores`
# hold the arrays extracted from them — presumably to release memory before the
# pair-building loop (TODO confirm; the arrays may still share the frames' data).
# NOTE(review): after this cell, any cell that reads these frames raises
# NameError until the CSVs are loaded again.
del scores, stock_dataset, stock_features

In [18]:
# Build every within-date stock pair (i, j) and its target probability P_ij
# that stock i should be ranked above stock j:
#   1.0 if score_i > score_j, 0.0 if score_i < score_j, 0.5 on a tie.
#
# The cell is self-contained: it does not append to lists created elsewhere, so
# it can be re-run safely. The number of pairs grows quadratically with the
# number of rows per date, so pairs are produced as index arrays per date and
# the feature rows are gathered once at the end.
#
# Outputs:
#   pair_id        (n_pairs, 2) int array of row positions (i, j), i before j
#                  (previously a list of 2-tuples holding the same values)
#   pair_query_id  (n_pairs,) date_query value of each pair
#   xi, xj         (n_pairs, n_features) feature rows of i and j
#   pij            (n_pairs,) float target in {0.0, 0.5, 1.0}
pair_blocks = []
query_blocks = []
for q in np.unique(date_query):
    query_idx = np.where(date_query == q)[0]
    # Upper-triangle positions enumerate each unordered pair exactly once, in
    # the same order as itertools.combinations(query_idx, 2).
    first, second = np.triu_indices(len(query_idx), k=1)
    pair_blocks.append(np.column_stack((query_idx[first], query_idx[second])))
    query_blocks.append(np.full(len(first), q, dtype=date_query.dtype))

if pair_blocks:
    pair_id = np.concatenate(pair_blocks)
    pair_query_id = np.concatenate(query_blocks)
else:
    # No rows at all: keep the documented shapes instead of failing.
    pair_id = np.empty((0, 2), dtype=np.intp)
    pair_query_id = np.empty(0, dtype=date_query.dtype)

left, right = pair_id[:, 0], pair_id[:, 1]
xi = features[left]
xj = features[right]
pij = np.where(
    stock_scores[left] == stock_scores[right],
    0.5,
    np.where(stock_scores[left] > stock_scores[right], 1.0, 0.0),
)

: 

: 