In [1]:
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm



In [17]:
def gen_ran_data(n_stock,n_period,n_feat,seed=None):
    """Simulate a balanced stock-by-month panel for testing.

    Within every period each feature column is an independent random
    permutation of ``n_stock`` evenly spaced values on [-1, 1]. Returns are
    standard-normal draws; weights are uniform draws rescaled so that they
    sum to one within each date.

    Parameters
    ----------
    n_stock : int
        Number of stocks per period (``PERMNO`` runs from 0 to n_stock - 1).
    n_period : int
        Number of month-start dates, beginning at 2000-01-01.
    n_feat : int
        Number of feature columns, named ``f0`` ... ``f{n_feat - 1}``.
    seed : int, optional
        When given, all draws come from a private
        ``np.random.RandomState(seed)``, so the panel is reproducible and the
        global NumPy random state is left untouched. When ``None`` (default)
        the global ``np.random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        ``n_stock * n_period`` rows ordered by date then PERMNO, with columns
        ``f0..``, ``date``, ``PERMNO``, ``ret`` and ``weight``.
    """
    # Both the np.random module and RandomState expose permutation/randn/rand,
    # so the draw order below is identical on either path.
    rng = np.random if seed is None else np.random.RandomState(seed)
    grid = np.linspace(-1, 1, n_stock)

    # One (n_stock, n_feat) slab per period, stacked on top of each other.
    X = np.vstack([
        np.vstack([rng.permutation(grid) for _ in range(n_feat)]).T
        for _ in range(n_period)
    ])
    cols = [f"f{i}" for i in range(n_feat)]
    sample_df = pd.DataFrame(X, columns=cols)

    dates = pd.date_range('2000-01-01', periods=n_period, freq='MS')
    sample_df["date"] = [date for date in dates for _ in range(n_stock)]
    sample_df["PERMNO"] = list(range(n_stock)) * n_period
    sample_df["ret"] = rng.randn(n_stock * n_period)
    sample_df["weight"] = rng.rand(n_stock * n_period)
    # Normalise weights so that they sum to one within each date.
    sample_df["weight"] = sample_df["weight"] / sample_df.groupby("date")["weight"].transform("sum")

    return sample_df
    

In [1]:
def Sharpe(X):
    """Return sqrt(n / RSS - 1) from regressing a vector of ones on X.

    The regression has no intercept. With ``mu`` the column means and
    ``Sigma`` the (1/n) covariance of the columns, ``n / RSS - 1`` equals
    ``mu' Sigma^{-1} mu``, so the result is the in-sample maximum Sharpe
    ratio attainable from the columns of X.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or array-like
        ``n`` observations in rows; a 1-D input is treated as one column.

    Returns
    -------
    float
        The Sharpe ratio; NaN when X contains NaN.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X.T @ X`` is singular.
    """
    # np.asarray accepts DataFrame, Series and plain arrays alike, whereas
    # ``X.values`` only exists on pandas objects.
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    X = X.reshape(n, -1) #reshape to 2d array in case X is 1d
    y = np.ones(n)
    # Solve the normal equations directly rather than forming an explicit inverse.
    beta = np.linalg.solve(X.T @ X, X.T @ y)
    residuals = y - X @ beta
    rss = np.sum(residuals**2)
    # RSS <= n in exact arithmetic; clamp so that rounding noise cannot push
    # the radicand below zero and turn a zero Sharpe ratio into NaN.
    # (max(nan, 0.0) stays nan, so NaN inputs still propagate.)
    sr = max(n / rss - 1, 0.0)**0.5

    return sr

def filter_panel_by_to_node(node_ind,sample_df,max_node):
    """Return the rows of ``sample_df`` that fall into a given tree node.

    Parameters
    ----------
    node_ind : mapping (dict or pandas.Series)
        Description of the node. For every ``i`` in ``range(max_node)`` it
        must provide ``node_ind[f"node_{i}_parent"]``, which is ``"left"`` or
        ``"right"`` when the node lies below split ``i`` on that side; any
        other value (e.g. ``"Irr"``) means split ``i`` does not constrain it.
    sample_df : pandas.DataFrame
        Panel holding a boolean column ``split_{i}`` for every split that the
        node depends on (True = left side of split ``i``).
    max_node : int
        Number of ``node_{i}_parent`` entries to inspect.

    Returns
    -------
    pandas.DataFrame
        The filtered panel; ``sample_df`` itself when no split applies.
    """
    # Start from the full panel once, so that successive ancestor filters
    # accumulate instead of being reset on every iteration.
    temp_df = sample_df
    for parent_node_no in range(max_node):
        side = node_ind[f"node_{parent_node_no}_parent"]
        if side == "left":
            temp_df = temp_df[temp_df[f"split_{parent_node_no}"]]
        elif side == "right":
            temp_df = temp_df[~temp_df[f"split_{parent_node_no}"]]
    return temp_df

def split_and_gen_factor(node_sample_df,feat,thres,lr):
    """Weighted average return per date for one side of a candidate split.

    Parameters
    ----------
    node_sample_df : pandas.DataFrame
        Observations of the node being split; needs the columns ``date``,
        ``ret``, ``weight`` and ``feat``.
    feat : str
        Name of the splitting characteristic.
    thres : float
        Split threshold.
    lr : {"left", "right"}
        ``"left"`` keeps rows with ``feat >= thres``; ``"right"`` keeps rows
        with ``feat < thres``.

    Returns
    -------
    pandas.Series
        Indexed by date: ``sum(ret * weight) / sum(weight)`` over the kept
        rows. Dates without any kept row are absent from the index.

    Raises
    ------
    ValueError
        If ``lr`` is neither ``"left"`` nor ``"right"``.
    """
    if lr == "left":
        keep = node_sample_df[feat] >= thres
    elif lr == "right":
        keep = node_sample_df[feat] < thres
    else:
        raise ValueError(f"lr must be 'left' or 'right', got {lr!r}")

    # Select with a list of labels (a bare "ret","weight" pair is a tuple key)
    # and keep "date", which the per-date aggregation needs.
    new_df = node_sample_df.loc[keep, ["date", "ret", "weight"]]

    # Re-normalise the weights within each date and aggregate per date.
    weighted_ret = (new_df["ret"] * new_df["weight"]).groupby(new_df["date"]).sum()
    total_weight = new_df["weight"].groupby(new_df["date"]).sum()
    new_factor = weighted_ret / total_weight

    return new_factor

def gen_tree(sample_df,n_period,n_stock,n_feat,thres_list,max_node):
    """Greedily grow a tree of characteristic splits on a panel.

    In each of ``max_node - 1`` rounds every current leaf is tentatively split
    on every (feature, threshold) pair. Each candidate is scored with
    ``Sharpe`` on the two child factors plus the factors generated so far,
    and the best-scoring candidate is applied.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Panel with columns ``date``, ``ret``, ``weight`` and ``f0`` ...
        ``f{n_feat - 1}``. It is not modified; the function works on a copy.
    n_period, n_stock : int
        Unused; kept so existing calls keep working.
    n_feat : int
        Number of candidate features (``f0`` ... ``f{n_feat - 1}``).
    thres_list : iterable of float
        Candidate thresholds tried for every feature.
    max_node : int
        ``max_node - 1`` splits are attempted.

    Returns
    -------
    tuple of (tree_df, factor_df, log_df)
        ``tree_df`` has one row per node: ``char``/``thres`` are the split
        that created the node (999 for the root), ``leaf`` flags current
        leaves, and ``node_{i}_parent`` is "left"/"right" when the node lies
        below split node ``i`` ("Irr" otherwise).
        ``factor_df`` has one column per non-root node, labelled by the node's
        row index in ``tree_df``.
        ``log_df`` records every scored candidate.
    """
    # A tree grown with max_node - 1 splits can hold up to 2 * max_node - 1
    # nodes, and any of the early ones may be split, so the path columns must
    # cover every possible node index rather than just range(max_node).
    n_nodes = max(2 * max_node - 1, 1)
    parent_cols = [f"node_{i}_parent" for i in range(n_nodes)]
    tree_df = pd.DataFrame([[999, 999, True] + ["Irr"] * n_nodes],
                           columns=["char", "thres", "leaf"] + parent_cols)
    feat_list = [f"f{i}" for i in range(n_feat)]

    # Work on a copy: the split_{i} indicator columns that
    # filter_panel_by_to_node reads are added as splits are chosen.
    panel = sample_df.copy()
    leaf_factors = {}   # node index -> factor Series
    log_records = []    # rows of log_df, collected once instead of appended one by one

    for split in range(max_node - 1):
        round_records = []
        for ind, row in tree_df[tree_df["leaf"]].iterrows():
            node_sample_df = filter_panel_by_to_node(row, panel, n_nodes)
            for feat in feat_list:
                for thres in thres_list:
                    left_factor = split_and_gen_factor(node_sample_df, feat, thres, "left")
                    right_factor = split_and_gen_factor(node_sample_df, feat, thres, "right")
                    if left_factor.empty or right_factor.empty:
                        # A split that leaves one child without observations cannot be scored.
                        sr = np.nan
                    else:
                        # NOTE(review): as in the original, every factor generated so far is
                        # included, also that of the node being split - confirm this is intended.
                        basis = pd.concat([left_factor, right_factor, *leaf_factors.values()], axis=1)
                        try:
                            sr = Sharpe(basis)
                        except np.linalg.LinAlgError:
                            # Collinear factors: log the candidate as unscored, keep searching.
                            sr = np.nan
                    round_records.append({"char": feat, "thres": thres, "split": split, "node": ind, "sr": sr})
        log_records.extend(round_records)

        scored = [rec for rec in round_records if not pd.isna(rec["sr"])]
        if not scored:
            # Nothing could be scored in this round, so the tree cannot grow further.
            break
        # max() returns the first of tied candidates, matching the original iloc[0].
        best_split = max(scored, key=lambda rec: rec["sr"])
        node = best_split["node"]

        # Both children inherit the parent's path and record their side of the new split.
        parent_path = tree_df.loc[node, parent_cols].to_dict()
        new_left_node = {"char": best_split["char"], "thres": best_split["thres"], "leaf": True}
        new_left_node.update(parent_path)
        new_left_node.update({f"node_{node}_parent": "left"})
        new_right_node = {"char": best_split["char"], "thres": best_split["thres"], "leaf": True}
        new_right_node.update(parent_path)
        new_right_node.update({f"node_{node}_parent": "right"})

        left_id = len(tree_df)
        right_id = left_id + 1
        tree_df.loc[node, "leaf"] = False
        tree_df = pd.concat([tree_df, pd.DataFrame([new_left_node, new_right_node])], ignore_index=True)

        # Materialise the indicator used by filter_panel_by_to_node; True means
        # "left", the same feat >= thres convention as split_and_gen_factor.
        panel[f"split_{node}"] = panel[best_split["char"]] >= best_split["thres"]

        new_left_panel = filter_panel_by_to_node(new_left_node, panel, n_nodes)
        new_left_factor = split_and_gen_factor(new_left_panel, new_left_node["char"], new_left_node["thres"], "left")
        leaf_factors[left_id] = new_left_factor
        new_right_panel = filter_panel_by_to_node(new_right_node, panel, n_nodes)
        new_right_factor = split_and_gen_factor(new_right_panel, new_right_node["char"], new_right_node["thres"], "right")
        leaf_factors[right_id] = new_right_factor

    factor_df = pd.concat(leaf_factors, axis=1) if leaf_factors else pd.DataFrame()
    log_df = pd.DataFrame(log_records, columns=["char", "thres", "split", "node", "sr"])

    return tree_df, factor_df, log_df
        

In [8]:
    # Scratch check of the feature-matrix construction. Each list element
    # stacks five random permutations of linspace(-1, 1, 5) as rows -> (5, 5).
    X = [np.stack([np.random.permutation(np.linspace(-1,1,5)).T for _ in range(5)],axis=0) for p in range(2)]
    # np.block joins the two (5, 5) blocks along the last axis -> (5, 10);
    # the transpose then gives (10, 5).
    X = np.block(X).T
    # NOTE(review): the saved output below is 25 x 2, which this code does not
    # produce - it looks stale; re-run the cell to refresh it.
    X

array([[-0.5,  0. ],
       [ 0. , -1. ],
       [ 1. , -0.5],
       [ 0.5,  1. ],
       [-1. ,  0.5],
       [ 1. , -1. ],
       [ 0.5,  0.5],
       [ 0. , -0.5],
       [-1. ,  1. ],
       [-0.5,  0. ],
       [-0.5, -1. ],
       [ 1. ,  0. ],
       [ 0.5,  1. ],
       [-1. , -0.5],
       [ 0. ,  0.5],
       [ 0. ,  0. ],
       [ 0.5, -0.5],
       [-1. ,  0.5],
       [-0.5, -1. ],
       [ 1. ,  1. ],
       [ 0. ,  0.5],
       [-1. , -1. ],
       [ 0.5,  1. ],
       [ 1. ,  0. ],
       [-0.5, -0.5]])

In [16]:
# Same construction as in gen_ran_data, here with 5 stocks, 5 features and
# 2 periods: one (5, 5) slab per period whose columns are random permutations
# of linspace(-1, 1, 5), stacked vertically into a (10, 5) array.
np.vstack([np.vstack([np.random.permutation(np.linspace(-1,1,5)) for _ in range(5)]).T for n in range(2)])

array([[ 1. , -0.5,  0.5, -1. ,  1. ],
       [-1. ,  0.5,  0. ,  0. , -1. ],
       [ 0. , -1. , -1. , -0.5,  0. ],
       [-0.5,  0. , -0.5,  1. , -0.5],
       [ 0.5,  1. ,  1. ,  0.5,  0.5],
       [ 0.5, -0.5, -0.5,  0.5, -1. ],
       [ 0. ,  0. , -1. , -0.5, -0.5],
       [-0.5,  0.5,  1. , -1. ,  1. ],
       [-1. , -1. ,  0.5,  0. ,  0. ],
       [ 1. ,  1. ,  0. ,  1. ,  0.5]])

In [1]:
def add_loss_weight(sample_df):
    """Return a copy of ``sample_df`` with a per-date ``loss_weight`` column.

    ``loss_weight`` is ``1 / (number of non-null RET values on that date)``,
    so the loss weights of the counted rows sum to one within each date.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Panel with at least the columns ``date`` and ``RET``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Same rows, index and column order as the input, with ``loss_weight``
        added as the last column. If the column already exists it is
        recomputed in place, so calling the function twice is harmless
        (a merge would have produced ``loss_weight_x`` / ``loss_weight_y``).
    """
    # transform("count") broadcasts each date's non-null RET count back onto
    # the rows, so no merge (and no suffix handling) is needed.
    obs_per_date = sample_df.groupby("date")["RET"].transform("count")

    return sample_df.assign(loss_weight=1 / obs_per_date)

In [11]:
# One-off cleanup: remove the "loss_weight" column from the training panel
# and write the result back over the same CSV file.
# NOTE(review): not re-runnable - once the file has been rewritten, the drop
# below raises KeyError because the column no longer exists.
train_df=pd.read_csv("/mnt/work/hc2235/Panel_Tree_replication/data_preparation/output/weighted_trainp.csv",index_col=0)
train_df.drop(columns=["loss_weight"],inplace=True)
train_df.to_csv("/mnt/work/hc2235/Panel_Tree_replication/data_preparation/output/weighted_trainp.csv")


In [8]:
    # Build "toy" train/test panels by dropping the feature columns f4..f50
    # from the loss-weighted panels, and save them under test_sample_generation.
    import pandas as pd
    import os
    # NOTE(review): assigning to os.curdir only rebinds that module attribute;
    # it does not change the working directory (os.chdir does). Every path
    # below is absolute, so this line has no effect on the reads and writes here.
    os.curdir="/mnt/work/hc2235/Panel_Tree_replication/data_preparation/code"
    train_df=pd.read_csv(os.path.join("/mnt/work/hc2235/Panel_Tree_replication/data_preparation/output","weighted_trainp_loss_weight.csv"),index_col=0)
    test_df=pd.read_csv(os.path.join("/mnt/work/hc2235/Panel_Tree_replication/data_preparation/output","weighted_testp_loss_weight.csv"),index_col=0)
    
    # drop() raises KeyError if any of f4..f50 is missing from the files.
    train_df_toy=train_df.drop(columns=[f"f{_}" for _ in range(4,51)])
    test_df_toy=test_df.drop(columns=[f"f{_}" for _ in range(4,51)])
    
    train_df_toy.to_csv(os.path.join("/mnt/work/hc2235/Panel_Tree_replication/test_sample_generation","weighted_trainp_loss_weight_toy.csv"))
    test_df_toy.to_csv(os.path.join("/mnt/work/hc2235/Panel_Tree_replication/test_sample_generation","weighted_testp_loss_weight_toy.csv"))    

In [6]:
# List the parent of the kernel's current working directory; the path is
# relative, so the result depends on where the notebook was started.
os.listdir("..")

['.git',
 '.gitattributes',
 '.gitignore.txt',
 'LICENSE',
 'data_preparation',
 'grow_tree',
 'raw_data',
 'table_preparation',
 'test_sample_generation',
 '.gitignore',
 'README.md',
 'master_script.sh']

In [9]:
# Read the weighted test panel, attach the per-date loss weights with
# add_loss_weight (defined in an earlier cell) and save the result under a
# new file name, leaving the input CSV untouched.
test_df=pd.read_csv("/mnt/work/hc2235/Panel_Tree_replication/data_preparation/output/weighted_testp.csv",index_col=0)
test_df=add_loss_weight(test_df)
test_df.to_csv("/mnt/work/hc2235/Panel_Tree_replication/data_preparation/output/weighted_testp_loss_weight.csv")
