## Import Libraries and Log-in to the platform

In [None]:
# !pip install -r requirements.txt
# !pip install urllib

In [None]:
import os 
import utils
import urllib
import requests
import pandas as pd
import random as rand
import plotly.express as px
import helpful_functions as hf

from tqdm import tqdm

In [None]:
# Output directories, one per pipeline stage (the section numbers match the
# notebook headings).
DIR_SINGLE_SIGNAL = "./1_SINGLE_SIGNAL"
DIR_SIGNAL_COMBINATION = "./2_SIGNAL_COMBINATION"
DIR_SIGNAL_NEUTRALIZATION = "./3_SIGNAL_NEUTRALIZATION"
DIR_IMPROVEMENT = "./4_IMPROVEMENT"

# Only datafields whose coverage is above this threshold are used.
DATA_COVERAGE = 0.75

# Window lengths (in business days) used in the generated expressions.
BUSINESS_DAY_1_YEAR = 252
BUSINESS_DAY_6_MONTHS = 126

# Create every output directory up front. DIR_IMPROVEMENT is included because
# the improvement cells write CSV files into it; exist_ok avoids the
# check-then-create race of `if not os.path.exists(...)`.
for _stage_dir in (
    DIR_SINGLE_SIGNAL,
    DIR_SIGNAL_COMBINATION,
    DIR_SIGNAL_NEUTRALIZATION,
    DIR_IMPROVEMENT,
):
    os.makedirs(_stage_dir, exist_ok=True)

# Session object passed to every helper call below.
s = utils.start_session()

## Download datasets

- Dataset ∋ Datafield

In [None]:
# Simulation scope used by the cells below.
REGION = 'USA'
DELAY = 1
UNIVERSE = 'TOP3000'

# Ask the helper module for the datasets matching that scope.
datasets_df = hf.get_datasets(s, region=REGION, delay=DELAY, universe=UNIVERSE)

In [None]:
# Keep only the datasets matching the configured scope, best valueScore first.
# The filter uses the DELAY / REGION / UNIVERSE constants (instead of repeating
# their literal values) so that changing the scope in one place is enough.
selected_datasets_df = datasets_df[
    (datasets_df['delay'] == DELAY)
    & (datasets_df['region'] == REGION)
    & (datasets_df['universe'] == UNIVERSE)
].sort_values(by=['valueScore'], ascending=False)


In [None]:
# Display the selected datasets, highest valueScore first.
selected_datasets_df.sort_values(by='valueScore', ascending=False)

Unnamed: 0,id,name,description,category,subcategory,region,delay,universe,coverage,turnover,valueScore,userCount,alphaCount,fieldCount,researchPapers
14,univ1,Universe Dataset,No dataset description,pv,pv-price-volume,USA,1,TOP3000,0.448,,3.0,13,25,5,[]
7,news18,Ravenpack News Data,This dataset provides news sentiment and other...,news,news-news-sentiment,USA,1,TOP3000,0.6779,,2.0,586,7880,75,[{'title': 'Research Paper 01: The Momentum of...
0,analyst4,Analyst Estimate Data for Equity,This dataset provides details and aggregations...,analyst,analyst-analyst-estimates,USA,1,TOP3000,0.6036,,1.0,3527,51966,350,[]
1,fundamental2,Report Footnotes,This dataset holds fundamental items included ...,fundamental,fundamental-footnotes,USA,1,TOP3000,0.4089,,1.0,3427,30340,318,[]
2,fundamental6,Company Fundamental Data for Equity,Fundamental database covers most of the world'...,fundamental,fundamental-fundamental-data,USA,1,TOP3000,0.7357,,1.0,22727,669992,914,[{'title': 'Research Paper 03: Cross-Firm Info...
3,model16,Fundamental Scores,This dataset ranks stocks based on fundamental...,model,model-valuation-models,USA,1,TOP3000,0.3077,,1.0,133,635,8,[{'title': 'Research Paper 01: The Momentum of...
4,model38,Growth Valuation Model,This dataset is a stock ranking model that sor...,model,model-valuation-models,USA,1,TOP3000,0.7192,,1.0,451,22408,57,[]
5,model51,Systematic Risk Metrics,This is a risk-model data offering several met...,model,model-risk-models,USA,1,TOP3000,0.7724,,1.0,1602,5089,16,[]
6,news12,US News Data,This dataset specializes in matching financial...,news,news-news,USA,1,TOP3000,0.8034,,1.0,5068,40993,322,[]
8,option8,Volatility Data,This is an option dataset which provides histo...,option,option-option-volatility,USA,1,TOP3000,0.6948,,1.0,2127,27982,64,[]


## 1. Single Signal Browsing in Each Datafield

In [None]:
# Layer 1: for every selected dataset, build five simple expressions per
# datafield, simulate them in batches of 10 and append the results to
# <DIR_SINGLE_SIGNAL>/<dataset_id>_layer1.csv after every batch.
for dataset_id in selected_datasets_df.sort_values('valueScore', ascending=False).id.values:

    # 1) LOAD DATAFIELDS (only those whose coverage exceeds DATA_COVERAGE)
    datafields_df = hf.get_datafields(s, dataset_id=dataset_id).query(f"{DATA_COVERAGE} < coverage")

    # 2) MAKE EXPRESSION LIST
    # ts_backfill : to backfill empty data (VECTOR fields are first reduced with vec_avg)
    # 5 operators : rank, scale, ts_skewness, ts_rank, ts_zscore
    backfilled = [
        f'ts_backfill({"vec_avg" if datafield["type"] == "VECTOR" else ""}({datafield["id"]}),{BUSINESS_DAY_1_YEAR})'
        for _, datafield in datafields_df.iterrows()
    ]
    expression_list = (
        [f'rank( {field} )' for field in backfilled]
        + [f'scale( {field} )' for field in backfilled]
        + [f'ts_skewness( {field},{BUSINESS_DAY_6_MONTHS} )' for field in backfilled]
        + [f'ts_rank( {field},{BUSINESS_DAY_6_MONTHS} )' for field in backfilled]
        + [f'ts_zscore( {field},{BUSINESS_DAY_6_MONTHS} )' for field in backfilled]
    )

    # Drop expressions that were already simulated and saved by an earlier run.
    # Only *.csv entries are read so stray files/directories do not break the cell.
    saved_files = [name for name in os.listdir(DIR_SINGLE_SIGNAL) if name.endswith('.csv')]
    if saved_files:
        existed = set(pd.concat(
            [pd.read_csv(f"{DIR_SINGLE_SIGNAL}/{name}", usecols=['regular']) for name in saved_files]
        )['regular'])
        expression_list = [x for x in expression_list if x not in existed]

    # 3) GENERATE ALPHA LIST
    alpha_list = [utils.generate_alpha(x, region=REGION, universe=UNIVERSE,) for x in expression_list]
    print(dataset_id, len(expression_list))

    # 4) MULTISIMULATE ALPHAS
    out_path = DIR_SINGLE_SIGNAL + f'/{dataset_id}_layer1.csv'
    # Start from the rows already on disk (if any) so new results are appended.
    res_df = None
    if alpha_list and os.path.exists(out_path):
        res_df = pd.read_csv(out_path).drop(columns='Unnamed: 0', errors='ignore')

    result = []
    for x in range(0, len(alpha_list), 10):
        batch = alpha_list[x:x + 10]  # slicing already clamps at the end of the list
        try:
            # Each batch is simulated exactly once.
            batch_result = utils.simulate_alpha_list_multi(s, batch)
            result.append(batch_result)

            # A failed simulation comes back with is_stats / is_tests set to None;
            # such entries are skipped instead of aborting the whole dataset.
            frames = [
                pd.DataFrame(pd.concat([
                    pd.Series(data=[res['simulate_data']['regular']], index=['regular']),
                    pd.Series(res['simulate_data']['settings']),
                    res['is_stats'].iloc[0],
                    pd.Series(data=res['is_tests'].result.values, index=res['is_tests'].name),
                ])).T
                for res in batch_result
                if res['is_stats'] is not None and res['is_tests'] is not None
            ]
            if not frames:
                print(f'err:{dataset_id} | batch starting at {x}: no successful simulation')
                continue
            temp_df = pd.concat(frames).reset_index(drop=True)

            res_df = temp_df if res_df is None else pd.concat([res_df, temp_df])
            # Save after every batch so an interruption loses at most one batch.
            res_df.to_csv(out_path)
        except Exception as e:
            print(f'err:{dataset_id} | batch starting at {x}: {e!r}')

univ1 0
news18 185


  0%|          | 0/4 [00:00<?, ?it/s]

{"detail":"You do not have permission to perform this action."}
{"detail":"You do not have permission to perform this action."}
{"detail":"You do not have permission to perform this action."}


100%|██████████| 4/4 [00:40<00:00, 10.07s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

{"detail":"You do not have permission to perform this action."}
{"detail":"You do not have permission to perform this action."}
{"detail":"You do not have permission to perform this action."}


100%|██████████| 4/4 [01:11<00:00, 17.81s/it]


temp_result: 1 | print: {'type': 'REGULAR', 'settings': {'nanHandling': 'OFF', 'instrumentType': 'EQUITY', 'delay': 1, 'universe': 'TOP3000', 'truncation': 0.08, 'unitHandling': 'VERIFY', 'pasteurization': 'ON', 'region': 'USA', 'language': 'FASTEXPR', 'decay': 0, 'neutralization': 'INDUSTRY', 'visualization': False}, 'regular': 'rank( ts_backfill(vec_avg(nws18_acb),252) )'}
[[{'alpha_id': None, 'simulate_data': {'type': 'REGULAR', 'settings': {'nanHandling': 'OFF', 'instrumentType': 'EQUITY', 'delay': 1, 'universe': 'TOP3000', 'truncation': 0.08, 'unitHandling': 'VERIFY', 'pasteurization': 'ON', 'region': 'USA', 'language': 'FASTEXPR', 'decay': 0, 'neutralization': 'INDUSTRY', 'visualization': False}, 'regular': 'rank( ts_backfill(vec_avg(nws18_acb),252) )'}, 'is_stats': None, 'pnl': None, 'stats': None, 'is_tests': None}, {'alpha_id': None, 'simulate_data': {'type': 'REGULAR', 'settings': {'nanHandling': 'OFF', 'instrumentType': 'EQUITY', 'delay': 1, 'universe': 'TOP3000', 'truncatio

  0%|          | 0/4 [00:00<?, ?it/s]

{"detail":"You do not have permission to perform this action."}
{"detail":"You do not have permission to perform this action."}
{"detail":"You do not have permission to perform this action."}


## 2. Signal Combining in Each Dataset

In [None]:
def pick_signal(dataset_id, signal_dir=None):
    """Randomly pick one layer-1 signal, favouring large absolute Sharpe ratios.

    Reads ``<signal_dir>/<dataset_id>_layer1.csv`` and keeps the rows whose
    ``CONCENTRATED_WEIGHT`` test is ``"PASS"`` and whose ``sharpe`` is a
    non-zero number. Each remaining row gets the weight ``min(sharpe**4, 8)``
    (so negative Sharpe ratios count as much as positive ones, and every row
    with ``sharpe**4 >= 8`` is equally likely); one row is then drawn with
    probability proportional to its weight.

    Args:
        dataset_id: Dataset identifier used to build the CSV file name.
        signal_dir: Directory holding the layer-1 CSV files. Defaults to
            ``DIR_SINGLE_SIGNAL``.

    Returns:
        dict with the keys ``'regular'`` (the expression) and ``'sharpe'``
        (its Sharpe ratio) of the picked row.

    Raises:
        ValueError: If no row qualifies as a candidate.
    """
    if signal_dir is None:
        signal_dir = DIR_SINGLE_SIGNAL
    df = pd.read_csv(f"{signal_dir}/{dataset_id}_layer1.csv")

    # Rows with a NaN sharpe could never be drawn anyway; drop them explicitly.
    df_cand = df[
        (df['CONCENTRATED_WEIGHT'] == "PASS")
        & (df['sharpe'] != 0)
        & df['sharpe'].notna()
    ]
    if df_cand.empty:
        raise ValueError(f"no candidate signal for dataset {dataset_id!r}")

    prob = (df_cand['sharpe'] ** 4).clip(upper=8)
    prob_cum = (prob / prob.sum()).cumsum().to_numpy()

    randnum = rand.random()
    above = prob_cum > randnum
    # Floating-point round-off can leave the last cumulative value slightly
    # below 1.0; fall back to the last candidate instead of raising.
    position = int(above.argmax()) if above.any() else len(df_cand) - 1

    chosen = df_cand.iloc[position]
    return {
        'regular': chosen['regular'],
        'sharpe': chosen['sharpe'],
    }

In [None]:
# Same layer-1 simulation as in section 1, with a progress bar and per-batch
# error handling: a failing batch is reported and skipped instead of aborting
# the whole dataset.
for dataset_id in selected_datasets_df.sort_values('valueScore', ascending=False).id.values:
    try:
        # 1) LOAD DATAFIELDS (only those whose coverage exceeds DATA_COVERAGE)
        datafields_df = hf.get_datafields(s, dataset_id=dataset_id).query(f"{DATA_COVERAGE} < coverage")

        # 2) MAKE EXPRESSION LIST
        # ts_backfill fills empty data; VECTOR fields are first reduced with vec_avg.
        backfilled = [
            f'ts_backfill({"vec_avg" if datafield["type"] == "VECTOR" else ""}({datafield["id"]}),{BUSINESS_DAY_1_YEAR})'
            for _, datafield in datafields_df.iterrows()
        ]
        expression_list = (
            [f'rank( {field} )' for field in backfilled]
            + [f'scale( {field} )' for field in backfilled]
            + [f'ts_skewness( {field},{BUSINESS_DAY_6_MONTHS} )' for field in backfilled]
            + [f'ts_rank( {field},{BUSINESS_DAY_6_MONTHS} )' for field in backfilled]
            + [f'ts_zscore( {field},{BUSINESS_DAY_6_MONTHS} )' for field in backfilled]
        )

        # Drop expressions that were already simulated and saved by an earlier run.
        saved_files = [name for name in os.listdir(DIR_SINGLE_SIGNAL) if name.endswith('.csv')]
        if saved_files:
            existed = set(pd.concat(
                [pd.read_csv(f"{DIR_SINGLE_SIGNAL}/{name}", usecols=['regular']) for name in saved_files]
            )['regular'])
            expression_list = [x for x in expression_list if x not in existed]

        # 3) GENERATE ALPHA LIST (use the shared scope constants, not literals)
        alpha_list = [utils.generate_alpha(x, region=REGION, universe=UNIVERSE) for x in expression_list]
        print(dataset_id, len(expression_list))

        # 4) MULTISIMULATE ALPHAS
        out_path = DIR_SINGLE_SIGNAL + f'/{dataset_id}_layer1.csv'
        # Initialise the accumulator per dataset. Previously it was only set in
        # the first batch, so a failure there left it undefined or, worse,
        # still holding the previous dataset's rows.
        res_df = None
        if alpha_list and os.path.exists(out_path):
            res_df = pd.read_csv(out_path).drop(columns='Unnamed: 0', errors='ignore')

        result = []
        with tqdm(total=len(alpha_list), desc=f'Simulating {dataset_id}') as pbar:
            for x in range(0, len(alpha_list), 10):
                current_alphas = alpha_list[x:x + 10]  # slicing clamps at the list end
                try:
                    print(f"Simulating alphas from {x} to {x + len(current_alphas) - 1}")
                    batch_result = utils.simulate_alpha_list_multi(s, current_alphas)
                    result.append(batch_result)

                    # Failed simulations come back with is_stats / is_tests set
                    # to None; skip them rather than losing the whole batch.
                    frames = [
                        pd.DataFrame(pd.concat([
                            pd.Series(data=[res['simulate_data']['regular']], index=['regular']),
                            pd.Series(res['simulate_data']['settings']),
                            res['is_stats'].iloc[0],
                            pd.Series(data=res['is_tests'].result.values, index=res['is_tests'].name),
                        ])).T
                        for res in batch_result
                        if res['is_stats'] is not None and res['is_tests'] is not None
                    ]
                    # Report progress without dumping the ever-growing result list.
                    print(f"result: {len(result)} batches | successful in this batch: {len(frames)}/{len(current_alphas)}")
                    if not frames:
                        continue
                    temp_df = pd.concat(frames).reset_index(drop=True)

                    res_df = temp_df if res_df is None else pd.concat([res_df, temp_df])
                    # Save after every batch so an interruption loses at most one batch.
                    res_df.to_csv(out_path)
                except Exception as e:
                    print(f'Error processing alpha list segment {x} for dataset {dataset_id}: {str(e)}')
                finally:
                    pbar.update(len(current_alphas))
    except Exception as e:
        print(f'Error processing dataset {dataset_id}: {str(e)}')


In [None]:
# Layer 2: for every dataset with layer-1 results, combine pairs of randomly
# picked layer-1 signals, simulate the combinations in batches of 10 and append
# the results to <DIR_SIGNAL_COMBINATION>/<dataset_id>_layer2.csv.
LAYER1_SUFFIX = '_layer1.csv'

# Only files that really are layer-1 results are considered, so stray entries
# in the directory (no underscore, other suffix) cannot break the loop.
for dataset_id in [name[:-len(LAYER1_SUFFIX)] for name in os.listdir(DIR_SINGLE_SIGNAL) if name.endswith(LAYER1_SUFFIX)]:

    # if there is no meaningful signals in layer1 simulations, pass the loop
    layer1_alphas = pd.read_csv(DIR_SINGLE_SIGNAL + f"/{dataset_id}_layer1.csv")
    if not (layer1_alphas.sharpe.abs() >= 1).any():
        continue

    # 1) MAKE 500 COMBINATION EXPRESSIONS
    expression_list = []
    try:
        for _ in range(500):
            signal1 = pick_signal(dataset_id)
            signal2 = pick_signal(dataset_id)
            signal_def = f"""signal1 = scale({signal1['regular']});
                        signal2 = scale({signal2['regular']});
                        """
            # A leading '-' flips the combination when the picked Sharpe ratio(s)
            # indicate a negatively performing signal.
            sign_both = '' if signal1['sharpe'] * signal2['sharpe'] > 0 else '-'
            sign_first = '' if signal1['sharpe'] > 0 else '-'

            comb_cand = [
                f"{sign_both}signal1*(1+signal2)",
                f"{sign_first}vector_neut(signal1,signal2)",
                f"{sign_first}vector_proj(signal1,signal2)",
                f"{sign_first}regression_neut(signal1,signal2)",
                f"{sign_first}regression_proj(signal1,signal2)",
            ]
            expression_list.append(signal_def + rand.choice(comb_cand))
    except (IndexError, ValueError) as e:
        # pick_signal found no row passing its filter for this dataset.
        print(f'err:{dataset_id} | no signal could be picked: {e!r}')
        continue
    # Remove duplicates while keeping the generation order.
    expression_list = list(dict.fromkeys(expression_list))

    # Drop expressions that were already simulated and saved by an earlier run.
    saved_files = [name for name in os.listdir(DIR_SIGNAL_COMBINATION) if name.endswith('.csv')]
    if saved_files:
        existed = set(pd.concat(
            [pd.read_csv(f"{DIR_SIGNAL_COMBINATION}/{name}", usecols=['regular']) for name in saved_files]
        )['regular'])
        expression_list = [x for x in expression_list if x not in existed]

    # 2) GENERATE ALPHA LIST
    alpha_list = [utils.generate_alpha(x, region=REGION, universe=UNIVERSE,) for x in expression_list]
    print(dataset_id, len(expression_list))

    # 3) MULTISIMULATE ALPHAS
    out_path = DIR_SIGNAL_COMBINATION + f'/{dataset_id}_layer2.csv'
    # Start from the rows already on disk (if any) so new results are appended.
    res_df = None
    if alpha_list and os.path.exists(out_path):
        res_df = pd.read_csv(out_path).drop(columns='Unnamed: 0', errors='ignore')

    result = []
    for x in range(0, len(alpha_list), 10):
        try:
            batch_result = utils.simulate_alpha_list_multi(s, alpha_list[x:x + 10])
            result.append(batch_result)

            # Entries whose simulation failed (is_stats / is_tests is None) are skipped.
            frames = [
                pd.DataFrame(pd.concat([
                    pd.Series(data=[res['simulate_data']['regular']], index=['regular']),
                    pd.Series(res['simulate_data']['settings']),
                    res['is_stats'].iloc[0],
                    pd.Series(data=res['is_tests'].result.values, index=res['is_tests'].name),
                ])).T
                for res in batch_result
                if res['is_stats'] is not None and res['is_tests'] is not None
            ]
            if not frames:
                print(f'err:{dataset_id} | batch starting at {x}: no successful simulation')
                continue
            temp_df = pd.concat(frames).reset_index(drop=True)

            res_df = temp_df if res_df is None else pd.concat([res_df, temp_df])
            # Save after every batch so an interruption loses at most one batch.
            res_df.to_csv(out_path)
        except Exception as e:
            print(f'err:{dataset_id} | batch starting at {x}: {e!r}')

## 3. Neutralize Signals

In [None]:
# pick combined signal according to its sharpe ratio
def pick_signal_3(dataset_id, signal_dir=None):
    """Randomly pick one layer-2 (combined) signal, favouring large |Sharpe|.

    Reads ``<signal_dir>/<dataset_id>_layer2.csv`` and keeps the rows whose
    ``CONCENTRATED_WEIGHT`` test is ``"PASS"`` and whose ``sharpe`` is a
    non-zero number. Each remaining row gets the weight ``min(sharpe**4, 8)``
    and one row is drawn with probability proportional to its weight.

    The function now takes ``dataset_id`` (the notebook calls it as
    ``pick_signal_3(dataset_id)``) and formats it into the file name; the
    previous version had no parameter and read the literal file name
    ``{dataset_id}_layer2.csv``.

    Args:
        dataset_id: Dataset identifier used to build the CSV file name.
        signal_dir: Directory holding the layer-2 CSV files. Defaults to
            ``DIR_SIGNAL_COMBINATION``.

    Returns:
        dict with the keys ``'regular'`` (the expression) and ``'sharpe'``
        (its Sharpe ratio) of the picked row.

    Raises:
        ValueError: If no row qualifies as a candidate.
    """
    if signal_dir is None:
        signal_dir = DIR_SIGNAL_COMBINATION
    df = pd.read_csv(f"{signal_dir}/{dataset_id}_layer2.csv")

    # Rows with a NaN sharpe could never be drawn anyway; drop them explicitly.
    df_cand = df[
        (df['CONCENTRATED_WEIGHT'] == "PASS")
        & (df['sharpe'] != 0)
        & df['sharpe'].notna()
    ]
    if df_cand.empty:
        raise ValueError(f"no candidate combined signal for dataset {dataset_id!r}")

    prob = (df_cand['sharpe'] ** 4).clip(upper=8)
    prob_cum = (prob / prob.sum()).cumsum().to_numpy()

    randnum = rand.random()
    above = prob_cum > randnum
    # Floating-point round-off can leave the last cumulative value slightly
    # below 1.0; fall back to the last candidate instead of raising.
    position = int(above.argmax()) if above.any() else len(df_cand) - 1

    chosen = df_cand.iloc[position]
    return {'regular': chosen['regular'], 'sharpe': chosen['sharpe']}

In [None]:

# Layer 3: take combined (layer-2) signals, optionally group-neutralize them
# against a randomly chosen group datafield, simulate them with a random
# neutralization setting and append the results to
# <DIR_SIGNAL_NEUTRALIZATION>/<dataset_id>_layer3.csv.

# 1) LOAD GROUP DATAFIELDS
groups = list(pd.concat([hf.get_datafields(s, dataset_id=dataset_id).query(f"{DATA_COVERAGE} < coverage") for dataset_id in ['pv13','pv29','pv30']]).id)

LAYER2_SUFFIX = '_layer2.csv'
NEUTRALIZATIONS = ["MARKET", "INDUSTRY", "SLOW", "FAST", "SLOW_AND_FAST"]

# Iterate over the layer-2 result files written by the combination step
# (DIR_SIGNAL_COMBINATION). Listing only '*_layer2.csv' files also guarantees
# that the layer-2 simulation for the dataset has been done.
for dataset_id in [name[:-len(LAYER2_SUFFIX)] for name in os.listdir(DIR_SIGNAL_COMBINATION) if name.endswith(LAYER2_SUFFIX)]:

    # if there is no meaningful signals in layer2 simulations, pass the loop
    layer2_alphas = pd.read_csv(DIR_SIGNAL_COMBINATION + f"/{dataset_id}_layer2.csv")
    if not (layer2_alphas.sharpe.abs() >= 1).any():
        continue

    # 2) MAKE 500 COMBINATION EXPRESSIONS
    expression_list = []
    try:
        for _ in range(500):
            signal = pick_signal_3(dataset_id)
            signal_lines = signal['regular'].split('\n')
            # The last line of a layer-2 expression is the combined signal;
            # bind it to `alpha` so it can be post-processed.
            signal_last_line = ['alpha = ' + signal_lines[-1] + ';']
            # randint is inclusive: the extra value len(groups) means
            # "no group neutralization".
            randnum = rand.randint(0, len(groups))
            if randnum != len(groups):
                last_line = [f'{"" if signal["sharpe"]>0 else "-"}group_neutralize(alpha, densify({groups[randnum]}))']
            else:
                # NOTE(review): no sign flip is applied in this branch even when
                # the picked sharpe is negative - confirm this is intended.
                last_line = ['alpha']
            expression_list.append('\n'.join(signal_lines[:-1] + signal_last_line + last_line))
    except (IndexError, ValueError) as e:
        # pick_signal_3 found no row passing its filter for this dataset.
        print(f'err:{dataset_id} | no signal could be picked: {e!r}')
        continue
    # Remove duplicates while keeping the generation order.
    expression_list = list(dict.fromkeys(expression_list))

    # Drop expressions that were already simulated and saved by an earlier run.
    saved_files = [name for name in os.listdir(DIR_SIGNAL_NEUTRALIZATION) if name.endswith('.csv')]
    if saved_files:
        existed = set(pd.concat(
            [pd.read_csv(f"{DIR_SIGNAL_NEUTRALIZATION}/{name}", usecols=['regular']) for name in saved_files]
        )['regular'])
        expression_list = [x for x in expression_list if x not in existed]

    # 3) GENERATE ALPHA LIST (each alpha gets a randomly chosen neutralization setting)
    alpha_list = [utils.generate_alpha(x, region=REGION, universe=UNIVERSE, neutralization=rand.choice(NEUTRALIZATIONS)) for x in expression_list]
    print(dataset_id, len(expression_list))

    # 4) MULTISIMULATE ALPHAS
    out_path = DIR_SIGNAL_NEUTRALIZATION + f'/{dataset_id}_layer3.csv'
    # Start from the rows already on disk (if any) so new results are appended.
    res_df = None
    if alpha_list and os.path.exists(out_path):
        res_df = pd.read_csv(out_path).drop(columns='Unnamed: 0', errors='ignore')

    result = []
    for x in range(0, len(alpha_list), 10):
        try:
            batch_result = utils.simulate_alpha_list_multi(s, alpha_list[x:x + 10])
            result.append(batch_result)

            # Entries whose simulation failed (is_stats / is_tests is None) are skipped.
            frames = [
                pd.DataFrame(pd.concat([
                    pd.Series(data=[res['simulate_data']['regular']], index=['regular']),
                    pd.Series(res['simulate_data']['settings']),
                    res['is_stats'].iloc[0],
                    pd.Series(data=res['is_tests'].result.values, index=res['is_tests'].name),
                ])).T
                for res in batch_result
                if res['is_stats'] is not None and res['is_tests'] is not None
            ]
            if not frames:
                print(f'err:{dataset_id} | batch starting at {x}: no successful simulation')
                continue
            temp_df = pd.concat(frames).reset_index(drop=True)

            res_df = temp_df if res_df is None else pd.concat([res_df, temp_df])
            # Save after every batch so an interruption loses at most one batch.
            res_df.to_csv(out_path)
        except Exception as e:
            print(f'err:{dataset_id} | batch starting at {x}: {e!r}')

## 4. Improvement
Find the local optimum for alphas with good performance

### 1) decrease turnover: this process improves alphas with high turnover

In [None]:
# from all layers result, find candidate alphas
# (that pass the IS_LADDER_SHARPE, LOW_SHARPE, and CONCENTRATED_WEIGHT tests)
# The results are read from the directories the previous sections write to.
layer_dirs = (DIR_SINGLE_SIGNAL, DIR_SIGNAL_COMBINATION, DIR_SIGNAL_NEUTRALIZATION)
df = pd.concat([
    pd.read_csv(f'{layer_dir}/{file_name}').drop(columns='Unnamed: 0', errors='ignore')
    for layer_dir in layer_dirs
    for file_name in os.listdir(layer_dir)
    if file_name.endswith('.csv')
]).reset_index(drop=True)

candidate = df[
    (df['IS_LADDER_SHARPE'] == 'PASS')
    & (df['LOW_SHARPE'] == 'PASS')
    & (df['CONCENTRATED_WEIGHT'] == 'PASS')
]

# 1) MAKE TURNOVER DECREASING GRID
# Each grid entry is indexed below as:
# [0] expression, [1] neutralization, [2] decay, [3] mother alpha, [4] grandmother alpha.
imp_alpha_list = sum([utils.alpha_grid_decrease_turnover(s,alpha_id) for alpha_id in list(candidate.alpha_id)],[])

os.makedirs(DIR_IMPROVEMENT, exist_ok=True)  # the result file is written here
improved_path = DIR_IMPROVEMENT + '/improved_decrease_turnover.csv'

res_df = None
if os.path.exists(improved_path):
    stored = pd.read_csv(improved_path).drop(columns='Unnamed: 0', errors='ignore')
    # Drop grid entries that were already simulated. A new list is built
    # instead of calling remove() while iterating, which skipped elements.
    imp_alpha_list = [
        alpha for alpha in imp_alpha_list
        if not ((stored.regular == alpha[0]) & (stored.neutralization == alpha[1]) & (stored.decay == alpha[2])).any()
    ]
    res_df = stored  # new results are appended to the stored ones

# 2) MAKE ALPHA LIST
alpha_list = [utils.generate_alpha(alpha[0], region=REGION, universe=UNIVERSE,neutralization=alpha[1],decay = alpha[2]) for alpha in imp_alpha_list]

# 3) MULTISIMULATE ALPHAS
iteration = 0
result = []
for x in range(0, len(alpha_list), 10):
    iteration += 1
    # Slices clamp at the end of the list, so the last (partial) batch no
    # longer raises IndexError when looking up the parent alphas.
    batch_alphas = alpha_list[x:x + 10]
    batch_grid = imp_alpha_list[x:x + 10]
    try:
        batch_result = utils.simulate_alpha_list_multi(s, batch_alphas)
        result.append(batch_result)

        # NOTE(review): results are paired with their grid entries by position,
        # as before - assumes simulate_alpha_list_multi preserves input order.
        frames, parents = [], []
        for res, grid_entry in zip(batch_result, batch_grid):
            if res['is_stats'] is None or res['is_tests'] is None:
                continue  # failed simulation: skip it together with its parents
            frames.append(pd.DataFrame(pd.concat([
                pd.Series(data=[res['simulate_data']['regular']], index=['regular']),
                pd.Series(res['simulate_data']['settings']),
                res['is_stats'].iloc[0],
                pd.Series(data=res['is_tests'].result.values, index=res['is_tests'].name),
            ])).T)
            parents.append((grid_entry[3], grid_entry[4]))
        if not frames:
            print(f'err, iteration = {iteration}: no successful simulation')
            continue

        temp_df = pd.concat([
            pd.concat(frames).reset_index(drop=True),
            pd.DataFrame(data=parents, columns=['mother_alpha', 'grandmother_alpha']),
        ], axis=1)

        res_df = temp_df if res_df is None else pd.concat([res_df, temp_df])
        # Save after every batch so an interruption loses at most one batch.
        res_df.reset_index(drop=True).to_csv(improved_path)
    except Exception as e:
        print(f'err, iteration = {iteration}: {e!r}')


### 2) change time horizon: this process tests alphas that use ts_operators with different time horizons

In [None]:

# from all layers result, find candidate alphas
# (that pass the IS_LADDER_SHARPE, LOW_SHARPE, and CONCENTRATED_WEIGHT tests)
# The results are read from the directories the previous sections write to.
layer_dirs = (DIR_SINGLE_SIGNAL, DIR_SIGNAL_COMBINATION, DIR_SIGNAL_NEUTRALIZATION)
df = pd.concat([
    pd.read_csv(f'{layer_dir}/{file_name}').drop(columns='Unnamed: 0', errors='ignore')
    for layer_dir in layer_dirs
    for file_name in os.listdir(layer_dir)
    if file_name.endswith('.csv')
]).reset_index(drop=True)

candidate = df[
    (df['IS_LADDER_SHARPE'] == 'PASS')
    & (df['LOW_SHARPE'] == 'PASS')
    & (df['CONCENTRATED_WEIGHT'] == 'PASS')
]

# 1) MAKE TIME HORIZON GRID
# Each grid entry is indexed below as:
# [0] expression, [1] neutralization, [2] decay, [3] mother alpha, [4] grandmother alpha.
imp_alpha_list = sum([utils.alpha_grid_time_horizon(s,alpha_id) for alpha_id in list(candidate.alpha_id)],[])
imp_alpha_list_original = imp_alpha_list.copy()
print(len(imp_alpha_list_original))

os.makedirs(DIR_IMPROVEMENT, exist_ok=True)  # the result file is written here
improved_path = DIR_IMPROVEMENT + '/improved_time_horizon.csv'

res_df = None
if os.path.exists(improved_path):
    stored = pd.read_csv(improved_path).drop(columns='Unnamed: 0', errors='ignore')
    # Drop grid entries that were already simulated and stored.
    imp_alpha_list = [
        alpha for alpha in imp_alpha_list_original
        if not ((stored.regular == alpha[0]) & (stored.neutralization == alpha[1]) & (stored.decay == alpha[2])).any()
    ]
    res_df = stored  # new results are appended to the stored ones

print(len(imp_alpha_list))

# 2) MAKE ALPHA LIST
alpha_list = [utils.generate_alpha(alpha[0], region=REGION, universe=UNIVERSE,neutralization=alpha[1],decay = alpha[2]) for alpha in imp_alpha_list]

# 3) MULTISIMULATE ALPHAS
iteration = 0
result = []
for x in range(0, len(alpha_list), 10):
    iteration += 1
    # Slices clamp at the end of the list, so the last (partial) batch no
    # longer raises IndexError when looking up the parent alphas.
    batch_alphas = alpha_list[x:x + 10]
    batch_grid = imp_alpha_list[x:x + 10]
    try:
        batch_result = utils.simulate_alpha_list_multi(s, batch_alphas)
        result.append(batch_result)

        # NOTE(review): results are paired with their grid entries by position,
        # as before - assumes simulate_alpha_list_multi preserves input order.
        frames, parents = [], []
        for res, grid_entry in zip(batch_result, batch_grid):
            if res['is_stats'] is None or res['is_tests'] is None:
                continue  # failed simulation: skip it together with its parents
            frames.append(pd.DataFrame(pd.concat([
                pd.Series(data=[res['simulate_data']['regular']], index=['regular']),
                pd.Series(res['simulate_data']['settings']),
                res['is_stats'].iloc[0],
                pd.Series(data=res['is_tests'].result.values, index=res['is_tests'].name),
            ])).T)
            parents.append((grid_entry[3], grid_entry[4]))
        if not frames:
            print(f'err, iteration = {iteration}: no successful simulation')
            continue

        temp_df = pd.concat([
            pd.concat(frames).reset_index(drop=True),
            pd.DataFrame(data=parents, columns=['mother_alpha', 'grandmother_alpha']),
        ], axis=1)

        res_df = temp_df if res_df is None else pd.concat([res_df, temp_df])
        # Save after every batch so an interruption loses at most one batch.
        res_df.reset_index(drop=True).to_csv(improved_path)
    except Exception as e:
        print(f'err, iteration = {iteration}: {e!r}')

## 5. Check production correlation
Check the production correlation to verify that the alpha is submittable.

In [None]:
# get all alphas from every result directory written by the previous sections
result_dirs = (DIR_SINGLE_SIGNAL, DIR_SIGNAL_COMBINATION, DIR_SIGNAL_NEUTRALIZATION, DIR_IMPROVEMENT)
total_df = pd.concat([
    pd.read_csv(f'{folder_name}/{file_name}').drop(columns='Unnamed: 0', errors='ignore')
    for folder_name in result_dirs
    if os.path.isdir(folder_name)
    for file_name in os.listdir(folder_name)
    if file_name.endswith('.csv')
]).reset_index(drop=True)

# 1) FILTER ALPHAS WHICH PASSED ALL TESTS
required_tests = [
    'LOW_SHARPE',
    'LOW_FITNESS',
    'LOW_TURNOVER',
    'HIGH_TURNOVER',
    'CONCENTRATED_WEIGHT',
    'IS_LADDER_SHARPE',
    'LOW_SUB_UNIVERSE_SHARPE',
]
all_pass_alphas = total_df[(total_df[required_tests] == 'PASS').all(axis=1)].copy().reset_index(drop=True)

# 2) GET INFORMATION FROM API AND SAVE IT INTO MAX_CORREL COLUMN AND HIGH_CORREL_ALPHAS COLUMN
# Both columns start as the placeholder 'not_calculated'; object dtype lets a
# cell later hold whatever the helper returns (number, list, ...).
all_pass_alphas['max_correl'] = pd.Series(['not_calculated'] * len(all_pass_alphas), index=all_pass_alphas.index, dtype=object)
all_pass_alphas['high_correl_alphas'] = pd.Series(['not_calculated'] * len(all_pass_alphas), index=all_pass_alphas.index, dtype=object)

for i, alpha_id in enumerate(tqdm(all_pass_alphas['alpha_id'])):
    try:
        max_co = hf.get_max_prod_corr(s, alpha_id)
        # .at sets exactly one cell, which also works when the value is a list.
        all_pass_alphas.at[i, 'max_correl'] = max_co[0]
        all_pass_alphas.at[i, 'high_correl_alphas'] = max_co[1]
    except Exception as e:
        # Keep the placeholder for this alpha, but report why it failed.
        print(f'err: production correlation for alpha {alpha_id}: {e!r}')

all_pass_alphas