In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
from joblib import parallel_backend

In [115]:
# Materialise the glob into a sorted list: Path.glob() returns a one-shot
# generator, which would be empty on any second iteration or cell re-run.
# Sorting also makes the processing order deterministic.
data_path = sorted(Path(r'../data/').glob('*.json'))

In [116]:
def _top_records(frame, sort_col, ascending, min_significant, limit):
    """Return up to `limit` rows of `frame` as record dicts, ranked by `sort_col`.

    If more than `min_significant` rows are statistically significant, only
    those rows are ranked. Otherwise every row is kept, with significant rows
    placed first and `sort_col` ordering applied within each group.
    """
    significant = frame[frame['is_stat_sig'] == 1]
    if len(significant) > min_significant:
        ranked = significant.sort_values(sort_col, ascending=ascending)
    else:
        # Significant rows always lead (is_stat_sig descending); the ranking
        # column keeps the same direction as in the significant-only branch.
        ranked = frame.sort_values(['is_stat_sig', sort_col], ascending=[False, ascending])
    # Helper columns are internal; 'is_stat_sig' is intentionally kept in the output.
    return ranked.drop(['is_neg', 'abs_stock'], axis=1).head(limit).to_dict(orient='records')


def dataframe_actions(comparison_list):
    """Pick the strongest positive, strongest negative and least correlated comparisons.

    Parameters
    ----------
    comparison_list : list of dict
        Records with the keys 'name', 'cor_coef' and 'p-val'.

    Returns
    -------
    tuple of (list, list, list)
        ``(pos_dict, neg_dict, dec_dict)``, each a list of record dicts holding
        the input keys plus 'is_stat_sig' (1 when 'p-val' < 0.05, else 0):

        * pos_dict -- up to 50 rows with 'cor_coef' > 0, largest coefficient first.
        * neg_dict -- up to 50 rows with 'cor_coef' <= 0, most negative first.
        * dec_dict -- up to 10 rows with the smallest absolute coefficient first.

        For each list, when enough significant rows exist (more than 30 for
        pos/neg, more than 10 for dec) only significant rows are used;
        otherwise all rows are used with significant ones ranked first.
        An empty input yields three empty lists.
    """
    df = pd.DataFrame(comparison_list)
    if df.empty:
        # No columns exist to index into; nothing to rank.
        return [], [], []

    df['is_neg'] = np.where(df['cor_coef'] <= 0, 1, 0)
    df['is_stat_sig'] = np.where(df['p-val'] < 0.05, 1, 0)
    df['abs_stock'] = np.absolute(df['cor_coef'])

    df_pos = df[df['is_neg'] == 0]
    df_neg = df[df['is_neg'] == 1]

    # De-correlated: coefficients closest to zero.
    dec_dict = _top_records(df, 'abs_stock', ascending=True, min_significant=10, limit=10)
    # Positives: largest coefficient first.
    pos_dict = _top_records(df_pos, 'cor_coef', ascending=False, min_significant=30, limit=50)
    # Negatives: most negative coefficient first.
    neg_dict = _top_records(df_neg, 'cor_coef', ascending=True, min_significant=30, limit=50)

    return pos_dict, neg_dict, dec_dict
        
        

In [117]:
# Materialise the glob into a sorted list: Path.glob() returns a one-shot
# generator, so a loop over it would silently do nothing when re-run
# without re-running this cell. Sorting makes the order deterministic.
test_path = sorted(Path(r'../data-test/').glob('*.json'))

In [118]:
# Replace each file's full 'comparisons' mapping with the three ranked lists
# from dataframe_actions, then write the result under ../data-processed/.
for i in test_path:
    with open(i, 'r') as json_file:
        data = json.load(json_file)

    # The top-level key of each document is the file's stem.
    entry = data[i.stem]
    comparison_list = [
        {'name': comp['referencing'], 'cor_coef': comp['cor_coef'], 'p-val': comp['p-val']}
        for comp in entry['comparisons'].values()
    ]

    pos_dict, neg_dict, dec_dict = dataframe_actions(comparison_list)
    entry.pop('comparisons')
    entry['pos_vals'] = pos_dict
    entry['neg_vals'] = neg_dict
    entry['decorrelated_vals'] = dec_dict

    # Build the path with a real separator; plain string concatenation
    # produced '../data-processed<name>.json' in the parent directory.
    out_path = Path('../data-processed') / i.name
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'w') as json_file:
        json.dump(data, json_file)
    
    
    