In [16]:
import pandas as pd
import io

In [17]:
def _process_data_(file):
    """Parse a round log file into market-data and trade-history frames.

    The log text is cut at three markers: ``Sandbox logs:``,
    ``Activities log:`` and ``Trade History:``. The text between
    ``Activities log:`` and ``Trade History:`` is read as semicolon-separated
    CSV with a header row; the text after ``Trade History:`` is parsed as
    JSON and flattened with ``pd.json_normalize``.

    Parameters
    ----------
    file : str or path-like
        Path of the log file to read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(market_data_df, trade_history_df)``.

    Raises
    ------
    IndexError
        If any of the three section markers is missing from the file.
    json.JSONDecodeError
        If the text after ``Trade History:`` is not valid JSON.
    """
    # Imported locally: the cells above this definition import only pandas and
    # io, so a module-level `json` is not guaranteed to exist on a fresh kernel.
    import json

    # Use a distinct name for the handle so the `file` argument is not rebound.
    with open(file, 'r') as log_file:
        log_content = log_file.read()

    sections = log_content.split('Sandbox logs:')[1].split('Activities log:')
    activities_and_trades = sections[1].split('Trade History:')
    activities_log = activities_and_trades[0]
    trade_history = json.loads(activities_and_trades[1])

    market_data_df = pd.read_csv(io.StringIO(activities_log), sep=";", header=0)
    trade_history_df = pd.json_normalize(trade_history)
    return market_data_df, trade_history_df

# Which 2024 round log (i) and which 2023 CSV (j) to line up.
i, j = 1, 2

# 2024 round: one mid_price column per product, each suffixed "_curr".
df_24, _ = _process_data_(f'2024_data_logs/round_{i}.log')
df_24 = df_24.pivot(index='timestamp', columns='product', values='mid_price').reset_index()
df_24.columns = [df_24.columns[0], *(name + '_curr' for name in df_24.columns[1:])]

# 2023 data: same wide layout, each product suffixed "_past".
df_23 = (
    pd.read_csv(f"2023_data_logs/r{j}.csv", sep=';')
    .pivot(index='timestamp', columns='product', values='mid_price')
    .reset_index()
)
df_23.columns = [df_23.columns[0], *(name + '_past' for name in df_23.columns[1:])]

# Keep only the timestamps present in both frames.
df_test = pd.merge(df_24, df_23, on='timestamp', how='inner')

In [18]:
# Inspect the merged column names: 2024 products carry "_curr", 2023 products "_past".
df_test.columns

Index(['timestamp', 'AMETHYSTS_curr', 'STARFRUIT_curr', 'BANANAS_past',
       'COCONUTS_past', 'PEARLS_past', 'PINA_COLADAS_past'],
      dtype='object')

In [19]:
def get_prev_returns(df, col, its):
    """Append the relative change of ``col`` versus ``its`` rows earlier.

    Adds the column ``"{col}_returns_from_{its}_its_ago"`` holding
    ``(df[col] - df[col].shift(its)) / df[col].shift(its)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column to compute returns for.
    its : int
        Number of rows to look back.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # Keep the shifted values in a local Series rather than a scratch column:
    # writing and then dropping "{col}_prev_{its}_its" on the caller's frame
    # would overwrite and delete a real column that happens to have that name.
    previous = df[col].shift(its)
    df[f"{col}_returns_from_{its}_its_ago"] = (df[col] - previous) / previous
    return df

def get_future_returns(df, col, its):
    """Append the relative change of ``col`` from now to ``its`` rows ahead.

    Adds the column ``"{col}_returns_in_{its}_its"`` holding
    ``(df[col].shift(-its) - df[col]) / df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column to compute returns for.
    its : int
        Number of rows to look ahead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # Keep the shifted values in a local Series rather than a scratch column:
    # writing and then dropping "{col}_future_{its}_its" on the caller's frame
    # would overwrite and delete a real column that happens to have that name.
    upcoming = df[col].shift(-its)
    df[f"{col}_returns_in_{its}_its"] = (upcoming - df[col]) / df[col]
    return df

def get_centered_returns(df, col, its):
    """Append the relative change of ``col`` across a window centred on each row.

    Adds the column ``"{col}_returns_centered_with_{its}_its"`` holding
    ``(df[col].shift(-its) - df[col].shift(its)) / df[col].shift(its)``,
    i.e. the value ``its`` rows ahead relative to the value ``its`` rows back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column to compute returns for.
    its : int
        Half-width of the window, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # Local Series instead of scratch columns: writing and then dropping
    # "{col}_future_{its}_its" / "{col}_prev_{its}_its" on the caller's frame
    # would overwrite and delete real columns that happen to have those names.
    upcoming = df[col].shift(-its)
    previous = df[col].shift(its)
    df[f"{col}_returns_centered_with_{its}_its"] = (upcoming - previous) / previous
    return df

In [20]:
import os
import json
import io
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from tqdm import tqdm

def _process_data_(file):
    """Read a round log and return ``(market_data_df, trade_history_df)``.

    The text between the ``Activities log:`` and ``Trade History:`` markers
    is parsed as semicolon-separated CSV; the text after ``Trade History:``
    is parsed as JSON and flattened with ``pd.json_normalize``. A missing
    marker surfaces as ``IndexError``.
    """
    with open(file, 'r') as handle:
        raw_text = handle.read()

    after_sandbox = raw_text.split('Sandbox logs:')[1]
    by_activities = after_sandbox.split('Activities log:')
    activities_and_trades = by_activities[1].split('Trade History:')

    activities_csv = activities_and_trades[0]
    trades = json.loads(activities_and_trades[1])

    market_frame = pd.read_csv(io.StringIO(activities_csv), sep=";", header=0)
    trades_frame = pd.json_normalize(trades)
    return market_frame, trades_frame

def get_future_returns(df, col, its):
    """Append the relative change of ``col`` from now to ``its`` rows ahead.

    Adds the column ``"{col}_returns_in_{its}_its"`` holding
    ``(df[col].shift(-its) - df[col]) / df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column to compute returns for.
    its : int
        Number of rows to look ahead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # Keep the shifted values in a local Series rather than a scratch column:
    # writing and then dropping "{col}_future_{its}_its" on the caller's frame
    # would overwrite and delete a real column that happens to have that name.
    upcoming = df[col].shift(-its)
    df[f"{col}_returns_in_{its}_its"] = (upcoming - df[col]) / df[col]
    return df

# Look-ahead horizons (in rows) used for predictor and responder returns.
predictor_timeframes = [1]
responder_timeframes = [1]

# One record per (2024 round, 2023 file, predictor, responder) regression.
results = []

for i in tqdm(range(1, 5)):
    # The 2024 frame depends only on `i`, so load and pivot it once per round
    # instead of once per (i, j) pair.
    df_24, _ = _process_data_(f'2024_data_logs/round_{i}.log')
    df_24 = df_24.pivot(columns='product', values='mid_price', index='timestamp').reset_index()
    df_24.columns = [df_24.columns[0]] + [col + '_curr' for col in df_24.columns[1:]]

    for j in range(2, 6):
        df_23 = pd.read_csv(f"2023_data_logs/r{j}.csv", sep=';')
        df_23 = df_23.pivot(columns='product', values='mid_price', index='timestamp').reset_index()
        df_23.columns = [df_23.columns[0]] + [col + '_past' for col in df_23.columns[1:]]

        # Align the two years on their shared timestamps.
        df_test = df_24.merge(df_23, on='timestamp', how='inner')

        predictor_symbols = [col for col in df_test.columns if col.endswith('_past')]
        responder_symbols = [col for col in df_test.columns if col.endswith('_curr')]

        # Work on a copy: get_future_returns adds columns to the frame it is given.
        df_copy = df_test.copy()

        for responder_timeframe in responder_timeframes:
            for symbol in responder_symbols:
                df_copy = get_future_returns(df_copy, symbol, responder_timeframe)

            for predictor_timeframe in predictor_timeframes:
                for symbol in predictor_symbols:
                    df_copy = get_future_returns(df_copy, symbol, predictor_timeframe)

                for predictor_symbol in predictor_symbols:
                    # Name the feature column exactly rather than by prefix/suffix
                    # matching, so a symbol that is a prefix of another cannot
                    # pull in extra features.
                    feature_col = [f"{predictor_symbol}_returns_in_{predictor_timeframe}_its"]

                    for responder_symbol in responder_symbols:
                        target_col = f"{responder_symbol}_returns_in_{responder_timeframe}_its"

                        # Shifting leaves NaN at the end of each return column.
                        df_train = df_copy[feature_col + [target_col]].dropna()

                        X = df_train[feature_col]
                        y = df_train[target_col]

                        # Regression through the origin: target = coef * feature.
                        model = LinearRegression(fit_intercept=False)
                        model.fit(X, y)

                        y_pred = model.predict(X)

                        r2 = r2_score(y, y_pred)
                        _, p_value = stats.pearsonr(y, y_pred)

                        results.append({
                            '2024_day': i,
                            '2023_day': j,
                            'predictor_symbol': predictor_symbol,
                            'responder_symbol': responder_symbol,
                            'r_squared': r2,
                            'p_value': p_value,
                            'equation': f"{target_col} = {model.coef_[0]:.4f} * {feature_col[0]}"
                        })

                # Drop only the predictor return columns. Matching on the shared
                # "_returns_in_{n}_its" suffix would also delete the responder
                # targets whenever both horizons are equal, breaking the next
                # predictor horizon.
                future_cols_predictor = [
                    f"{symbol}_returns_in_{predictor_timeframe}_its" for symbol in predictor_symbols
                ]
                df_copy = df_copy.drop(columns=future_cols_predictor)

            future_cols_responder = [
                f"{symbol}_returns_in_{responder_timeframe}_its" for symbol in responder_symbols
            ]
            df_copy = df_copy.drop(columns=future_cols_responder)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('r_squared', ascending=False)

print(results_df)

100%|███████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.77s/it]

     2024_day  2023_day  predictor_symbol responder_symbol  r_squared  \
558         4         4       PEARLS_past   AMETHYSTS_curr   1.000000   
228         3         3       PEARLS_past   AMETHYSTS_curr   0.999646   
72          2         2       PEARLS_past   AMETHYSTS_curr   0.999059   
546         4         4  DIVING_GEAR_past       ROSES_curr   0.988045   
218         3         3  DIVING_GEAR_past       ROSES_curr   0.983762   
..        ...       ...               ...              ...        ...   
253         3         4      BANANAS_past       ROSES_curr  -0.000683   
358         3         5  DIVING_GEAR_past       ROSES_curr  -0.000683   
176         3         2     COCONUTS_past       ROSES_curr  -0.000684   
281         3         4  DIVING_GEAR_past       ROSES_curr  -0.000684   
183         3         2       PEARLS_past       ROSES_curr  -0.000685   

      p_value                                           equation  
558  0.000000  AMETHYSTS_curr_returns_in_1_its = 1.0000 




In [9]:
# results_df is sorted by r_squared (descending), so this shows the 20 best-fitting pairs.
results_df.head(20)

Unnamed: 0,2024_day,2023_day,predictor_symbol,responder_symbol,r_squared,p_value,equation
558,4,4,PEARLS_past,AMETHYSTS_curr,1.0,0.0,AMETHYSTS_curr_returns_in_1_its = 1.0000 * PEA...
228,3,3,PEARLS_past,AMETHYSTS_curr,0.999646,0.0,AMETHYSTS_curr_returns_in_1_its = 0.9997 * PEA...
72,2,2,PEARLS_past,AMETHYSTS_curr,0.999059,0.0,AMETHYSTS_curr_returns_in_1_its = 0.9996 * PEA...
546,4,4,DIVING_GEAR_past,ROSES_curr,0.988045,0.0,ROSES_curr_returns_in_1_its = 3.0775 * DIVING_...
218,3,3,DIVING_GEAR_past,ROSES_curr,0.983762,0.0,ROSES_curr_returns_in_1_its = 3.0400 * DIVING_...
407,4,2,COCONUTS_past,COCONUT_curr,0.766018,0.0,COCONUT_curr_returns_in_1_its = 0.8045 * COCON...
425,4,2,PINA_COLADAS_past,COCONUT_curr,0.471611,0.0,COCONUT_curr_returns_in_1_its = 0.4974 * PINA_...
408,4,2,COCONUTS_past,COCONUT_COUPON_curr,0.146582,0.0,COCONUT_COUPON_curr_returns_in_1_its = 6.8614 ...
544,4,4,DIVING_GEAR_past,GIFT_BASKET_curr,0.122258,1.350168e-285,GIFT_BASKET_curr_returns_in_1_its = 0.6514 * D...
216,3,3,DIVING_GEAR_past,GIFT_BASKET_curr,0.107882,3.12901e-250,GIFT_BASKET_curr_returns_in_1_its = 0.6122 * D...


In [13]:
import os
import json
import io
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from tqdm import tqdm

def _process_data_(file):
    """Split a round log into its market-data CSV and trade-history JSON.

    Returns a ``(market_data_df, trade_history_df)`` pair: the first frame is
    the semicolon-separated text found between ``Activities log:`` and
    ``Trade History:``, the second is the JSON after ``Trade History:``
    flattened with ``pd.json_normalize``. ``IndexError`` is raised when a
    marker is absent.
    """
    with open(file, 'r') as stream:
        body = stream.read().split('Sandbox logs:')[1]

    tail = body.split('Activities log:')[1]
    csv_text = tail.split('Trade History:')[0]
    trade_records = json.loads(tail.split('Trade History:')[1])

    return (
        pd.read_csv(io.StringIO(csv_text), sep=";", header=0),
        pd.json_normalize(trade_records),
    )


# Look-ahead horizons (in rows) used for predictor and responder returns.
predictor_timeframes = [1]
responder_timeframes = [1]

# One record per (2024 round, 2023 file, predictor, responder) regression.
results = []

# NOTE: get_future_returns is not defined in this cell; it comes from an
# earlier cell of the notebook.
for i in tqdm(range(1, 5)):
    # The 2024 frame depends only on `i`, so load and pivot it once per round
    # instead of once per (i, j) pair.
    df_24, _ = _process_data_(f'2024_data_logs/round_{i}.log')
    df_24 = df_24.pivot(columns='product', values='mid_price', index='timestamp').reset_index()
    df_24.columns = [df_24.columns[0]] + [col + '_curr' for col in df_24.columns[1:]]

    for j in range(2, 6):
        df_23 = pd.read_csv(f"2023_data_logs/r{j}.csv", sep=';')
        df_23 = df_23.pivot(columns='product', values='mid_price', index='timestamp').reset_index()
        df_23.columns = [df_23.columns[0]] + [col + '_past' for col in df_23.columns[1:]]

        # Align the two years on their shared timestamps.
        df_test = df_24.merge(df_23, on='timestamp', how='inner')

        predictor_symbols = [col for col in df_test.columns if col.endswith('_past')]
        responder_symbols = [col for col in df_test.columns if col.endswith('_curr')]

        # Work on a copy: get_future_returns adds columns to the frame it is given.
        df_copy = df_test.copy()

        for responder_timeframe in responder_timeframes:
            for symbol in responder_symbols:
                df_copy = get_future_returns(df_copy, symbol, responder_timeframe)

            for predictor_timeframe in predictor_timeframes:
                for symbol in predictor_symbols:
                    df_copy = get_future_returns(df_copy, symbol, predictor_timeframe)

                for predictor_symbol in predictor_symbols:
                    # Name the feature column exactly rather than by prefix/suffix
                    # matching, so a symbol that is a prefix of another cannot
                    # pull in extra features.
                    feature_col = [f"{predictor_symbol}_returns_in_{predictor_timeframe}_its"]

                    for responder_symbol in responder_symbols:
                        target_col = f"{responder_symbol}_returns_in_{responder_timeframe}_its"

                        # Shifting leaves NaN at the end of each return column.
                        df_train = df_copy[feature_col + [target_col]].dropna()

                        X = df_train[feature_col]
                        y = df_train[target_col]

                        # Regression through the origin: target = coef * feature.
                        model = LinearRegression(fit_intercept=False)
                        model.fit(X, y)

                        y_pred = model.predict(X)

                        r2 = r2_score(y, y_pred)
                        _, p_value = stats.pearsonr(y, y_pred)

                        results.append({
                            '2024_day': i,
                            '2023_day': j,
                            'predictor_symbol': predictor_symbol,
                            'responder_symbol': responder_symbol,
                            'r_squared': r2,
                            'p_value': p_value,
                            'equation': f"{target_col} = {model.coef_[0]:.4f} * {feature_col[0]}"
                        })

                # Drop only the predictor return columns. Matching on the shared
                # "_returns_in_{n}_its" suffix would also delete the responder
                # targets whenever both horizons are equal, breaking the next
                # predictor horizon.
                future_cols_predictor = [
                    f"{symbol}_returns_in_{predictor_timeframe}_its" for symbol in predictor_symbols
                ]
                df_copy = df_copy.drop(columns=future_cols_predictor)

            future_cols_responder = [
                f"{symbol}_returns_in_{responder_timeframe}_its" for symbol in responder_symbols
            ]
            df_copy = df_copy.drop(columns=future_cols_responder)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('r_squared', ascending=False)

print(results_df)

100%|███████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.63s/it]

     2024_day  2023_day  predictor_symbol responder_symbol  r_squared  \
558         4         4       PEARLS_past   AMETHYSTS_curr   1.000000   
228         3         3       PEARLS_past   AMETHYSTS_curr   0.999646   
72          2         2       PEARLS_past   AMETHYSTS_curr   0.999059   
546         4         4  DIVING_GEAR_past       ROSES_curr   0.988045   
218         3         3  DIVING_GEAR_past       ROSES_curr   0.983762   
..        ...       ...               ...              ...        ...   
253         3         4      BANANAS_past       ROSES_curr  -0.000683   
358         3         5  DIVING_GEAR_past       ROSES_curr  -0.000683   
176         3         2     COCONUTS_past       ROSES_curr  -0.000684   
281         3         4  DIVING_GEAR_past       ROSES_curr  -0.000684   
183         3         2       PEARLS_past       ROSES_curr  -0.000685   

      p_value                                           equation  
558  0.000000  AMETHYSTS_curr_returns_in_1_its = 1.0000 




In [14]:
import os
import json
import io
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from tqdm import tqdm

def _process_data_(file):
    """Load the market data and trade history stored in one round log.

    The file text is cut at ``Sandbox logs:``, ``Activities log:`` and
    ``Trade History:``. The activities section is read as ``;``-separated CSV
    and the trade-history section as JSON (flattened via
    ``pd.json_normalize``). Returns ``(market_data_df, trade_history_df)``;
    a missing marker raises ``IndexError``.
    """
    with open(file, 'r') as src:
        chunks = src.read().split('Sandbox logs:')[1].split('Activities log:')

    remainder = chunks[1]
    market_text = remainder.split('Trade History:')[0]
    history = json.loads(remainder.split('Trade History:')[1])

    buffer = io.StringIO(market_text)
    market_data_df = pd.read_csv(buffer, sep=";", header=0)
    return market_data_df, pd.json_normalize(history)

# One record per (2024 round, 2023 file, predictor, responder, fit_intercept)
# regression of 2024 mid-price levels on 2023 mid-price levels.
results = []

for i in tqdm(range(1, 5)):
    # The 2024 frame depends only on `i`, so load and pivot it once per round
    # instead of once per (i, j) pair.
    df_24, _ = _process_data_(f'2024_data_logs/round_{i}.log')
    df_24 = df_24.pivot(columns='product', values='mid_price', index='timestamp').reset_index()
    df_24.columns = [df_24.columns[0]] + [col + '_curr' for col in df_24.columns[1:]]

    for j in range(2, 6):
        df_23 = pd.read_csv(f"2023_data_logs/r{j}.csv", sep=';')
        df_23 = df_23.pivot(columns='product', values='mid_price', index='timestamp').reset_index()
        df_23.columns = [df_23.columns[0]] + [col + '_past' for col in df_23.columns[1:]]

        # Align the two years on their shared timestamps.
        df_test = df_24.merge(df_23, on='timestamp', how='inner')

        predictor_symbols = [col for col in df_test.columns if col.endswith('_past')]
        responder_symbols = [col for col in df_test.columns if col.endswith('_curr')]

        for predictor_symbol in predictor_symbols:
            for responder_symbol in responder_symbols:
                # LinearRegression.fit rejects NaN, so drop rows where either
                # price is missing (a product absent at some timestamp).
                pair = df_test[[predictor_symbol, responder_symbol]].dropna()
                X = pair[[predictor_symbol]]
                y = pair[responder_symbol]

                # Fit each pair both with and without an intercept term.
                for fit_intercept in [True, False]:
                    model = LinearRegression(fit_intercept=fit_intercept)
                    model.fit(X, y)

                    y_pred = model.predict(X)

                    r2 = r2_score(y, y_pred)
                    _, p_value = stats.pearsonr(y, y_pred)

                    if fit_intercept:
                        equation = f"{responder_symbol} = {model.intercept_:.4f} + {model.coef_[0]:.4f} * {predictor_symbol}"
                    else:
                        equation = f"{responder_symbol} = {model.coef_[0]:.4f} * {predictor_symbol}"

                    results.append({
                        '2024_day': i,
                        '2023_day': j,
                        'predictor_symbol': predictor_symbol,
                        'responder_symbol': responder_symbol,
                        'fit_intercept': fit_intercept,
                        'r_squared': r2,
                        'p_value': p_value,
                        'equation': equation
                    })

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('r_squared', ascending=False)

print(results_df)

100%|███████████████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.21s/it]

      2024_day  2023_day        predictor_symbol responder_symbol  \
1117         4         4             PEARLS_past   AMETHYSTS_curr   
1116         4         4             PEARLS_past   AMETHYSTS_curr   
144          2         2             PEARLS_past   AMETHYSTS_curr   
145          2         2             PEARLS_past   AMETHYSTS_curr   
456          3         3             PEARLS_past   AMETHYSTS_curr   
...        ...       ...                     ...              ...   
265          2         5           BAGUETTE_past   AMETHYSTS_curr   
33           1         3  DOLPHIN_SIGHTINGS_past   AMETHYSTS_curr   
937          4         3  DOLPHIN_SIGHTINGS_past   AMETHYSTS_curr   
443          3         3  DOLPHIN_SIGHTINGS_past   AMETHYSTS_curr   
181          2         3  DOLPHIN_SIGHTINGS_past   AMETHYSTS_curr   

      fit_intercept    r_squared   p_value  \
1117          False     1.000000  0.000000   
1116           True     1.000000  0.000000   
144            True     0.999064 




In [15]:
# results_df is sorted by r_squared (descending), so this shows the 20 best-fitting pairs.
results_df.head(20)

Unnamed: 0,2024_day,2023_day,predictor_symbol,responder_symbol,fit_intercept,r_squared,p_value,equation
1117,4,4,PEARLS_past,AMETHYSTS_curr,False,1.0,0.0,AMETHYSTS_curr = 1.0000 * PEARLS_past
1116,4,4,PEARLS_past,AMETHYSTS_curr,True,1.0,0.0,AMETHYSTS_curr = 0.0000 + 1.0000 * PEARLS_past
144,2,2,PEARLS_past,AMETHYSTS_curr,True,0.999064,0.0,AMETHYSTS_curr = 1.4034 + 0.9999 * PEARLS_past
145,2,2,PEARLS_past,AMETHYSTS_curr,False,0.999064,0.0,AMETHYSTS_curr = 1.0000 * PEARLS_past
456,3,3,PEARLS_past,AMETHYSTS_curr,True,0.998763,0.0,AMETHYSTS_curr = 11.7088 + 0.9988 * PEARLS_past
457,3,3,PEARLS_past,AMETHYSTS_curr,False,0.998762,0.0,AMETHYSTS_curr = 1.0000 * PEARLS_past
1092,4,4,DIVING_GEAR_past,ROSES_curr,True,0.99403,0.0,ROSES_curr = -26502.2602 + 0.4009 * DIVING_GEA...
814,4,2,COCONUTS_past,COCONUT_curr,True,0.986476,0.0,COCONUT_curr = -45.8210 + 1.2526 * COCONUTS_past
815,4,2,COCONUTS_past,COCONUT_curr,False,0.986455,0.0,COCONUT_curr = 1.2468 * COCONUTS_past
452,3,3,DOLPHIN_SIGHTINGS_past,STARFRUIT_curr,True,0.862765,0.0,STARFRUIT_curr = 7433.9116 + -0.7928 * DOLPHIN...


In [21]:
# NOTE(review): no visible cell assigns a variable named `df`, and the recorded
# output of this cell is a NameError. Presumably `df_test` or `results_df` was
# intended — confirm, then fix or delete this cell.
df

NameError: name 'df' is not defined