In [None]:
import pickle
import pandas as pd
import gc
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def r2_metric(y_true, y_pred, weights=None):
    """Compute the weighted, zero-mean R2 score.

    The score is ``1 - sum(w * (y - y_hat)**2) / sum(w * y**2)``. The
    denominator is the weighted raw second moment of ``y_true``; no mean is
    subtracted.

    Args:
        y_true: Array-like of target values (ndarray, pandas Series, list, ...).
        y_pred: Array-like of predictions with the same number of elements.
        weights: Optional array-like of per-sample weights. Uniform weights
            are used when None.

    Returns:
        The tuple ``('r2', score, True)``.
        NOTE(review): this presumably follows a (name, value,
        higher_is_better) eval-metric convention -- confirm with callers.
        If the weighted sum of squared targets is zero, the division
        produces a non-finite value rather than raising.
    """
    # np.asarray accepts any array-like input and avoids calling .ravel()
    # directly on a pandas Series, which is deprecated as of pandas 2.2.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # If weights is None, use uniform weights
    if weights is None:
        weights = np.ones_like(y_true)
    else:
        weights = np.asarray(weights).ravel()

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * (y_true ** 2))
    r2_score = 1 - (numerator / denominator)
    return 'r2', r2_score, True

### 기본 모델 
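- Partition6 | 1_000_000~5_000_000 학습
- 5_000_000 ~ 6_000_000 Inference
- (확인 필요) 아래 코드 셀은 `TRAIN_START=1000000`, `TRAIN_END=2000000`, `TEST_START=2000000`, `TEST_END=3000000`으로 설정되어 있어 위 구간과 일치하지 않음.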

전체 feature 사용함. 

In [None]:
# Locations of the pickled baseline ("og") model and the partition file.
og_model_path = "experiment_models/og_model_0.pkl"
file_path = "jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet"

# Positional row bounds inside the partition file. Only the TEST_* bounds are
# used in this cell; the TRAIN_* bounds are defined but not referenced here.
TRAIN_START, TRAIN_END = 1000000, 2000000
TEST_START, TEST_END = 2000000, 3000000

# NOTE: unpickling can execute arbitrary code; only load trusted model files.
with open(og_model_path, 'rb') as f:
    og_model = pickle.load(f)

og_df = pd.read_parquet(file_path)
og_target_col = "responder_6"
# Every column whose name contains "feature" is used as a model input.
og_feature_col = og_df.columns[og_df.columns.str.contains("feature")]

# Slice the evaluation window once and derive X / y / weights from it.
_og_test_rows = og_df.iloc[TEST_START:TEST_END]
og_X = _og_test_rows[og_feature_col]
og_y = _og_test_rows[og_target_col]
og_weights = _og_test_rows['weight']

In [None]:
# Run the unpickled baseline model on the test-window feature frame.
og_pred = og_model.predict(og_X)

In [None]:
# Weighted R2 of the baseline predictions over the whole test window.
r2_metric(y_true=og_y, y_pred=og_pred, weights=og_weights)

#### ('r2', np.float64(0.05170168967069455), True)


### Selected Model 
- Partition6 | 1_000_000~2_000_000 학습
- 2_000_000 ~ 3_000_000 Inference

1. 모든 partition에서 모든 feature간 corr 계산
2. 모든 feature에서 corr이 0.4가 넘은 feature filtering 
3. 서로 관계가 있는 feature끼리 Clustering
4. Cluster의 가장 숫자가 낮은 feature를 대표 feature로 선정
5. Cluster에 속한 나머지 feature drop

- feature개수 31개 감소, 대표 feature 14개
- clustering 방법: 서로 연관된 feature끼리 전부 묶음 (a - b & b - c -> a - b - c)

In [None]:
# Pickled model trained on the reduced feature set, and the chunk to score.
selected_model_path = "experiment_models/selected_feature_model_1.pkl"
TEST_FILE_PATH = "eda_chunks/final_rolling_0002.parquet"

# NOTE: unpickling can execute arbitrary code; only load trusted model files.
with open(selected_model_path, 'rb') as f:
    selected_model = pickle.load(f)

selected_df = pd.read_parquet(TEST_FILE_PATH)
selected_target_col = "responder_6"

# Keep columns whose name contains "feature" but not "rolling".
selected_feature_col = selected_df.columns[
    selected_df.columns.str.contains("feature")
    & ~selected_df.columns.str.contains("rolling")
]

selected_X = selected_df[selected_feature_col]
selected_y = selected_df[selected_target_col]
selected_weights = selected_df['weight']

In [None]:
# Predict on every row of the loaded chunk, then report the weighted R2.
selected_pred = selected_model.predict(selected_X)
r2_metric(
    y_true=selected_y,
    y_pred=selected_pred,
    weights=selected_weights,
)

#### ('r2', np.float64(-0.015881716970791437), True)

### Selected Rolling Model 
- Partition6 | 1_000_000~2_000_000 학습
- 2_000_000 ~ 3_000_000 Inference

- 각 symbol 별로 Moving Average를 진행함. 
- Feature는 대표 feature만 추가해서 진행 

In [None]:
import pyarrow.parquet as pq


selected_rolling_model_path = "experiment_models/selected_feature_rolling_model_1.pkl"
TRAIN_START_PARTITION = 1 # 1m ~ 2m
TRAIN_END_PARTITION = 4 # 4m ~ 5m -> 1m~5m

# Chunk that is actually evaluated in this cell. This constant previously
# named chunk 0001 and was never used, while the data was read from a
# hard-coded chunk 0002 path; it now names the file that is loaded.
TEST_FILE_PATH = "eda_chunks/final_rolling_0002.parquet"

# NOTE: unpickling can execute arbitrary code; only load trusted model files.
with open(selected_rolling_model_path, 'rb') as f:
    selected_rolling_model = pickle.load(f)

# Column names are taken from the schema of chunk 0001 (no row data is read),
# so the feature list and its order do not depend on the test chunk.
columns = pq.read_schema("eda_chunks/final_rolling_0001.parquet").names
columns = pd.Index(columns)  # convert to a pandas Index so .str.contains is available

# Raw features: name contains "feature" but not "rolling".
not_rolling_feature_col = columns[
        (columns.str.contains("feature")) &
        (~columns.str.contains("rolling"))
    ].to_list()

# Rolling features, minus the excluded windows.
# NOTE(review): these are substring matches, so "rolling_2" also drops any
# column containing e.g. "rolling_20" -- confirm this is intended.
rolling_feature_col = columns[
        (columns.str.contains("rolling")) &
        (~columns.str.contains("rolling_2")) &
        (~columns.str.contains("rolling_30")) &
        (~columns.str.contains("rolling_1000"))
    ].to_list()

# Load only the model inputs plus the target and sample-weight columns.
selected_columns = not_rolling_feature_col + rolling_feature_col + ['responder_6'] + ['weight']
selected_rolling_df = pd.read_parquet(TEST_FILE_PATH, columns=selected_columns)


selected_rolling_target_col = "responder_6"
selected_rolling_weights = selected_rolling_df['weight']
selected_rolling_X = selected_rolling_df[not_rolling_feature_col + rolling_feature_col]
selected_rolling_y = selected_rolling_df[selected_rolling_target_col]

In [None]:
# Only the first 200000 rows of the chunk are predicted and scored here,
# whereas the earlier cells score their entire test frame.
_eval_rows = slice(None, 200000)
selected_rolling_pred = selected_rolling_model.predict(selected_rolling_X[_eval_rows])
r2_metric(
    selected_rolling_y[_eval_rows],
    selected_rolling_pred[_eval_rows],
    selected_rolling_weights[_eval_rows],
)

#### ('r2', np.float64(0.08337754568080169), True)