In [3]:
import pandas as pd
from lightgbm import LGBMRegressor, LGBMClassifier
import pyarrow.parquet as pq

In [4]:
import pickle
import os
from datetime import datetime

def save_model(model, model_name, partition_num=0, save_dir="models", use_pickle=True):
    """
    학습된 모델을 저장하는 함수
    
    Parameters:
    -----------
    model : LGBMRegressor
        저장할 모델
    model_name : str
        모델 이름
    save_dir : str
        저장할 디렉토리 경로
    use_pickle : bool
        True면 pickle 형식으로, False면 LightGBM 형식으로 저장
    
    Returns:
    --------
    str : 저장된 모델의 전체 경로
    """
    # 저장 디렉토리가 없으면 생성
    os.makedirs(save_dir, exist_ok=True)
        
    if use_pickle:
        # Pickle 형식으로 저장
        file_path = os.path.join(save_dir, f"{model_name}_{partition_num}.pkl")
        with open(file_path, 'wb') as f:
            pickle.dump(model, f)
    else:
        # LightGBM 형식으로 저장
        file_path = os.path.join(save_dir, f"{model_name}_{partition_num}.txt")
        model.booster_.save_model(file_path)
    
    print(f"Model saved to: {file_path}")
    return file_path

In [5]:
import gc
import lightgbm as lgb

def og_train(start_idx, end_idx, data_path, verbose=0, model=None, first_iteration=True):
    df = pd.read_parquet(data_path)
    df = df[start_idx:end_idx]
    feature_col = df.columns[df.columns.str.contains("feature")]
    X = df[feature_col]
    y = df['responder_6']
    
    if first_iteration:
        lgbm_model = LGBMRegressor(random_state=42, verbose=verbose)
        lgbm_model.fit(X, y)
    else:
        booster = model.booster_
        train_data = lgb.Dataset(X, y)
        booster = lgb.train(
            params=model.get_params(),
            train_set=train_data,
            num_boost_round=model.get_params().get('n_estimators', 100),
            init_model=booster
        )
        lgbm_model = LGBMRegressor(**model.get_params())
        lgbm_model._Booster = booster
        lgbm_model._objective = booster.params['objective']
        lgbm_model._n_features = booster.num_feature()
        lgbm_model._n_classes = 1
        lgbm_model._classes = None
        lgbm_model._n_features_in = booster.num_feature()
        lgbm_model._features_in = None

    return lgbm_model

def selected_feature_train(partition, chunk_size=10000, verbose=0, save_model_path=None, model_path=None, first_iteration=True, **model_params):
    # 기본 파라미터 설정
    default_params = {
        'random_state': 42,
        'verbose': verbose,
        'objective': 'regression',
        'metric': 'rmse'
    }
    
    if first_iteration:
        default_params.update(model_params)
        lgbm_model = LGBMRegressor(**default_params)
        booster = None
    else:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        print(f"Model loaded from {model_path}")
        
        lgbm_model = model
        booster = model._Booster 
        default_params = model.get_params()
    
    # 데이터 로드
    df = pd.read_parquet(f'eda_chunks/final_rolling_{str(partition).zfill(4)}.parquet')
    not_rolling_feature_col = df.columns[(df.columns.str.contains("feature")) & (~df.columns.str.contains("rolling"))].to_list()
    X = df[not_rolling_feature_col]
    y = df['responder_6']
    
    train_params = default_params.copy()
    if 'importance_type' in train_params:
        train_params.pop('importance_type')
    
    # 청크 단위로 학습
    total_rows = len(df)
    for chunk_start in range(0, total_rows, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        X_chunk = X.iloc[chunk_start:chunk_end]
        y_chunk = y.iloc[chunk_start:chunk_end]
        
        # 학습 데이터셋 생성
        train_data = lgb.Dataset(X_chunk, y_chunk)
        
        if first_iteration:
            # 첫 번째 학습
            lgbm_model.fit(X_chunk, y_chunk)
            booster = lgbm_model._Booster 
            first_iteration = False
        # else:
        #     # 증분 학습
        #     booster = lgb.train(
        #         params=train_params,
        #         train_set=train_data,
        #         num_boost_round=default_params.get('n_estimators', 100),
        #         init_model=booster
        #     )
        #     # 새로운 모델 생성 및 booster 적용
        #     lgbm_model = LGBMRegressor(**default_params)
        #     lgbm_model._Booster = booster
        #     lgbm_model._objective = booster.params['objective']
        #     lgbm_model._n_features = booster.num_feature()
        #     lgbm_model._n_classes = 1
        #     lgbm_model._classes = None
        #     lgbm_model._n_features_in = booster.num_feature()
        #     lgbm_model._features_in = None
    
        # del X_chunk, y_chunk
        # gc.collect()
            
    
    # 메모리 정리
    del df, X, y
    gc.collect()
    
    return lgbm_model

def selected_feature_rolling_train(partition, chunk_size=10000, verbose=0, save_model_path=None, model_path=None, first_iteration=True, **model_params):
    """
    연속적인 학습을 수행하는 함수
    """
    # 기본 파라미터 설정
    default_params = {
        'random_state': 42,
        'n_estimators': 100,
        'verbose': verbose
    }
    
    if first_iteration:
        default_params.update(model_params)
        lgbm_model = LGBMRegressor(**default_params)
        booster = None
    else:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        print(f"Model loaded from {model_path}")        

        lgbm_model = model

        default_params = model.get_params()
        
    # 컬럼명만 먼저 가져옵니다
    columns = pq.read_schema(f'eda_chunks/final_rolling_{str(partition).zfill(4)}.parquet').names
    columns = pd.Index(columns)  # pandas Index로 변환하여 str.contains 사용 가능하게 함

    # 필요한 컬럼을 선택합니다
    not_rolling_feature_col = columns[
        (columns.str.contains("feature")) & 
        (~columns.str.contains("rolling"))
    ].to_list()

    rolling_feature_col = columns[
        (columns.str.contains("rolling")) & 
        (~columns.str.contains("rolling_2")) & 
        (~columns.str.contains("rolling_30")) & 
        (~columns.str.contains("rolling_1000"))
    ].to_list()

    # 필요한 컬럼만 읽습니다
    selected_columns = not_rolling_feature_col + rolling_feature_col + ['responder_6']
    df = pd.read_parquet(f'eda_chunks/final_rolling_{str(partition).zfill(4)}.parquet', columns=selected_columns)

    # X와 y 분리
    X = df[not_rolling_feature_col + rolling_feature_col]
    y = df['responder_6']
    
    del df
    gc.collect()
    
    total_rows = len(X)
    
    train_params = default_params.copy()
    if 'importance_type' in train_params:
        train_params.pop('importance_type')
    
    # 청크 단위로 학습
    for chunk_start in range(0, total_rows, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        X_chunk = X.iloc[chunk_start:chunk_end]
        y_chunk = y.iloc[chunk_start:chunk_end]
        
        # 학습 데이터셋 생성
        train_data = lgb.Dataset(X_chunk, y_chunk)
        
        if first_iteration:
            # 첫 번째 학습
            lgbm_model.fit(X_chunk, y_chunk)
           
        # else:
        #     # 증분 학습
        #     booster = lgb.train(
        #         params=train_params,
        #         train_set=train_data,
        #         num_boost_round=default_params.get('n_estimators', 100),
        #         init_model=booster
        #     )
        #     # 새로운 모델 생성 및 booster 적용
        #     lgbm_model = LGBMRegressor(**default_params)
        #     lgbm_model._Booster = booster
        #     lgbm_model._objective = booster.params['objective']
        #     lgbm_model._n_features = booster.num_feature()
        #     lgbm_model._n_classes = 1
        #     lgbm_model._classes = None
        #     lgbm_model._n_features_in = booster.num_feature()
        #     lgbm_model._features_in = None
    
        # del X_chunk, y_chunk
        # gc.collect()
            
    # 모델 저장
    if save_model_path:
        lgbm_model.booster_.save_model(save_model_path)
        print(f"Saved model to {save_model_path}")
    
    # 메모리 정리
    del X, y
    gc.collect()
    
    return lgbm_model

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# print("---------------og_train------------------")
# og_model = og_train(1_000_000, 5_000_000, "jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet", verbose=1)
# save_model(og_model, "og_model", save_dir="experiment_models", use_pickle=True)
# del og_model
# gc.collect()

chunk_size = 500000
partition = 1
# print("---------------selected_feature_train------------------")
# selected_feature_model = selected_feature_train(partition=partition,
#                                                 model_path=None,
#                                                 first_iteration=True,
#                                                 chunk_size=chunk_size)
# save_model(selected_feature_model, "selected_feature_model", partition_num=partition, save_dir="experiment_models", use_pickle=True)
# del selected_feature_model
# gc.collect()

print("---------------selected_feature_rolling_train------------------")
selected_feature_rolling_model = selected_feature_rolling_train(partition=partition,
                                                                model_path=None,
                                                                first_iteration=True,
                                                                chunk_size=chunk_size)
save_model(selected_feature_rolling_model, "selected_feature_rolling_model", partition_num=partition, save_dir="experiment_models", use_pickle=True)

---------------selected_feature_rolling_train------------------


In [20]:
save_model(selected_feature_rolling_model, "selected_feature_rolling_model", partition_num=partition, save_dir="experiment_models", use_pickle=True)
selected_feature_rolling_model

Model saved to: experiment_models/selected_feature_rolling_model_1.pkl


In [6]:
selected_rolling_model_path = "experiment_models/selected_feature_rolling_model_1.pkl"
with open(selected_rolling_model_path, 'rb') as f:
    model = pickle.load(f)
    
    
model

In [9]:
model.predict

<bound method LGBMModel.predict of LGBMRegressor(random_state=42, verbose=0)>

In [None]:
selected_model_path = "experiment_models/selected_feature_model_1.pkl"

with open(selected_model_path, 'rb') as f:
    selected_model = pickle.load(f)
  
selected_model
# # 1. 모델의 속성 확인
# print(selected_model.__dict__)

# # 2. is_fitted_ 속성 확인
# try:
#     print(selected_model.is_fitted_)
# except:
#     print("is_fitted_ 속성이 없음")

# # 3. 저장 전 모델 상태 출력
# print("모델 타입:", type(selected_model))
# print("모델 파라미터:", selected_model.get_params())

<lightgbm.basic.Booster at 0x7fbd90737190>

In [9]:
columns = pq.read_schema(f'eda_chunks/final_rolling_{str(2).zfill(4)}.parquet').names
columns = pd.Index(columns)  # pandas Index로 변환하여 str.contains 사용 가능하게 함

# 필요한 컬럼을 선택합니다
not_rolling_feature_col = columns[
        (columns.str.contains("feature")) & 
        (~columns.str.contains("rolling"))
    ].to_list()

rolling_feature_col = columns[
        (columns.str.contains("rolling")) & 
        (~columns.str.contains("rolling_2")) & 
        (~columns.str.contains("rolling_30")) & 
        (~columns.str.contains("rolling_1000"))
    ].to_list()

    # 필요한 컬럼만 읽습니다
selected_columns = not_rolling_feature_col + rolling_feature_col + ['responder_6'] + ['weight']
df = pd.read_parquet(f'eda_chunks/final_rolling_{str(2).zfill(4)}.parquet', columns=selected_columns)

    # X와 y 분리
X = df[:1000000][not_rolling_feature_col + rolling_feature_col]
y = df[:1000000]['responder_6']

In [12]:
import numpy as np
def r2_metric(y_true, y_pred, weights=None):
    """Calculate weighted R2 score"""
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()

    # If weights is None, use uniform weights
    if weights is None:
        weights = np.ones_like(y_true)
    else:
        weights = weights.ravel()

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * (y_true ** 2))
    r2_score = 1 - (numerator / denominator)
    return 'r2', r2_score, True

r2_metric(y[:1000000], model.predict(X),df[:1000000]['weight'])

  y_true = y_true.ravel()
  weights = weights.ravel()


('r2', np.float64(0.08337754568080169), True)