In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from load_data import load_data_from_csv
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import BasicFeaturesPreprocessor, DupletsTripletsPreprocessor, MovingAvgPreProcessor, RemoveIrrelevantFeaturesDataPreprocessor, DropTargetNADataPreprocessor, DTWKMeansPreprocessor
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor
from data_preprocessor.stockid_features import StockIdFeaturesPreProcessor
from data_preprocessor.deep_feature_synthesis import DfsPreProcessor
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator

from model_pipeline.lgb_pipeline import LGBModelPipelineFactory

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline
from train_pipeline.train_optuna_pipeline import DefaultOptunaTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator

import optuna.integration.lightgbm as lgb
import optuna

import numpy as np

import sys

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x145b7771b920>
Traceback (most recent call last):
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^

In [3]:
import pandas as pd

In [4]:
# Experiment configuration. N_fold and model_save_dir are not referenced by
# any other cell visible in this notebook.
N_fold = 5
model_save_dir = './models/'

# Preprocessing steps wrapped by CompositeDataPreprocessor below.
# Commented-out entries are steps that are switched off for this run.
processors = [    
    ReduceMemUsageDataPreprocessor(verbose=True),
    # BasicFeaturesPreprocessor(),
    # DupletsTripletsPreprocessor(),
    # MovingAvgPreProcessor("wap"),   
    # StockIdFeaturesPreProcessor(),   
    # DTWKMeansPreprocessor(),    
    DropTargetNADataPreprocessor(),    
    # NOTE(review): this removes stock_id and time_id, which later cells in
    # this notebook reference (df_["stock_id"], time_index="time_id") — confirm.
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id']),
    # DfsPreProcessor(),
    # FillNaPreProcessor(),
    # PolynomialFeaturesPreProcessor(),
]


# Single preprocessor object used later via processor.apply(df_train).
processor = CompositeDataPreprocessor(processors)



In [5]:
# Root directory handed to load_data_from_csv; the Kaggle path is kept
# commented out for reference.
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

# NOTE(review): raw_data is a second name for the same object as df_train,
# not a copy — confirm the preprocessors do not modify their input in place
# if an untouched version of the data is needed later.
raw_data = df_train
# Uncomment to work on only the first 100000 rows.
# df_train = df_train[:100000]


Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [6]:
# df_train = raw_data

In [7]:
# Run the composite preprocessor and rebind df_train to its result.
# Re-running this cell feeds the already-processed frame back in; restore
# df_train from the load cell first if a clean run is needed.
# df_train = ReduceMemUsageDataPreprocessor(verbose=True).apply(df_train)
df_train = processor.apply(df_train)

CompositeDataPreprocessor - original df shape: (5237980, 17)
Processing ReduceMemUsageDataPreprocessor...
Memory usage of dataframe is 679.36 MB
Memory usage after optimization is: 304.72 MB
Decreased by 55.15%
dtypes:
stock_id                     int16
date_id                      int16
seconds_in_bucket            int16
imbalance_size             float32
imbalance_buy_sell_flag       int8
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap                        float32
target                     float32
time_id                      int16
row_id                      object
dtype: object
ReduceMemUsageDataPreprocessor took 0.38s. New df shape: (5237980, 17).
Processing DropTargetNADataPreprocessor...
DropTargetNADataPreprocessor took 0.44s. New df shape: (5

In [8]:
import featuretools as ft

In [9]:
# Work on a copy of the processed training frame so the EntitySet is not
# built on the df_train object itself.
df_ = df_train.copy()

# Start an empty EntitySet; dataframes are registered in later cells.
es = ft.EntitySet(id = 'closing_movements_data')
# Earlier entity_from_dataframe call, kept commented out.
# es = es.entity_from_dataframe(entity_id = 'df', dataframe = df_, index = 'row_id')


In [10]:
from woodwork.logical_types import Categorical

In [11]:
# Register df_ in the EntitySet as "closing_movements", using row_id as the
# index. No time index is set (the time_id option is commented out), and
# imbalance_buy_sell_flag is explicitly typed as Categorical.
es = es.add_dataframe(
    dataframe_name="closing_movements",
    dataframe=df_,
    index="row_id",
    # time_index="time_id",
    logical_types={
        "imbalance_buy_sell_flag": Categorical,
        # "zip_code": PostalCode,
    },
)


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


In [12]:
# Display the EntitySet summary (registered dataframes and relationships).
es

Entityset: closing_movements_data
  DataFrames:
    closing_movements [Rows: 5237892, Columns: 14]
  Relationships:
    No relationships

In [13]:
# Display the Woodwork schema (logical types and semantic tags) of the
# closing_movements dataframe.
es["closing_movements"].ww.schema

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
seconds_in_bucket,Integer,['numeric']
imbalance_size,Double,['numeric']
imbalance_buy_sell_flag,Categorical,['category']
reference_price,Double,['numeric']
matched_size,Double,['numeric']
far_price,Double,['numeric']
near_price,Double,['numeric']
bid_price,Double,['numeric']
bid_size,Double,['numeric']
ask_price,Double,['numeric']


In [None]:
# Run deep feature synthesis on the closing_movements dataframe using only
# the add_numeric transform primitive ('multiply_numeric' is switched off).
# Binds the resulting feature matrix and feature definitions, then previews
# the first rows.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="closing_movements",
    trans_primitives=["add_numeric"],  # 'multiply_numeric',
    verbose=True,
    chunk_size=0.05,
)

feature_matrix.head()

Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created


Built 79 features
Elapsed: 00:19 | Progress:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       

In [None]:
# One row per distinct stock_id found in df_. The "dummy" column repeats the
# same ids (presumably so the frame has a non-index column — confirm).
# NOTE(review): the schema displayed earlier for closing_movements lists no
# stock_id column, so df_["stock_id"] may not exist after preprocessing.
stocks_df = pd.DataFrame({"stock_id": pd.unique(df_["stock_id"])})
stocks_df["dummy"] = stocks_df["stock_id"]
stocks_df

In [None]:
# Register stocks_df as a second dataframe named "stocks", indexed by
# stock_id, then display the updated EntitySet.
es = es.add_dataframe(
    dataframe_name="stocks", dataframe=stocks_df, index="stock_id"
)

es

In [None]:
# Link stocks.stock_id to closing_movements.stock_id and display the result.
# NOTE(review): the closing_movements schema shown earlier has no stock_id
# column (it is removed during preprocessing) — confirm this cell can run.
es = es.add_relationship("stocks", "stock_id", "closing_movements", "stock_id")
es

In [None]:
# Display the closing_movements schema again after the relationship was added.
es["closing_movements"].ww.schema

In [None]:
# Primitive names passed to ft.dfs in the cells below.
# The natural-language primitives are spelled "num_words" and
# "num_characters" in the featuretools 1.x API used by this notebook; the
# older spellings "numwords" / "characters" are rejected as unknown primitives.
default_agg_primitives = ["sum", "std", "max", "skew", "min", "mean"]
default_trans_primitives = [
    "day",
    "year",
    "month",
    "weekday",
    "haversine",
    "num_words",
    "num_characters",
]

In [None]:
# Deep feature synthesis targeting the "stocks" dataframe with the aggregation
# primitives listed in default_agg_primitives. Rebinds feature_matrix and
# feature_defs, replacing the results of the earlier closing_movements run.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="stocks",
    # trans_primitives=default_trans_primitives,
    agg_primitives=default_agg_primitives,
    max_depth=2,
)
feature_matrix

In [None]:
# List the generated feature names.
feature_matrix.columns

In [None]:
# Preview the result of removing highly-null features. The return value is
# only displayed; it is not bound to a name.
ft.selection.remove_highly_null_features(feature_matrix)

In [None]:
from featuretools.selection import (
    remove_highly_correlated_features,
    remove_highly_null_features,
    remove_single_value_features,
)

In [None]:
# Run remove_single_value_features on the feature matrix, keeping the feature
# definitions in step with it, then display the reduced matrix.
new_fm, new_features = remove_single_value_features(feature_matrix, features=feature_defs)
new_fm

In [None]:
# Second selection pass: run remove_highly_correlated_features on the output
# of the previous step and preview the first rows.
new_fm2, new_features2 = remove_highly_correlated_features(new_fm, features=new_features)
new_fm2.head()

In [None]:
# Remove the placeholder "dummy" column before the matrix is merged back onto
# the row-level data. The original statement was left unterminated (a syntax
# error). errors="ignore" keeps the cell re-runnable once the column is gone.
new_fm2 = new_fm2.drop(columns=["dummy"], errors="ignore")
new_fm2.head()

In [None]:
# Display the row-level working frame.
df_

In [None]:
# Left-join the per-stock features onto the row-level frame by stock_id.
# The merged frame is only displayed; it is not bound to a name.
# NOTE(review): requires a stock_id column in df_, which the displayed
# closing_movements schema does not list — confirm.
df_.merge(new_fm2, left_on = "stock_id", right_on = "stock_id", how = "left")

In [None]:
# Second DFS run on the "stocks" dataframe, this time with the mean, sum and
# mode aggregation primitives. Results go to separate names so the earlier
# feature_matrix / feature_defs are kept.
feature_matrix2, feature_defs2 = ft.dfs(
    entityset=es,
    target_dataframe_name="stocks",
    agg_primitives=["mean", "sum", "mode"],
    # trans_primitives=["month", "hour"],
    max_depth=2,
)
feature_matrix2

In [None]:
# feature_matrix, feature_defs = ft.dfs(entityset=es, 
#                                     target_dataframe_name="stocks")
# feature_matrix.columns

In [None]:
import numpy as np
import pandas as pd
import featuretools as ft
from data_preprocessor.data_preprocessor import DataPreprocessor

class DfsPreProcessor(DataPreprocessor):
    """Prototype preprocessor that reports which features DFS would build.

    The input frame is registered as a single-dataframe EntitySet and
    ``ft.dfs`` is run with ``features_only=True``, so only the feature
    definitions are produced and printed; no feature matrix is computed.
    """

    def apply(self, df):
        """Print the DFS feature definitions for ``df`` and return a copy of it.

        Args:
            df: DataFrame with a ``row_id`` column, used as the EntitySet
                index. If a ``time_id`` column is present it is used as the
                time index; otherwise no time index is set.

        Returns:
            A copy of ``df`` as registered with the EntitySet. No generated
            features are added to it.
        """
        # Copy first so the caller's frame is a different object from the one
        # handed to the EntitySet.
        df_ = df.copy()

        dataframe_name = "closing_movements"

        # time_id may already have been removed by an earlier preprocessing
        # step (this notebook's pipeline drops it), so only use it as the time
        # index when the column is actually there.
        time_index = "time_id" if "time_id" in df_.columns else None

        es = ft.EntitySet(id='train_df')
        es = es.add_dataframe(
            dataframe_name=dataframe_name,
            dataframe=df_,
            index="row_id",
            time_index=time_index,
        )

        print(es[dataframe_name].ww.schema)

        default_agg_primitives = [
            "sum", "std", "max", "skew", "min", "mean",
            "count", "percent_true", "num_unique", "mode",
        ]
        # "num_words" / "num_characters" are the primitive names in the
        # featuretools 1.x API used here; "numwords" / "characters" are
        # rejected as unknown primitives.
        default_trans_primitives = [
            "day", "year", "month", "weekday", "haversine",
            "num_words", "num_characters",
        ]

        # target_dataframe_name is required: without it ft.dfs cannot resolve
        # which dataframe to build features for and raises.
        feature_names = ft.dfs(
            entityset=es,
            target_dataframe_name=dataframe_name,
            trans_primitives=default_trans_primitives,
            agg_primitives=default_agg_primitives,
            max_depth=2,
            features_only=True,
        )

        print(feature_names)

        return df_

In [None]:
# Run the DfsPreProcessor class defined in this notebook (it shadows the one
# imported from data_preprocessor.deep_feature_synthesis) and rebind df_train
# to what it returns.
df_train = DfsPreProcessor().apply(df_train)