In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from load_data import load_data_from_csv
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import (
    BasicFeaturesPreprocessor,
    DupletsTripletsPreprocessor,
    MovingAvgPreProcessor,
    RemoveIrrelevantFeaturesDataPreprocessor,
    DropTargetNADataPreprocessor,
    FarNearPriceFillNaPreprocessor,
    MovingAvgFillNaPreprocessor,
    RemoveRecordsByStockDateIdPreprocessor,
)
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor
from data_preprocessor.stockid_features import StockIdFeaturesPreProcessor
from data_preprocessor.deep_feature_synthesis import DfsPreProcessor
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator

from model_pipeline.lgb_pipeline import LGBModelPipelineFactory

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline
from train_pipeline.train_optuna_pipeline import DefaultOptunaTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator

import optuna.integration.lightgbm as lgb
import optuna

import numpy as np

import sys
import pandas as pd

In [3]:
from tsfresh import extract_features

In [4]:
N_fold = 5
model_save_dir = './models/'

processors = [
    RemoveRecordsByStockDateIdPreprocessor([
        {"stock_id": 19, "date_id": 438},
        {"stock_id": 101, "date_id": 328},
        {"stock_id": 131, "date_id": 35},
        {"stock_id": 158, "date_id": 388},
    ]),
    FarNearPriceFillNaPreprocessor(),
    ReduceMemUsageDataPreprocessor(verbose=True),
    # BasicFeaturesPreprocessor(),    
    # DupletsTripletsPreprocessor(),
    # MovingAvgPreProcessor("wap"),
    # MovingAvgFillNaPreprocessor("wap", 1.0),
    # StockIdFeaturesPreProcessor(),  
    # DropTargetNADataPreprocessor(),    
    # RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    # FillNaPreProcessor(),
    # PolynomialFeaturesPreProcessor(),
]

# processors = [    
#     ReduceMemUsageDataPreprocessor(verbose=True),
#     # BasicFeaturesPreprocessor(),
#     # DupletsTripletsPreprocessor(),
#     # MovingAvgPreProcessor("wap"),   
#     # StockIdFeaturesPreProcessor(),   
#     # DTWKMeansPreprocessor(),    
#     DropTargetNADataPreprocessor(),    
#     # RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
#     # DfsPreProcessor(),
#     # FillNaPreProcessor(),
#     # PolynomialFeaturesPreProcessor(),
# ]


processor = CompositeDataPreprocessor(processors)



In [5]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

raw_data = df_train
# df_train = df_train[:100000]


Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [6]:
# df_train = raw_data

In [7]:
# df_train = ReduceMemUsageDataPreprocessor(verbose=True).apply(df_train)
df_train = processor.apply(df_train)

CompositeDataPreprocessor - original df shape: (5237980, 17)
Processing RemoveRecordsByStockDateIdPreprocessor...
RemoveRecordsByStockDateIdPreprocessor - removing 220 records
RemoveRecordsByStockDateIdPreprocessor took 0.77s. New df shape: (5237760, 17).
Processing FarNearPriceFillNaPreprocessor...
FarNearPriceFillNaPreprocessor took 0.14s. New df shape: (5237760, 17).
Processing ReduceMemUsageDataPreprocessor...
Memory usage of dataframe is 719.30 MB
Memory usage after optimization is: 344.66 MB
Decreased by 52.08%
dtypes:
stock_id                     int16
date_id                      int16
seconds_in_bucket            int16
imbalance_size             float32
imbalance_buy_sell_flag       int8
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap           

In [8]:
df_train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180603.0,1,0.999812,13380277.0,1.0,1.0,0.999812,60651.5,1.000026,8493.030273,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.9,-1,0.999896,1642214.25,1.0,1.0,0.999896,3233.040039,1.00066,20605.089844,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.9,-1,0.999561,1819368.0,1.0,1.0,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917680.0,-1,1.000171,18389746.0,1.0,1.0,0.999999,2324.899902,1.000214,479032.40625,1.0,-4.010201,0,0_0_3
4,4,0,0,447550.0,-1,0.999532,17860614.0,1.0,1.0,0.999394,16485.539062,1.000016,434.100006,1.0,-7.349849,0,0_0_4


In [9]:
extracted_features = extract_features(df_train, column_id="stock_id", column_sort="time_id")

Feature Extraction:   0%|          | 0/40 [04:03<?, ?it/s]


MemoryError: Unable to allocate 2.61 GiB for an array with shape (26454, 26454, 2) and data type int16

In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y)

In [None]:
from tsfresh import extract_relevant_features

features_filtered_direct = extract_relevant_features(timeseries, y,
                                                     column_id='id', column_sort='time')