In [0]:
!pip install category_encoders

In [0]:
from  datetime import datetime, timedelta
import gc
import pickle
import numpy as np, pandas as pd
import lightgbm as lgb

import category_encoders as ce
# pandas display settings
pd.options.display.max_columns = 50

In [0]:
class Memory_reducer:
  '''
  Reduce RAM usage of pandas data frames by downcasting numeric columns.
  '''
  def __init__(self):
    pass

  @staticmethod
  def DF_mem_reduce(df):
    '''
    Reduce memory size of data frame

    Every integer column is downcast to the smallest integer dtype that holds
    its values and every floating column to the smallest float dtype
    (pd.to_numeric never goes below float32). Columns are replaced in place,
    so the caller's frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    '''
    start_mem = df.memory_usage().sum() / 1024 ** 2
    # "integer"/"floating" match every width; plain "int"/"float" only match
    # the platform default width, so int32/float32 columns would be skipped.
    int_columns = df.select_dtypes(include=["integer"]).columns
    float_columns = df.select_dtypes(include=["floating"]).columns
    for col in int_columns:
      df[col] = pd.to_numeric(df[col], downcast="integer")
    for col in float_columns:
      df[col] = pd.to_numeric(df[col], downcast="float")
    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction))
    return df

In [0]:
class My_Reader(Memory_reducer):
  '''
  CSV reader that can shrink the loaded frame and show it after reading.

  Parameters
  ----------
  mem_red : bool
      When true, the loaded frame is passed through Memory_reducer.DF_mem_reduce.
  info : bool
      When true, the loaded frame is shown with display().
  '''
  def __init__(self, mem_red=True, info=False):
    self.mem_red = mem_red
    self.info = info

  @staticmethod
  def mem_reduce(df):
    ''' Downcast the numeric columns of df and return the frame '''
    return Memory_reducer.DF_mem_reduce(df)

  def read_file(self, path, params=False):
    '''
    Read one CSV file into a data frame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    params : dict or False
        Extra keyword arguments forwarded to pd.read_csv; a falsy value means
        the defaults are used.

    Returns
    -------
    pandas.DataFrame
    '''
    # Take the last path component for both "\" and "/" separators.
    name = str(path).replace('\\', '/').split('/')[-1]
    print(f'reading file {name}  ', end='')
    if not params:
      data = pd.read_csv(path)
    else:
      data = pd.read_csv(path, **params)
    print('OK')
    if self.mem_red:
      data = My_Reader.mem_reduce(data)
    if self.info:
      # display is the IPython built-in available inside a notebook kernel.
      display(data)
    gc.collect()
    return data

In [0]:
class Fea_Fabric:
  '''
  Produce some features

  The methods operate on attributes this class never assigns itself:
  self.data for the lag, rolling and date features, and self.train,
  self.valid and self.target for target encoding.
  '''
  def __init__(self):
    pass

  @staticmethod
  def _group_keys(group):
    ''' Return the grouping key(s) as a new list, leaving the caller's object untouched '''
    if isinstance(group, (list, tuple)):
      return list(group)
    return [group]

  def add_lag(self,  col_name, lag, group=False):
    '''
    Add one lag

    Writes column "<col_name>_lag_<lag>" into self.data. With a truthy group
    (a column name or a list of column names) the shift is done per group,
    otherwise over the whole column.
    '''
    lag_col = f"{col_name}_lag_{lag}"
    if group:
      keys = Fea_Fabric._group_keys(group)
      # list.append returns None and mutates the caller's list, so build the
      # column selection from a copy instead.
      self.data[lag_col] = self.data[keys + [col_name]].groupby(keys)[col_name].shift(lag)
    else:
      self.data[lag_col] = self.data[col_name].shift(lag)

  def add_lags(self, col_name, lags, group=False,):
    '''
    Add many lag

    lags is either a single lag or a list/tuple/range of lags.
    '''
    if isinstance(lags, (list, tuple, range)):
      for lag in lags:
        self.add_lag(col_name, lag, group)
    else:
      self.add_lag(col_name, lags, group)

  def add_rolling_window(self, col_name, win, func, group=False):
    '''
    Add rolling window

    Writes column "r<func>_<col_name>_<win>" into self.data, holding func
    aggregated over a rolling window of size win. With a truthy group the
    window is evaluated inside each group.
    '''
    new_col = f'r{func}_{col_name}_{win}'
    if group:
      keys = Fea_Fabric._group_keys(group)
      self.data[new_col] = self.data[keys + [col_name]] \
                               .groupby(keys)[col_name] \
                               .transform(lambda x : x.rolling(win).agg(func))
    else:
      self.data[new_col] = self.data[col_name].rolling(win).agg(func)


  def add_rolling_windows(self, col_name, wins, func, group=False):
    ''' Add many rolling windows, one per entry of wins '''
    for win in wins:
      self.add_rolling_window(col_name, win, func, group)

  def add_date_features(self, date_col):
    '''
    Convert datetime column into separate year, month,
    week of year, quarter, week day, month day features

    A feature whose column already exists in self.data is only cast to int16;
    the others are derived from self.data[date_col].
    '''
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
                    }
    for date_feat_name, date_feat_func in date_features.items():
      if date_feat_name in self.data.columns:
        self.data[date_feat_name] = self.data[date_feat_name].astype("int16")
        continue
      accessor = self.data[date_col].dt
      if date_feat_func == "weekofyear" and hasattr(accessor, "isocalendar"):
        # .dt.weekofyear is gone from recent pandas; isocalendar().week is
        # its replacement.
        values = accessor.isocalendar().week
      else:
        values = getattr(accessor, date_feat_func)
      self.data[date_feat_name] = values.astype("int16")

  def target_encoding(self, cat_col):
    '''
    Target encoding one or many columns

    The encoder is fitted on self.train only and then applied to both
    self.train and self.valid; the encoded columns are joined with a
    "_target" suffix.
    '''
    target_enc = ce.TargetEncoder(cols=cat_col)

    # Fit the encoder using the categorical features and target
    target_enc.fit(self.train[cat_col], self.train[self.target])

    # Transform the features, rename the columns with _target suffix, and join to dataframe
    self.train = self.train.join(target_enc.transform(self.train[cat_col]).add_suffix('_target'))
    self.valid = self.valid.join(target_enc.transform(self.valid[cat_col]).add_suffix('_target'))

In [0]:
class Custom_CV_Splitter:
  '''
  Base class for custom cross validation splitters.

  Stores the data to split and the fold definitions; subclasses supply the
  actual splitting logic.
  '''
  def __init__(self, data, folds):
    self.data, self.folds = data, folds

  def get_slice(self):
    ''' Placeholder hook: does nothing and returns None '''
    return None

class Custom_TSS(Custom_CV_Splitter):
  '''
  Time series splitter.

  Each fold is a (start, split, end) triple compared against self.data:
  training rows satisfy start <= value < split, validation rows satisfy
  split <= value <= end.
  '''
  def __init__(self, data, folds):
    super().__init__(data, folds)

  def make_sample_gen(self):
    ''' Yield one (train index labels, validation index labels) pair of lists per fold '''
    for fold_start, fold_split, fold_end in self.folds:
      in_train = (self.data >= fold_start) & (self.data < fold_split)
      train_labels = list(self.data.loc[in_train].index)
      in_valid = (self.data >= fold_split) & (self.data <= fold_end)
      valid_labels = list(self.data.loc[in_valid].index)
      yield (train_labels, valid_labels)

In [0]:
class My_Pipeline(Fea_Fabric):
  '''
  Feature engineering and LightGBM training pipeline.

  Every keyword argument given to the constructor becomes an instance
  attribute. The methods below read self.data, self.target and
  self.date_list, none of which is assigned by the constructor itself, so they
  have to be supplied that way (or set afterwards).
  '''
  def __init__(self, seed=42, **kwargs):
    np.random.seed(seed)
    for key, value in kwargs.items():
      setattr(self, key, value)

  def __repr__(self):
    # self.data is deleted by train_val_test_split; repr must not raise then.
    if not hasattr(self, 'data'):
      return f'<{type(self).__name__}: no data attribute>'
    return repr(self.data)

  def save_data(self):
    ''' Pickle self.data into M5_data.pkl in the working directory '''
    with open('M5_data.pkl', 'wb') as F:
      pickle.dump(self.data, F)

  def continue_work(self):
    ''' Restore self.data from M5_data.pkl in the working directory '''
    # NOTE: pickle.load can execute arbitrary code; only load a file that was
    # written by save_data on a trusted machine.
    with open('M5_data.pkl', 'rb') as F:
      self.data = pickle.load(F)


  @staticmethod
  def cat_encoding(data):
    '''
    Convert cat features into int16

    Each category column is replaced in place by its integer codes, shifted
    so that the smallest code in the column becomes 0.
    '''
    for col in data.columns:
      if str(data[col].dtype) == 'category':
        data[col] = data[col].cat.codes.astype('int16')
        data[col] -= data[col].min()

  # TODO: planned signature: read_data(self, is_train=True, nrows=None, first_day=1200)
  def read_data(self,):
    ''' Read and transform data into one data frame (not implemented yet) '''
    pass

  def add_features(self, fea_dict):
    '''
    Add many different features at the same time

    fea_dict maps a Fea_Fabric method name to the dict of keyword arguments
    that method is called with. An unknown name raises KeyError.
    '''
    for method_name, params in fea_dict.items():
      Fea_Fabric.__dict__[method_name](self, **params)

  def train_val_test_split(self, col_name, split_points):
    '''
    Split set in train, validation and test samples

    Rows with col_name <= split_points[0] go to self.train, rows in
    (split_points[0], split_points[1]] to self.valid and the remaining rows
    to self.test. self.data is deleted afterwards.
    '''
    values = self.data[col_name]
    self.train = self.data.loc[values <= split_points[0]]
    self.valid = self.data.loc[(values > split_points[0]) & (values <= split_points[1])]
    self.test = self.data.loc[values > split_points[1]]

    del values
    del self.data
    gc.collect()

  @staticmethod
  def features_filter(data, useful_fea=None, useless_fea=None):
    '''
    Drop columns from data in place.

    When useful_fea is given, every column not listed in it is dropped and
    useless_fea is ignored; otherwise the columns in useless_fea are dropped.
    With neither list nothing is dropped.
    '''
    if useful_fea:
      useless_fea = data.columns[~data.columns.isin(useful_fea)]
    if useless_fea is None or len(useless_fea) == 0:
      return
    data.drop(columns=useless_fea, inplace=True)
    gc.collect()

  def create_lgb_dataset(self, cat_feas='auto', train=False, valid=False, test=True,
                         useful_fea=None, useless_fea=None, drop_nan=True):
    '''
    Create lgb datasets

    Builds self.train_data and/or self.valid_data from self.train and
    self.valid, then deletes both frames.

    Parameters
    ----------
    cat_feas : passed to lgb.Dataset as categorical_feature.
    train, valid : build the corresponding dataset when true.
    test : currently unused.
    useful_fea, useless_fea : column filters, see features_filter.
    drop_nan : drop rows with missing values from self.train first.
    '''
    if drop_nan:
      self.train = self.train.dropna()
    # The label column must survive a whitelist filter because it is read
    # right after filtering.
    keep = list(useful_fea) + [self.target] if useful_fea else useful_fea
    if train:
      self.features_filter(self.train, useful_fea=keep, useless_fea=useless_fea)
      self.train_data = lgb.Dataset(self.train.drop(columns=[self.target]), label = self.train[self.target],
                                    categorical_feature=cat_feas,
                                    free_raw_data=False)
    if valid:
      self.features_filter(self.valid, useful_fea=keep, useless_fea=useless_fea)
      self.valid_data = lgb.Dataset(self.valid.drop(columns=[self.target]), label = self.valid[self.target],
                                    categorical_feature=cat_feas,
                                    free_raw_data=False)

    del self.valid, self.train
    gc.collect()

  def set_CV(self, CV=False, folds=None):
    ''' Create TSS generator over self.date_list when CV == 'tss' '''
    if CV == 'tss':
      tss = Custom_TSS(self.date_list, folds)
      self.CV_gen = tss.make_sample_gen()

  @staticmethod
  def _eval_logging_kwargs(period):
    ''' Keyword arguments that make lgb.train log every `period` iterations '''
    # Newer LightGBM dropped verbose_eval in favour of the log_evaluation
    # callback; older releases only know verbose_eval.
    log_evaluation = getattr(lgb, 'log_evaluation', None)
    if log_evaluation is not None:
      return {'callbacks': [log_evaluation(period)]}
    return {'verbose_eval': period}

  def fit_model(self, params, CV=False,):
    '''
    Train self.lgb_model on self.train_data.

    With a truthy CV, self.valid_data is passed as the validation set.
    '''
    log_kwargs = My_Pipeline._eval_logging_kwargs(20)
    if CV:
      self.lgb_model = lgb.train(params, self.train_data, valid_sets = self.valid_data, **log_kwargs)
    else:
      self.lgb_model = lgb.train(params, self.train_data, **log_kwargs)

  def predict(self):
    '''
    Predict the target for self.test.

    Stores a frame with the 'id' column of self.test and the predictions
    (under the target name) in self.predictions.
    '''
    features = self.test.drop(columns=[self.target], errors='ignore')
    # self.test is never filtered the way the training frame is, so select
    # the columns the model was trained on whenever they are all present.
    model_feas = self.lgb_model.feature_name()
    if all(fea in features.columns for fea in model_feas):
      features = features[model_feas]
    self.predictions = self.test[['id']].copy()
    self.predictions[self.target] = self.lgb_model.predict(features)