In [1]:
import time
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import seaborn as sn
from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder

Примерно так будет выглядеть каждый из ваших датасетов для пользователя. Датасеты удобно оформлять в виде классов — так ими проще пользоваться.

In [4]:
class DatasetName:
    """
    Template dataset wrapper: holds the raw frames and builds model-ready features.

    Typical usage: ``DatasetName.load().prepare()``.
    """
    train_data: pd.DataFrame
    test_data: Optional[pd.DataFrame]  # None when only a training file was loaded

    # Name of the label column in the raw training frame.
    TARGET = 'target'
    # CSV read by `load` when no explicit path is given.
    DEFAULT_TRAIN_PATH = "data/may_train.csv"
    # Columns aggregated row-wise by `_add_stat_features`.
    CONTINUOUS_FEATURES = ('f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06',
                           'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25',
                           'f_26', 'f_28')
    # Raw columns removed from the final feature matrices.
    DROPPED_COLUMNS = ('id', 'f_25', 'f_26', 'f_27', 'f_28', 'f_29', 'f_30')

    def __init__(self, train_data: pd.DataFrame, test_data: Optional[pd.DataFrame] = None):
        """
        Store the raw frames. Prefer the `load` factory method over calling this directly.

        :param train_data: raw training frame, including the target column
        :param test_data: optional raw test frame (``load`` passes None)
        """
        self.train_data = train_data
        self.test_data = test_data

    @classmethod
    def load(cls, path: str = DEFAULT_TRAIN_PATH) -> "DatasetName":
        """
        Read the training CSV and wrap it in a dataset instance.

        :param path: location of the training CSV
        :return: an instance of ``cls`` with ``train_data`` set and ``test_data`` None
        """
        train = pd.read_csv(path)
        # Use `cls` so that subclasses get an instance of their own type.
        return cls(train, None)

    @staticmethod
    def _add_stat_features(df: pd.DataFrame, cols=CONTINUOUS_FEATURES) -> pd.DataFrame:
        """
        Return a copy of ``df`` with row-wise aggregates of ``cols`` appended.

        :param df: frame containing every column named in ``cols``
        :param cols: columns to aggregate across (axis=1)
        :return: new frame; the input frame is left untouched
        """
        values = df[list(cols)]
        row_min = values.min(axis=1)
        row_max = values.max(axis=1)
        row_mean = values.mean(axis=1)
        return df.assign(
            f_sum=values.sum(axis=1),
            f_min=row_min,
            f_max=row_max,
            f_std=values.std(axis=1),
            # Mean absolute deviation around the row mean, computed by hand
            # because DataFrame.mad is not available in pandas >= 2.0.
            f_mad=values.sub(row_mean, axis=0).abs().mean(axis=1),
            f_mean=row_mean,
            f_kurt=values.kurt(axis=1),
            f_prod=values.prod(axis=1),
            f_range=row_max - row_min,
            f_count_pos=values.gt(0).sum(axis=1),
            f_count_neg=values.lt(0).sum(axis=1),
        )

    @staticmethod
    def _add_interaction_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``df`` with three ternary (-1 / 0 / +1) interaction features.

        Each feature is +1 when a sum of raw columns exceeds an upper threshold,
        -1 when it falls below a lower threshold, and 0 otherwise. The thresholds
        are hard-coded; their derivation is not recorded in this file.
        """
        def ternary(total: pd.Series, upper: float, lower: float) -> pd.Series:
            return (total > upper).astype(int) - (total < lower).astype(int)

        return df.assign(
            i_02_21=ternary(df['f_21'] + df['f_02'], 5.2, -5.3),
            i_05_22=ternary(df['f_22'] + df['f_05'], 5.1, -5.4),
            i_00_01_26=ternary(df['f_00'] + df['f_01'] + df['f_26'], 5.0, -5.0),
        )

    @classmethod
    def _build_features(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every feature-engineering step to one split and drop unused raw columns."""
        df = cls._add_stat_features(df, cls.CONTINUOUS_FEATURES)
        df = cls._add_interaction_features(df)
        # f_27 is dropped without encoding: a label-encoded copy would be
        # discarded here anyway, and an encoder fitted separately on each split
        # would give codes that are not comparable between train and test.
        return df.drop(columns=list(cls.DROPPED_COLUMNS))

    def prepare(self, test_size: float = 0.3,
                random_state: int = 1729) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """
        does preprocessing with dataset, feature generation, etc.

        Rows with a missing target are discarded, the remainder is split into
        stratified train/test parts, and the same feature pipeline is applied to
        both parts. ``self.train_data`` is not modified, so the method can be
        called repeatedly.

        :param test_size: fraction of labelled rows held out for the test split
        :param random_state: seed passed to ``train_test_split``
        :return: (train_x, train_y, test_x, test_y)
        """
        labelled = self.train_data.dropna(subset=[self.TARGET])
        y = labelled[self.TARGET]
        features = labelled.drop(columns=[self.TARGET])

        x_train, x_test, y_train, y_test = train_test_split(
            features, y,
            test_size=test_size,
            random_state=random_state,
            stratify=y,
        )

        x_train = self._build_features(x_train)
        x_test = self._build_features(x_test)
        return x_train, y_train, x_test, y_test

Вот пример того, как может выглядеть использование подобного интерфейса:

In [3]:
# Load the raw CSV and run preprocessing; prepare() returns (train_x, train_y, test_x, test_y).
x_train, y_train, x_test, y_test = DatasetName.load().prepare()