In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
from typing import Tuple
import time

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from feature_engine.selection import (
    DropDuplicateFeatures,
    DropConstantFeatures,
)

Примерно так каждый из ваших датасетов будет выглядеть для пользователя. Для удобства использования датасеты стоит оформлять в виде классов.

In [4]:
class DatasetName:
    """
    Wrapper around one tabular dataset: loads the raw CSV and turns it into
    encoded train/test splits.

    ``prepare`` expects the columns ``target``, ``ord_1`` .. ``ord_5``,
    ``bin_3``, ``bin_4`` and any number of columns whose name starts with
    ``nom``.
    """

    train_data: pd.DataFrame
    test_data: pd.DataFrame  # None when the instance comes from load()

    _TARGET = 'target'
    _SORTED_ORDINALS = ('ord_3', 'ord_4', 'ord_5')
    _BINARY_COLS = ('bin_3', 'bin_4')
    # NOTE(review): these codes are paired with the values of ord_1 / ord_2 in
    # order of first appearance in the training split, so they are only
    # meaningful for the exact data and split they were tuned on.
    # TODO: replace with an explicit {category: rank} mapping once the category
    # names are confirmed.
    _ORD_1_CODES = (3, 4, 0, 1, 2)
    _ORD_2_CODES = (1, 3, 5, 4, 0, 2)

    # do not call directly; use the load() factory method
    def __init__(self, train_data: pd.DataFrame, test_data: pd.DataFrame):
        self.train_data = train_data
        self.test_data = test_data

    @classmethod
    def load(cls, path: str = "data/cat_train.csv") -> "DatasetName":
        """
        Read the training CSV and wrap it in a dataset instance.

        :param path: location of the training CSV
        :return: instance whose ``train_data`` holds the CSV contents and whose
            ``test_data`` is ``None``
        """
        # cls(...) rather than the literal class name so subclasses get
        # instances of their own type.
        return cls(pd.read_csv(path), None)

    @staticmethod
    def _encode_sorted(train_col: pd.Series, test_col: pd.Series) -> Tuple[np.ndarray, np.ndarray]:
        """
        Integer-encode two columns with one shared vocabulary learned on train.

        :return: (train_codes, test_codes); train categories get 0..k-1 in
            sorted order, values not seen in train (and missing values) get -1
        """
        categories = pd.Categorical(train_col).categories
        train_codes = pd.Categorical(train_col, categories=categories).codes
        test_codes = pd.Categorical(test_col, categories=categories).codes
        return train_codes.astype(np.int64), test_codes.astype(np.int64)

    @staticmethod
    def _quantile_bands(train_col: pd.Series, test_col: pd.Series, q: int = 6) -> Tuple[pd.Series, pd.Series]:
        """
        Bucket both columns into ``q`` quantile bands whose edges come from train.

        :return: (train_bands, test_bands) holding band indices; test values
            outside the train range become NaN
        """
        train_bands, edges = pd.qcut(train_col, q, labels=False, retbins=True)
        # include_lowest keeps a test value equal to the train minimum in band 0,
        # mirroring what qcut does for the train column.
        test_bands = pd.cut(test_col, bins=edges, labels=False, include_lowest=True)
        return train_bands, test_bands

    def prepare(self) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """
        Split ``train_data`` 70/30 (stratified on the target) and encode the
        categorical columns as integers.

        Every encoding is learned on the train split and then applied to the
        test split, so a given category has the same code in both.
        ``self.train_data`` is left untouched, so the method can be called
        repeatedly.

        :return: (train_x, train_y, test_x, test_y)
        """
        target = self.train_data[self._TARGET]
        # drop() without inplace: the loaded frame must keep its target column.
        features = self.train_data.drop(columns=self._TARGET)

        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.3, random_state=1729, stratify=target
        )
        # Private copies: the split may hand back views of `features`, and
        # columns are (re)assigned below.
        x_train, x_test = x_train.copy(), x_test.copy()

        nom_cols = [col for col in x_train.columns if col.startswith('nom')]

        # ord_3..ord_5: code = rank of the value among the sorted train values.
        for col in self._SORTED_ORDINALS:
            x_train[col], x_test[col] = self._encode_sorted(x_train[col], x_test[col])

        # ord_1 / ord_2: hand-picked codes, see NOTE(review) on the constants.
        # Values without a code (only possible if the column has more distinct
        # values than codes, or a value appears only in test) become NaN.
        for col, codes in (('ord_1', self._ORD_1_CODES), ('ord_2', self._ORD_2_CODES)):
            mapping = dict(zip(x_train[col].unique(), codes))
            x_train[col] = x_train[col].map(mapping)
            x_test[col] = x_test[col].map(mapping)

        # Coarse 6-quantile versions of the two widest ordinal columns.
        for col in ('ord_4', 'ord_5'):
            band_col = col + '_band'
            x_train[band_col], x_test[band_col] = self._quantile_bands(x_train[col], x_test[col])

        # Nominal and string-valued binary columns share the sorted-vocabulary
        # encoder; fitting on train only keeps test codes aligned with train.
        for col in nom_cols + list(self._BINARY_COLS):
            x_train[col], x_test[col] = self._encode_sorted(x_train[col], x_test[col])

        return x_train, y_train, x_test, y_test

Вот пример того, как может выглядеть использование подобного интерфейса:

In [3]:
# Load the training CSV and build the encoded train/test split in one call.
x_train, y_train, x_test, y_test = DatasetName.load().prepare()