In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import category_encoders as ce
from typing import Tuple
import time

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from feature_engine.selection import (
    DropDuplicateFeatures,
    DropConstantFeatures,
)

Примерно так будет выглядеть каждый из ваших датасетов для пользователя. Датасеты удобно оформлять в виде классов — так с ними проще работать.

In [4]:
class DatasetName:
    """Wrapper around the rain dataset: loads the raw CSV and builds model-ready splits.

    Create instances through :meth:`load`; the constructor only stores the
    frames it is given.
    """

    # Raw training frame as read from disk (still contains the target column).
    train_data: pd.DataFrame
    # Separate test frame. ``load`` passes None here: the hold-out split is
    # carved out of ``train_data`` inside ``prepare``.
    test_data: pd.DataFrame

    # Name of the label column.
    _TARGET = 'RainTomorrow'
    # Categorical columns whose gaps are filled with the training-split mode.
    _MODE_IMPUTED = ('WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday')
    # Upper caps applied to numerical columns to tame outliers.
    _UPPER_CAPS = {
        'Rainfall': 3.2,
        'Evaporation': 21.8,
        'WindSpeed9am': 55,
        'WindSpeed3pm': 57,
    }
    # Categorical columns expanded into one-hot indicator columns.
    _ONE_HOT = ('Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm')

    # do not call directly; use the factory method `load`
    def __init__(self, train_data: pd.DataFrame, test_data: pd.DataFrame):
        self.train_data = train_data
        self.test_data = test_data

    @classmethod
    def load(cls, path: str = "data/rain_train.csv"):
        """
        Factory method: reads the training CSV and wraps it in a dataset object.

        :param path: location of the training CSV
        :return: an instance of ``cls`` whose ``train_data`` holds the file
                 contents and whose ``test_data`` is None
        """
        train = pd.read_csv(path)
        # Use ``cls`` so that subclasses get an instance of their own type.
        return cls(train, None)

    @staticmethod
    def _aligned_dummies(train_col: pd.Series, test_col: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """One-hot encode both columns, forcing the test columns to match the train columns."""
        train_dummies = pd.get_dummies(train_col)
        test_dummies = (
            pd.get_dummies(test_col)
            # Categories unseen in train are dropped; categories absent from
            # test are added as all-zero columns.
            .reindex(columns=train_dummies.columns, fill_value=0)
            # reindex may upcast the frame; restore the indicator dtype of train.
            .astype(train_dummies.dtypes.to_dict())
        )
        return train_dummies, test_dummies

    def prepare(self) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """
        Does preprocessing with the dataset: a stratified 70/30 split, imputation,
        outlier capping and categorical encoding.

        ``self.train_data`` is left untouched, so the method can be called
        repeatedly. All statistics used for imputation (medians, modes) and the
        set of one-hot columns are taken from the training split only.

        Note: indicator columns are named after the raw category values (no
        prefix), so columns derived from different source columns may share a
        name.

        :return: (train_x, train_y, test_x, test_y) -- feature DataFrames and
                 0/1 target Series
        :raises KeyError: if an expected column is missing from ``train_data``
        """
        # 'Yes' -> 1, anything else (including missing values) -> 0.
        y = (self.train_data[self._TARGET] == 'Yes').astype('int64')
        # drop() without inplace returns a new frame and keeps the raw data intact.
        features = self.train_data.drop(columns=self._TARGET)

        x_train, x_test, y_train, y_test = train_test_split(
            features, y,
            test_size=0.3,
            random_state=1729,
            stratify=y,
        )
        # Explicit copies: the splits are modified below, and writing into the
        # objects returned by the splitter triggers chained-assignment warnings
        # (or is silently ignored under pandas Copy-on-Write).
        x_train, x_test = x_train.copy(), x_test.copy()

        numerical = [col for col in x_train.columns if x_train[col].dtypes != 'O']

        # impute missing numerical values with the respective column median of x_train
        for col in numerical:
            col_median = x_train[col].median()
            x_train[col] = x_train[col].fillna(col_median)
            x_test[col] = x_test[col].fillna(col_median)

        # impute missing categorical values with the most frequent value of x_train
        for col in self._MODE_IMPUTED:
            most_frequent = x_train[col].mode()[0]
            x_train[col] = x_train[col].fillna(most_frequent)
            x_test[col] = x_test[col].fillna(most_frequent)

        # cap outliers in numerical variables
        for col, cap in self._UPPER_CAPS.items():
            x_train[col] = x_train[col].clip(upper=cap)
            x_test[col] = x_test[col].clip(upper=cap)

        # encode RainToday variable; the encoder is fitted on the training split only
        encoder = ce.BinaryEncoder(cols=['RainToday'])
        x_train = encoder.fit_transform(x_train)
        x_test = encoder.transform(x_test)

        rain_today_cols = ['RainToday_0', 'RainToday_1']
        train_parts = [x_train[numerical], x_train[rain_today_cols]]
        test_parts = [x_test[numerical], x_test[rain_today_cols]]
        for col in self._ONE_HOT:
            train_dummies, test_dummies = self._aligned_dummies(x_train[col], x_test[col])
            train_parts.append(train_dummies)
            test_parts.append(test_dummies)

        x_train = pd.concat(train_parts, axis=1)
        x_test = pd.concat(test_parts, axis=1)

        return x_train, y_train, x_test, y_test

Вот пример того, как может выглядеть использование подобного интерфейса:

In [3]:
# Read the raw CSV through the factory method, then run preprocessing to obtain
# the train/test feature frames together with their 0/1 targets.
x_train, y_train, x_test, y_test = DatasetName.load().prepare()