In [None]:
import pandas as pd

# Load the pre-processed train and test tables. Both files are
# semicolon-separated, so the field separator is given explicitly.
train_df_csv = pd.read_csv('../data/train_processed.csv', sep=';')
test_df_csv = pd.read_csv('../data/test_processed.csv', sep=';')

# Numeric input columns: the request timestamp followed by
# component0 .. component9.
numeric_features = ['request_ts'] + [f'component{i}' for i in range(10)]

# Categorical columns, grouped by what they describe.
# NOTE(review): 'target' is listed here although the training cell also uses
# train_df_csv['target'] as the label -- confirm it is meant to be part of a
# feature list.
categorial_features = [
    # user / geography
    'user_id', 'country_id', 'region_id', 'timezone_id',
    # client software
    'browser', 'browser_version', 'os', 'os_version',
    # label column
    'target',
]

from sklearn.pipeline import Pipeline  # для создания pipeline
from sklearn.compose import ColumnTransformer  # для преобразования колонок
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Импортируем RandomForestClassifier

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split  
from sklearn.metrics import f1_score  
from sklearn.model_selection import StratifiedKFold

class QuantileReplacer(BaseEstimator, TransformerMixin):
    """Clip numeric columns to quantile bounds learned during ``fit``.

    ``fit`` stores, for every numeric column of the training frame, the
    ``threshold`` and ``1 - threshold`` quantiles. ``transform`` then raises
    values below the lower bound up to it and lowers values above the upper
    bound down to it. Values inside the bounds, missing values and columns
    that were not numeric at fit time are left unchanged.

    Parameters
    ----------
    threshold : float, default=0.05
        Tail probability used for the lower quantile; the upper quantile is
        taken at ``1 - threshold``.

    Attributes
    ----------
    quantiles : dict
        Maps column name to ``(low_quantile, high_quantile)``; rebuilt on
        every call to ``fit``.
    """

    def __init__(self, threshold=0.05):
        self.threshold = threshold
        # Initialised here so the attribute always exists; fit() replaces it.
        self.quantiles = {}

    def fit(self, X, y=None):
        """Learn the lower/upper quantile of each numeric column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Build a fresh mapping so a refit on different data cannot keep
        # bounds for columns that are no longer present.
        self.quantiles = {
            col: (
                X[col].quantile(self.threshold),
                X[col].quantile(1 - self.threshold),
            )
            for col in X.select_dtypes(include='number')
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with fitted columns clipped to their bounds.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing every column that was numeric during ``fit``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the input frame is not modified.

        Raises
        ------
        KeyError
            If a column seen during ``fit`` is missing from ``X``.
        """
        X_copy = X.copy()
        for col, (low_quantile, high_quantile) in self.quantiles.items():
            # Clip each tail to its own bound. Choosing a single replacement
            # for both tails would move low outliers to the high quantile
            # (or the reverse).
            X_copy[col] = X_copy[col].clip(lower=low_quantile, upper=high_quantile)
        return X_copy
    
class RareGrouper(BaseEstimator, TransformerMixin):
    """Replace infrequent categories of object-dtype columns with one label.

    ``fit`` records, for every object-dtype column, the values whose relative
    frequency (as computed by ``value_counts(normalize=True)``) is at least
    ``threshold``. ``transform`` keeps those values and replaces every other
    value in the fitted columns with ``other_value``.

    Parameters
    ----------
    threshold : float, default=0.05
        Minimum relative frequency a value needs in the training data to be
        kept as its own category.
    other_value : object, default='Other'
        Replacement written in place of values that are not kept.

    Attributes
    ----------
    freq_dict : dict
        Maps column name to the list of values kept for that column; rebuilt
        on every call to ``fit``.
    """

    def __init__(self, threshold=0.05, other_value='Other'):
        self.threshold = threshold
        self.other_value = other_value
        # Initialised here so the attribute always exists; fit() replaces it.
        self.freq_dict = {}

    def fit(self, X, y=None):
        """Collect the frequent values of each object-dtype column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Build a fresh mapping so a refit cannot keep entries for columns
        # that are absent from the new training data.
        frequent_by_column = {}
        for col in X.select_dtypes(include=['object']):
            freq = X[col].value_counts(normalize=True)
            frequent_by_column[col] = freq[freq >= self.threshold].index.tolist()
        self.freq_dict = frequent_by_column
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with non-frequent values replaced.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing every column that was fitted.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the input frame is not modified.

        Raises
        ------
        KeyError
            If a column seen during ``fit`` is missing from ``X``.
        """
        X_copy = X.copy()
        for col, frequent_values in self.freq_dict.items():
            column = X_copy[col]
            # Vectorised membership test instead of a per-row Python lambda
            # that scans the list of kept values for every element.
            X_copy[col] = column.where(column.isin(frequent_values), self.other_value)
        return X_copy

def _rare_onehot_pipeline():
    """Build a new RareGrouper -> OneHotEncoder pipeline for one categorical column.

    A separate Pipeline object is returned on every call so that each column
    gets its own, independently fitted steps.
    """
    return Pipeline(steps=[
        ('replace_rare', RareGrouper(threshold=0.001, other_value='Other')),
        ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)),
    ])


# request_ts: apply QuantileReplacer with a 0.01 threshold, then standardise.
num_request_ts = ['request_ts']
num_pipe_request_ts = Pipeline(steps=[
    ('QuantReplace', QuantileReplacer(threshold=0.01)),
    ('scaler', StandardScaler()),
])

# Categorical id columns: group rare values, then one-hot encode.
# NOTE(review): RareGrouper only fits object-dtype columns, so if these *_id
# columns are loaded as numbers the 'replace_rare' step does nothing --
# confirm the column dtypes.
cat_country_id = ['country_id']
cat_pipe_country_id = _rare_onehot_pipeline()

cat_region_id = ['region_id']
cat_pipe_region_id = _rare_onehot_pipeline()

cat_timezone_id = ['timezone_id']
cat_pipe_timezone_id = _rare_onehot_pipeline()


In [None]:
# Combine all per-column transformations into a single ColumnTransformer.
# Columns that no transformer lists are dropped (the ColumnTransformer
# default, remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe_request_ts, num_request_ts),
        ('cat_country', cat_pipe_country_id, cat_country_id),
        ('cat_region', cat_pipe_region_id, cat_region_id),
        ('cat_timezone', cat_pipe_timezone_id, cat_timezone_id),
        # Add further categorical and numeric pipelines here as needed
    ]
)

# Full pipeline: preprocessing followed by a RandomForestClassifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# The label must never be part of the feature matrix. categorial_features
# contains 'target', so it is filtered out here; otherwise any transformer
# that passes columns through would leak the label into the model.
target_column = 'target'
feature_columns = [
    col for col in numeric_features + categorial_features if col != target_column
]

# Hold out 20% of the labelled data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    train_df_csv[feature_columns],
    train_df_csv[target_column],
    test_size=0.2,
    random_state=42,
)

# Fit the model
pipeline.fit(X_train, y_train)

# Predict on the held-out part
y_pred = pipeline.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')
