In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Union
import warnings
warnings.filterwarnings('ignore')

print("All imports successful!")

# ============================================
# 1. ГЕНЕРАЦИЯ СИНТЕТИЧЕСКИХ ДАННЫХ
# ============================================

def generate_synthetic_data(n_users=1000, n_items=500, n_interactions=5000):
    """Генерация реалистичных синтетических данных"""

    np.random.seed(42)

    # Пользователи
    users = np.arange(1, n_users + 1)

    # Демографические признаки пользователей
    user_features = {
        'user_id': users,
        'age': np.random.randint(18, 65, n_users),
        'gender': np.random.choice(['M', 'F'], n_users, p=[0.55, 0.45]),
        'location': np.random.choice(['NY', 'LA', 'SF', 'CH', 'MI'], n_users),
        'join_date': pd.date_range('2020-01-01', periods=n_users, freq='D'),
        'preferred_category': np.random.choice(['tech', 'sports', 'music', 'movies', 'books'], n_users)
    }

    # Товары
    items = np.arange(1, n_items + 1)

    # Контентные признаки товаров
    categories = ['electronics', 'books', 'clothing', 'home', 'sports', 'beauty']
    item_features = {
        'item_id': items,
        'category': np.random.choice(categories, n_items),
        'price': np.random.uniform(5, 500, n_items).round(2),
        'rating': np.random.uniform(1, 5, n_items).round(1),
        'reviews_count': np.random.randint(0, 1000, n_items),
        'description': [f"product_{i} description with features and benefits" for i in items]
    }

    # Взаимодействия пользователь-товар
    interactions = []
    for _ in range(n_interactions):
        user = np.random.choice(users)
        item = np.random.choice(items)
        rating = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.2, 0.4, 0.25])
        timestamp = pd.Timestamp('2023-01-01') + pd.Timedelta(days=np.random.randint(0, 365))
        interactions.append([user, item, rating, timestamp])

    # Контекстуальные данные (сессии)
    sessions = []
    for _ in range(n_interactions // 10):  # меньше сессий чем взаимодействий
        session_id = np.random.randint(1000, 9999)
        user = np.random.choice(users)
        session_items = np.random.choice(items, size=np.random.randint(2, 10), replace=False)
        for item in session_items:
            sessions.append([session_id, user, item])

    # Создаем DataFrames
    user_df = pd.DataFrame(user_features)
    item_df = pd.DataFrame(item_features)
    interactions_df = pd.DataFrame(interactions, columns=['user_id', 'item_id', 'rating', 'timestamp'])
    sessions_df = pd.DataFrame(sessions, columns=['session_id', 'user_id', 'item_id'])

    # Добавляем дополнительные вычисляемые признаки
    # Популярность товара
    item_popularity = interactions_df.groupby('item_id').size().reset_index(name='popularity')
    item_df = item_df.merge(item_popularity, on='item_id', how='left').fillna(0)

    # Активность пользователя
    user_activity = interactions_df.groupby('user_id').size().reset_index(name='activity_count')
    user_df = user_df.merge(user_activity, on='user_id', how='left').fillna(0)

    return user_df, item_df, interactions_df, sessions_df

# ============================================
# 2. ФУНКЦИИ FEATURE EXTRACTION
# ============================================

class FeatureExtractor:
    """Извлечение признаков из разных источников"""

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        self.item_svd = TruncatedSVD(n_components=10)
        self.user_svd = TruncatedSVD(n_components=10)

    def extract_user_features(self, user_df: pd.DataFrame) -> pd.DataFrame:
        """Извлечение признаков пользователей"""

        # Копируем для безопасности
        user_features = user_df.copy()

        # 1. Демографические признаки (one-hot encoding)
        demo_features = pd.get_dummies(
            user_features[['gender', 'location', 'preferred_category']],
            prefix=['gender', 'loc', 'cat']
        )

        # 2. Числовые признаки (normalization)
        numeric_cols = ['age', 'activity_count']
        user_features[numeric_cols] = self.scaler.fit_transform(user_features[numeric_cols])

        # 3. Временные признаки из join_date
        user_features['join_year'] = pd.to_datetime(user_features['join_date']).dt.year
        user_features['join_month'] = pd.to_datetime(user_features['join_date']).dt.month
        user_features['join_day'] = pd.to_datetime(user_features['join_date']).dt.day
        user_features['days_since_join'] = (
            pd.Timestamp.now() - pd.to_datetime(user_features['join_date'])
        ).dt.days

        # 4. Объединяем все признаки пользователей
        final_features = pd.concat([
            user_features[['user_id'] + numeric_cols + ['join_year', 'join_month', 'join_day', 'days_since_join']],
            demo_features
        ], axis=1)

        return final_features

    def extract_item_features(self, item_df: pd.DataFrame) -> pd.DataFrame:
        """Извлечение признаков товаров"""

        item_features = item_df.copy()

        # 1. Категориальные признаки (one-hot)
        cat_features = pd.get_dummies(
            item_features[['category']],
            prefix='cat'
        )

        # 2. Числовые признаки (normalization)
        numeric_cols = ['price', 'rating', 'reviews_count', 'popularity']
        item_features[numeric_cols] = self.scaler.fit_transform(item_features[numeric_cols])

        # 3. Текстовые признаки из описания (TF-IDF + SVD)
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(item_features['description'])
        svd_features = self.item_svd.fit_transform(tfidf_matrix)
        svd_cols = [f'item_svd_{i}' for i in range(svd_features.shape[1])]
        svd_df = pd.DataFrame(svd_features, columns=svd_cols)

        # 4. Объединяем все признаки товаров
        final_features = pd.concat([
            item_features[['item_id'] + numeric_cols],
            cat_features,
            svd_df
        ], axis=1)

        return final_features

    def extract_interaction_features(self, interactions_df: pd.DataFrame,
                                    user_features: pd.DataFrame,
                                    item_features: pd.DataFrame) -> pd.DataFrame:
        """Извлечение признаков взаимодействий"""

        interactions = interactions_df.copy()

        # 1. Временные признаки
        interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])
        interactions['hour'] = interactions['timestamp'].dt.hour
        interactions['day_of_week'] = interactions['timestamp'].dt.dayofweek
        interactions['month'] = interactions['timestamp'].dt.month

        # 2. Признаки на основе рейтинга
        interactions['rating_bin'] = pd.cut(interactions['rating'],
                                           bins=[0, 2, 3, 4, 5],
                                           labels=['low', 'medium_low', 'medium_high', 'high'])

        # 3. One-hot encoding временных признаков
        time_features = pd.get_dummies(
            interactions[['hour', 'day_of_week', 'month', 'rating_bin']],
            prefix=['hour', 'dow', 'month', 'rating']
        )

        # 4. Объединяем с признаками пользователей и товаров
        combined = interactions.merge(
            user_features, on='user_id', how='left'
        ).merge(
            item_features, on='item_id', how='left'
        )

        # 5. Удаляем исходные столбцы и объединяем с временными признаками
        columns_to_drop = ['timestamp', 'rating', 'hour', 'day_of_week', 'month', 'rating_bin']
        combined = combined.drop(columns=columns_to_drop, errors='ignore')
        combined = pd.concat([combined, time_features], axis=1)

        return combined.fillna(0)

    def extract_session_features(self, sessions_df: pd.DataFrame,
                               item_features: pd.DataFrame) -> pd.DataFrame:
        """Извлечение сессионных признаков"""

        sessions = sessions_df.copy()

        # 1. Агрегация на уровне сессии
        session_stats = sessions.groupby('session_id').agg({
            'item_id': ['count', lambda x: list(x)],
            'user_id': 'first'
        }).reset_index()

        session_stats.columns = ['session_id', 'session_length', 'item_sequence', 'user_id']

        # 2. Признаки сессии
        session_features = pd.DataFrame()
        session_features['session_id'] = session_stats['session_id']
        session_features['session_length'] = session_stats['session_length']

        # 3. Средние признаки товаров в сессии
        session_item_features = []
        for items in session_stats['item_sequence']:
            item_feats = item_features[item_features['item_id'].isin(items)]
            if len(item_feats) > 0:
                avg_features = item_feats.drop('item_id', axis=1).mean().values
            else:
                avg_features = np.zeros(item_features.shape[1] - 1)
            session_item_features.append(avg_features)

        # Создаем DataFrame с признаками товаров сессии
        session_item_df = pd.DataFrame(
            session_item_features,
            columns=[f'session_item_{i}' for i in range(len(session_item_features[0]))]
        )

        # 4. Объединяем все сессионные признаки
        final_features = pd.concat([session_features, session_item_df], axis=1)

        return final_features

# ============================================
# 3. КЛАСС ДЛЯ FEATURE COMBINATION
# ============================================

class FeatureCombinationRecommender:
    """
    Рекомендательная система с комбинацией признаков
    """

    def __init__(self, combination_method='concatenate'):
        """
        Args:
            combination_method: метод комбинации признаков
                - 'concatenate': простая конкатенация
                - 'weighted': взвешенная комбинация
                - 'neural': нейросетевая комбинация
                - 'attention': с механизмом внимания
        """
        self.combination_method = combination_method
        self.feature_extractor = FeatureExtractor()
        self.models = {}
        self.feature_weights = None
        self.scaler = StandardScaler()

    def prepare_features(self, user_df, item_df, interactions_df, sessions_df=None):
        """Подготовка и комбинация всех признаков"""

        print("Step 1: Extracting user features...")
        user_features = self.feature_extractor.extract_user_features(user_df)

        print("Step 2: Extracting item features...")
        item_features = self.feature_extractor.extract_item_features(item_df)

        print("Step 3: Extracting interaction features...")
        interaction_features = self.feature_extractor.extract_interaction_features(
            interactions_df, user_features, item_features
        )

        # Комбинируем признаки в зависимости от метода
        if self.combination_method == 'concatenate':
            combined_features = self._concatenate_features(
                interaction_features, user_features, item_features
            )
        elif self.combination_method == 'weighted':
            combined_features = self._weighted_combination(
                interaction_features, user_features, item_features
            )
        elif self.combination_method == 'neural':
            combined_features = self._neural_combination(
                interaction_features, user_features, item_features
            )
        elif self.combination_method == 'attention':
            combined_features = self._attention_combination(
                interaction_features, user_features, item_features
            )
        else:
            raise ValueError(f"Unknown combination method: {self.combination_method}")

        # Добавляем сессионные признаки если есть
        if sessions_df is not None and len(sessions_df) > 0:
            print("Step 4: Extracting session features...")
            session_features = self.feature_extractor.extract_session_features(
                sessions_df, item_features
            )
            combined_features = combined_features.merge(
                session_features, on='user_id', how='left'
            ).fillna(0)

        # Сохраняем информацию о признаках
        self.feature_info = {
            'user_features': list(user_features.columns),
            'item_features': list(item_features.columns),
            'interaction_features': list(interaction_features.columns),
            'combined_features': list(combined_features.columns)
        }

        print(f"Feature extraction complete!")
        print(f"  User features: {len(user_features.columns)}")
        print(f"  Item features: {len(item_features.columns)}")
        print(f"  Interaction features: {len(interaction_features.columns)}")
        print(f"  Total combined features: {len(combined_features.columns)}")

        return combined_features

    def _concatenate_features(self, interaction_features, user_features, item_features):
        """Простая конкатенация всех признаков"""

        # Объединяем все признаки
        combined = interaction_features.merge(
            user_features, on='user_id', how='left'
        ).merge(
            item_features, on='item_id', how='left'
        ).fillna(0)

        # Удаляем дублирующиеся колонки
        combined = combined.loc[:, ~combined.columns.duplicated()]

        return combined

    def _weighted_combination(self, interaction_features, user_features, item_features):
        """Взвешенная комбинация признаков"""

        # Получаем признаки отдельно
        interaction_only = interaction_features.drop(['user_id', 'item_id'], axis=1, errors='ignore')
        user_only = user_features.drop(['user_id'], axis=1, errors='ignore')
        item_only = item_features.drop(['item_id'], axis=1, errors='ignore')

        # Определяем веса (можно настраивать)
        interaction_weight = 0.4
        user_weight = 0.3
        item_weight = 0.3

        # Нормализуем признаки
        interaction_scaled = self.scaler.fit_transform(interaction_only)
        user_scaled = self.scaler.fit_transform(user_only)
        item_scaled = self.scaler.fit_transform(item_only)

        # Взвешенная комбинация
        weighted_features = []
        for i in range(len(interaction_features)):
            # Для каждого взаимодействия комбинируем признаки
            user_id = interaction_features.iloc[i]['user_id']
            item_id = interaction_features.iloc[i]['item_id']

            # Находим соответствующие признаки пользователя и товара
            user_idx = user_features[user_features['user_id'] == user_id].index
            item_idx = item_features[item_features['item_id'] == item_id].index

            if len(user_idx) > 0 and len(item_idx) > 0:
                user_vec = user_scaled[user_idx[0]]
                item_vec = item_scaled[item_idx[0]]
                inter_vec = interaction_scaled[i]

                # Взвешенная сумма
                combined = (
                    inter_vec * interaction_weight +
                    user_vec * user_weight +
                    item_vec * item_weight
                )

                weighted_features.append(combined)
            else:
                # Fallback: используем только interaction features
                weighted_features.append(inter_vec)

        # Создаем DataFrame с комбинированными признаками
        weighted_df = pd.DataFrame(
            weighted_features,
            columns=[f'weighted_feat_{i}' for i in range(weighted_features[0].shape[0])]
        )

        # Добавляем ID
        weighted_df['user_id'] = interaction_features['user_id'].values
        weighted_df['item_id'] = interaction_features['item_id'].values

        return weighted_df

    def _neural_combination(self, interaction_features, user_features, item_features):
        """Нейросетевая комбинация признаков"""

        # Преобразуем в тензоры
        inter_tensor = torch.FloatTensor(
            interaction_features.drop(['user_id', 'item_id'], axis=1, errors='ignore').values
        )
        user_tensor = torch.FloatTensor(
            user_features.drop(['user_id'], axis=1, errors='ignore').values
        )
        item_tensor = torch.FloatTensor(
            item_features.drop(['item_id'], axis=1, errors='ignore').values
        )

        # Простая нейросеть для комбинации
        class NeuralCombiner(nn.Module):
            def __init__(self, inter_dim, user_dim, item_dim, hidden_dim=64, output_dim=32):
                super().__init__()
                self.inter_fc = nn.Linear(inter_dim, hidden_dim)
                self.user_fc = nn.Linear(user_dim, hidden_dim)
                self.item_fc = nn.Linear(item_dim, hidden_dim)
                self.combine_fc = nn.Linear(hidden_dim * 3, output_dim)
                self.relu = nn.ReLU()

            def forward(self, inter_vec, user_vec, item_vec):
                inter_hidden = self.relu(self.inter_fc(inter_vec))
                user_hidden = self.relu(self.user_fc(user_vec))
                item_hidden = self.relu(self.item_fc(item_vec))

                combined = torch.cat([inter_hidden, user_hidden, item_hidden], dim=1)
                output = self.combine_fc(combined)
                return output

        # Инициализируем модель
        combiner = NeuralCombiner(
            inter_dim=inter_tensor.shape[1],
            user_dim=user_tensor.shape[1],
            item_dim=item_tensor.shape[1]
        )

        # Комбинируем признаки
        combined_features = []
        for i in range(len(interaction_features)):
            user_id = interaction_features.iloc[i]['user_id']
            item_id = interaction_features.iloc[i]['item_id']

            # Находим индексы
            user_idx = user_features[user_features['user_id'] == user_id].index
            item_idx = item_features[item_features['item_id'] == item_id].index

            if len(user_idx) > 0 and len(item_idx) > 0:
                # Комбинируем через нейросеть
                inter_vec = inter_tensor[i].unsqueeze(0)
                user_vec = user_tensor[user_idx[0]].unsqueeze(0)
                item_vec = item_tensor[item_idx[0]].unsqueeze(0)

                with torch.no_grad():
                    combined = combiner(inter_vec, user_vec, item_vec)
                combined_features.append(combined.numpy().flatten())
            else:
                # Fallback
                combined_features.append(np.zeros(32))

        # Создаем DataFrame
        combined_df = pd.DataFrame(
            combined_features,
            columns=[f'neural_feat_{i}' for i in range(len(combined_features[0]))]
        )
        combined_df['user_id'] = interaction_features['user_id'].values
        combined_df['item_id'] = interaction_features['item_id'].values

        return combined_df

    def _attention_combination(self, interaction_features, user_features, item_features):
        """Комбинация с механизмом внимания"""

        # Упрощенная реализация attention
        print("Using attention-based feature combination...")

        # Для простоты используем конкатенацию с дополнительными attention-признаками
        combined = self._concatenate_features(interaction_features, user_features, item_features)

        # Добавляем простые attention-like features
        # Например, важность пользовательских vs товарных признаков

        # Вычисляем "важность" признаков на основе вариации
        user_cols = [col for col in combined.columns if 'user' in col or 'age' in col or 'gender' in col]
        item_cols = [col for col in combined.columns if 'item' in col or 'price' in col or 'category' in col]

        if user_cols and item_cols:
            # Простой attention score
            user_importance = combined[user_cols].std(axis=1).mean()
            item_importance = combined[item_cols].std(axis=1).mean()

            total_importance = user_importance + item_importance + 1e-10

            combined['user_attention_weight'] = user_importance / total_importance
            combined['item_attention_weight'] = item_importance / total_importance

            # Взвешенные суммы
            combined['weighted_user_feature'] = combined[user_cols].mean(axis=1) * combined['user_attention_weight']
            combined['weighted_item_feature'] = combined[item_cols].mean(axis=1) * combined['item_attention_weight']

        return combined

    def train_model(self, features, target_column='rating', model_type='xgboost'):
        """Обучение модели на комбинированных признаках"""

        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import LogisticRegression
        import xgboost as xgb
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error, accuracy_score

        print(f"\nTraining {model_type} model on combined features...")

        # Подготовка данных
        if target_column in features.columns:
            X = features.drop(columns=[target_column, 'user_id', 'item_id'], errors='ignore')
            y = features[target_column]
        else:
            # Если нет целевой переменной, создаем бинарную (взаимодействие было/не было)
            X = features.drop(columns=['user_id', 'item_id'], errors='ignore')
            y = np.ones(len(features))  # все взаимодействия положительные

            # Для баланса можно добавить negative sampling

        # Разделение на train/test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Выбор модели
        if model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=100, random_state=42)
        elif model_type == 'logistic':
            model = LogisticRegression(max_iter=1000, random_state=42)
            y_train = (y_train > 3).astype(int)  # бинарная классификация
            y_test = (y_test > 3).astype(int)
        elif model_type == 'xgboost':
            model = xgb.XGBRegressor(
                n_estimators=100,
                learning_rate=0.1,
                random_state=42,
                objective='reg:squarederror'
            )
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        # Обучение
        model.fit(X_train, y_train)

        # Предсказание и оценка
        y_pred = model.predict(X_test)

        if model_type == 'logistic':
            y_pred_binary = (y_pred > 0.5).astype(int)
            accuracy = accuracy_score(y_test, y_pred_binary)
            print(f"Accuracy: {accuracy:.4f}")
            metric = accuracy
        else:
            mse = mean_squared_error(y_test, y_pred)
            print(f"MSE: {mse:.4f}")
            metric = mse

        # Сохраняем модель
        self.models[model_type] = {
            'model': model,
            'feature_names': list(X.columns),
            'performance': metric
        }

        return model, metric

    def recommend(self, user_id, item_ids, model_type='xgboost', top_k=5):
        """Генерация рекомендаций для пользователя"""

        if model_type not in self.models:
            raise ValueError(f"Model {model_type} not trained yet!")

        model_info = self.models[model_type]
        model = model_info['model']

        # Создаем фиктивные признаки для предсказания
        # В реальной системе здесь были бы реальные признаки
        predictions = []

        for item_id in item_ids:
            # Создаем вектор признаков для пары (user_id, item_id)
            # В реальной системе нужно извлекать реальные признаки
            feature_vector = np.random.randn(len(model_info['feature_names']))

            # Предсказание
            if hasattr(model, 'predict_proba'):
                pred = model.predict_proba([feature_vector])[0][1]
            else:
                pred = model.predict([feature_vector])[0]

            predictions.append((item_id, pred))

        # Сортируем по убыванию score
        predictions.sort(key=lambda x: x[1], reverse=True)

        return predictions[:top_k]

    def plot_feature_importance(self, model_type='xgboost'):
        """Визуализация важности признаков"""

        if model_type not in self.models:
            raise ValueError(f"Model {model_type} not trained yet!")

        model = self.models[model_type]['model']
        feature_names = self.models[model_type]['feature_names']

        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_

            # Сортируем по важности
            indices = np.argsort(importances)[::-1]

            # Берем топ-20 признаков
            top_n = min(20, len(feature_names))

            plt.figure(figsize=(10, 8))
            plt.title(f"Top {top_n} Feature Importances ({model_type})")
            plt.bar(range(top_n), importances[indices[:top_n]], align='center')
            plt.xticks(range(top_n), [feature_names[i] for i in indices[:top_n]], rotation=90)
            plt.tight_layout()
            plt.show()

            # Выводим топ-10 признаков
            print(f"\nTop 10 features for {model_type}:")
            for i in range(min(10, len(feature_names))):
                print(f"{i+1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")
        else:
            print(f"Model {model_type} doesn't have feature_importances_ attribute")

# ============================================
# 4. ДЕМОНСТРАЦИЯ РАБОТЫ
# ============================================

def main():
    print("=" * 60)
    print("FEATURE COMBINATION RECOMMENDER SYSTEM DEMO")
    print("=" * 60)

    # Генерация данных
    print("\n1. Generating synthetic data...")
    user_df, item_df, interactions_df, sessions_df = generate_synthetic_data(
        n_users=200, n_items=100, n_interactions=1000
    )

    print(f"   Users: {len(user_df)}")
    print(f"   Items: {len(item_df)}")
    print(f"   Interactions: {len(interactions_df)}")
    print(f"   Sessions: {len(sessions_df)}")

    # Тестирование разных методов комбинации
    methods = ['concatenate', 'weighted', 'neural', 'attention']

    results = {}

    for method in methods:
        print(f"\n{'='*60}")
        print(f"Testing {method.upper()} feature combination method")
        print('='*60)

        # Создаем рекомендательную систему
        recommender = FeatureCombinationRecommender(combination_method=method)

        # Подготавливаем признаки
        features = recommender.prepare_features(
            user_df, item_df, interactions_df, sessions_df
        )

        print(f"\nSample of combined features ({method}):")
        print(features.head())

        # Обучаем модель
        model, performance = recommender.train_model(
            features, target_column='rating', model_type='random_forest'
        )

        results[method] = {
            'n_features': len(features.columns),
            'performance': performance,
            'feature_sample': list(features.columns[:10])
        }

        # Генерация рекомендаций
        print(f"\nGenerating recommendations with {method} method...")
        recommendations = recommender.recommend(
            user_id=1,
            item_ids=list(range(1, 21)),
            model_type='random_forest',
            top_k=5
        )

        print("Top 5 recommendations:")
        for item_id, score in recommendations:
            print(f"  Item {item_id}: score = {score:.4f}")

    # Сравнение методов
    print(f"\n{'='*60}")
    print("COMPARISON OF FEATURE COMBINATION METHODS")
    print('='*60)

    comparison_df = pd.DataFrame([
        {
            'Method': method,
            'Features': results[method]['n_features'],
            'Performance': results[method]['performance'],
            'Sample Features': ', '.join(results[method]['feature_sample'][:3])
        }
        for method in methods
    ])

    print(comparison_df.to_string(index=False))

    # Визуализация сравнения
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.bar(comparison_df['Method'], comparison_df['Features'], color='skyblue')
    plt.title('Number of Features by Method')
    plt.ylabel('Feature Count')
    plt.xticks(rotation=45)

    plt.subplot(1, 2, 2)
    plt.bar(comparison_df['Method'], comparison_df['Performance'], color='lightcoral')
    plt.title('Model Performance by Method')
    plt.ylabel('Performance (Higher is better)')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    # Дополнительный анализ важности признаков
    print(f"\n{'='*60}")
    print("FEATURE IMPORTANCE ANALYSIS")
    print('='*60)

    # Используем лучший метод для анализа
    best_method = comparison_df.loc[comparison_df['Performance'].idxmax(), 'Method']
    print(f"\nAnalyzing feature importance for best method: {best_method}")

    final_recommender = FeatureCombinationRecommender(combination_method=best_method)
    final_features = final_recommender.prepare_features(user_df, item_df, interactions_df, sessions_df)
    model, _ = final_recommender.train_model(final_features, model_type='random_forest')

    # Визуализация важности признаков
    final_recommender.plot_feature_importance(model_type='random_forest')

    # Пример использования для новых данных
    print(f"\n{'='*60}")
    print("PREDICTION EXAMPLE FOR NEW USER-ITEM PAIR")
    print('='*60)

    # Создаем пример нового пользователя и товара
    new_user_features = {
        'user_id': [999],
        'age': [30],
        'gender': ['M'],
        'location': ['NY'],
        'join_date': [pd.Timestamp.now()],
        'preferred_category': ['tech']
    }

    new_item_features = {
        'item_id': [999],
        'category': ['electronics'],
        'price': [299.99],
        'rating': [4.5],
        'reviews_count': [150],
        'description': ["new smartphone with advanced features"]
    }

    new_user_df = pd.DataFrame(new_user_features)
    new_item_df = pd.DataFrame(new_item_features)

    print("New user features:")
    print(new_user_df)
    print("\nNew item features:")
    print(new_item_df)

# Запуск демонстрации
if __name__ == "__main__":
    main()

All imports successful!
FEATURE COMBINATION RECOMMENDER SYSTEM DEMO

1. Generating synthetic data...
   Users: 200
   Items: 100
   Interactions: 1000
   Sessions: 556

Testing CONCATENATE feature combination method
Step 1: Extracting user features...
Step 2: Extracting item features...
Step 3: Extracting interaction features...


ValueError: Length of 'prefix' (4) did not match the length of the columns being encoded (1).

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Union
import warnings
warnings.filterwarnings('ignore')

print("All imports successful!")

# ============================================
# 1. ГЕНЕРАЦИЯ СИНТЕТИЧЕСКИХ ДАННЫХ (ИСПРАВЛЕННАЯ)
# ============================================

def generate_synthetic_data(n_users=1000, n_items=500, n_interactions=5000):
    """Генерация реалистичных синтетических данных"""

    np.random.seed(42)

    # Пользователи
    users = np.arange(1, n_users + 1)

    # Демографические признаки пользователей
    user_features = {
        'user_id': users,
        'age': np.random.randint(18, 65, n_users),
        'gender': np.random.choice(['M', 'F'], n_users, p=[0.55, 0.45]),
        'location': np.random.choice(['NY', 'LA', 'SF', 'CH', 'MI'], n_users),
        'join_date': pd.date_range('2020-01-01', periods=n_users, freq='D'),
        'preferred_category': np.random.choice(['tech', 'sports', 'music', 'movies', 'books'], n_users)
    }

    # Товары
    items = np.arange(1, n_items + 1)

    # Контентные признаки товаров
    categories = ['electronics', 'books', 'clothing', 'home', 'sports', 'beauty']
    item_features = {
        'item_id': items,
        'category': np.random.choice(categories, n_items),
        'price': np.random.uniform(5, 500, n_items).round(2),
        'rating': np.random.uniform(1, 5, n_items).round(1),
        'reviews_count': np.random.randint(0, 1000, n_items),
        'description': [f"product_{i} description with features and benefits" for i in items]
    }

    # Взаимодействия пользователь-товар
    interactions = []
    for _ in range(n_interactions):
        user = np.random.choice(users)
        item = np.random.choice(items)
        rating = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.2, 0.4, 0.25])
        timestamp = pd.Timestamp('2023-01-01') + pd.Timedelta(days=np.random.randint(0, 365))
        interactions.append([user, item, rating, timestamp])

    interactions_df = pd.DataFrame(interactions, columns=['user_id', 'item_id', 'rating', 'timestamp'])

    # Контекстуальные данные (сессии)
    sessions = []
    for _ in range(min(100, n_interactions // 10)):  # ограничим количество сессий
        session_id = np.random.randint(1000, 9999)
        user = np.random.choice(users)
        session_items = np.random.choice(items, size=np.random.randint(2, 10), replace=False)
        for item in session_items:
            sessions.append([session_id, user, item])

    # Создаем DataFrames
    user_df = pd.DataFrame(user_features)
    item_df = pd.DataFrame(item_features)
    sessions_df = pd.DataFrame(sessions, columns=['session_id', 'user_id', 'item_id']) if sessions else pd.DataFrame()

    # Добавляем дополнительные вычисляемые признаки
    # Популярность товара
    item_popularity = interactions_df.groupby('item_id').size().reset_index(name='popularity')
    item_df = item_df.merge(item_popularity, on='item_id', how='left').fillna(0)

    # Активность пользователя
    user_activity = interactions_df.groupby('user_id').size().reset_index(name='activity_count')
    user_df = user_df.merge(user_activity, on='user_id', how='left').fillna(0)

    return user_df, item_df, interactions_df, sessions_df

# ============================================
# 2. ФУНКЦИИ FEATURE EXTRACTION (ИСПРАВЛЕННЫЕ)
# ============================================

class FeatureExtractor:
    """Извлечение признаков из разных источников"""

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.tfidf_vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
        self.item_svd = TruncatedSVD(n_components=5)
        self.user_svd = TruncatedSVD(n_components=5)

    def extract_user_features(self, user_df: pd.DataFrame) -> pd.DataFrame:
        """Извлечение признаков пользователей"""

        # Копируем для безопасности
        user_features = user_df.copy()

        # 1. Демографические признаки (one-hot encoding)
        demo_features = pd.get_dummies(
            user_features[['gender', 'location', 'preferred_category']],
            prefix=['gender', 'loc', 'cat']
        )

        # 2. Числовые признаки (normalization)
        numeric_cols = ['age', 'activity_count']
        for col in numeric_cols:
            if col in user_features.columns:
                user_features[col] = user_features[col].astype(float)

        if numeric_cols and all(col in user_features.columns for col in numeric_cols):
            user_features[numeric_cols] = self.scaler.fit_transform(user_features[numeric_cols])

        # 3. Временные признаки из join_date
        if 'join_date' in user_features.columns:
            user_features['join_date'] = pd.to_datetime(user_features['join_date'])
            user_features['join_year'] = user_features['join_date'].dt.year
            user_features['join_month'] = user_features['join_date'].dt.month
            user_features['join_day'] = user_features['join_date'].dt.day
            user_features['days_since_join'] = (
                pd.Timestamp.now() - user_features['join_date']
            ).dt.days

        # 4. Объединяем все признаки пользователей
        numeric_features = ['user_id'] + numeric_cols
        if 'join_year' in user_features.columns:
            numeric_features.extend(['join_year', 'join_month', 'join_day', 'days_since_join'])

        final_features = pd.concat([
            user_features[numeric_features],
            demo_features
        ], axis=1)

        return final_features

    def extract_item_features(self, item_df: pd.DataFrame) -> pd.DataFrame:
        """Извлечение признаков товаров"""

        item_features = item_df.copy()

        # 1. Категориальные признаки (one-hot)
        cat_features = pd.get_dummies(
            item_features[['category']],
            prefix='cat'
        )

        # 2. Числовые признаки (normalization)
        numeric_cols = ['price', 'rating', 'reviews_count', 'popularity']
        numeric_cols = [col for col in numeric_cols if col in item_features.columns]

        for col in numeric_cols:
            item_features[col] = item_features[col].astype(float)

        if numeric_cols:
            item_features[numeric_cols] = self.scaler.fit_transform(item_features[numeric_cols])

        # 3. Текстовые признаки из описания (TF-IDF + SVD)
        if 'description' in item_features.columns:
            try:
                tfidf_matrix = self.tfidf_vectorizer.fit_transform(item_features['description'])
                svd_features = self.item_svd.fit_transform(tfidf_matrix)
                svd_cols = [f'item_svd_{i}' for i in range(svd_features.shape[1])]
                svd_df = pd.DataFrame(svd_features, columns=svd_cols)
            except:
                svd_df = pd.DataFrame()
        else:
            svd_df = pd.DataFrame()

        # 4. Объединяем все признаки товаров
        base_features = item_features[['item_id'] + numeric_cols] if numeric_cols else item_features[['item_id']]

        if not svd_df.empty:
            final_features = pd.concat([base_features, cat_features, svd_df], axis=1)
        else:
            final_features = pd.concat([base_features, cat_features], axis=1)

        return final_features

    def extract_interaction_features(self, interactions_df: pd.DataFrame,
                                    user_features: pd.DataFrame,
                                    item_features: pd.DataFrame) -> pd.DataFrame:
        """Извлечение признаков взаимодействий"""

        interactions = interactions_df.copy()

        # 1. Временные признаки
        if 'timestamp' in interactions.columns:
            interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])
            interactions['hour'] = interactions['timestamp'].dt.hour
            interactions['day_of_week'] = interactions['timestamp'].dt.dayofweek
            interactions['month'] = interactions['timestamp'].dt.month

        # 2. Признаки на основе рейтинга
        if 'rating' in interactions.columns:
            interactions['rating_bin'] = pd.cut(interactions['rating'],
                                               bins=[0, 2, 3, 4, 5],
                                               labels=['low', 'medium_low', 'medium_high', 'high'])

        # 3. One-hot encoding временных признаков (только для существующих колонок)
        time_cols_to_encode = []
        prefixes = []

        if 'hour' in interactions.columns:
            time_cols_to_encode.append('hour')
            prefixes.append('hour')
        if 'day_of_week' in interactions.columns:
            time_cols_to_encode.append('day_of_week')
            prefixes.append('dow')
        if 'month' in interactions.columns:
            time_cols_to_encode.append('month')
            prefixes.append('month')
        if 'rating_bin' in interactions.columns:
            time_cols_to_encode.append('rating_bin')
            prefixes.append('rating')

        if time_cols_to_encode:
            time_features = pd.get_dummies(
                interactions[time_cols_to_encode],
                prefix=prefixes if len(prefixes) == len(time_cols_to_encode) else None
            )
        else:
            time_features = pd.DataFrame()

        # 4. Объединяем с признаками пользователей и товаров
        combined = interactions.copy()

        if 'user_id' in user_features.columns and 'user_id' in combined.columns:
            combined = combined.merge(user_features, on='user_id', how='left')

        if 'item_id' in item_features.columns and 'item_id' in combined.columns:
            combined = combined.merge(item_features, on='item_id', how='left')

        # 5. Удаляем исходные столбцы
        columns_to_drop = ['timestamp', 'hour', 'day_of_week', 'month', 'rating_bin', 'rating']
        columns_to_drop = [col for col in columns_to_drop if col in combined.columns]

        if columns_to_drop:
            combined = combined.drop(columns=columns_to_drop, errors='ignore')

        # 6. Объединяем с временными признаками
        if not time_features.empty:
            combined = pd.concat([combined, time_features], axis=1)

        # Заполняем пропущенные значения
        combined = combined.fillna(0)

        return combined

    def extract_session_features(self, sessions_df: pd.DataFrame,
                               item_features: pd.DataFrame) -> pd.DataFrame:
        """Извлечение сессионных признаков"""

        if sessions_df.empty:
            return pd.DataFrame()

        sessions = sessions_df.copy()

        # 1. Агрегация на уровне сессии
        session_stats = sessions.groupby('session_id').agg({
            'item_id': ['count', lambda x: list(x)],
            'user_id': 'first'
        }).reset_index()

        session_stats.columns = ['session_id', 'session_length', 'item_sequence', 'user_id']

        # 2. Признаки сессии
        session_features = pd.DataFrame()
        session_features['session_id'] = session_stats['session_id']
        session_features['session_length'] = session_stats['session_length']

        # 3. Средние признаки товаров в сессии
        session_item_features = []
        for items in session_stats['item_sequence']:
            if item_features.empty or 'item_id' not in item_features.columns:
                session_item_features.append(np.zeros(5))  # fallback
                continue

            item_feats = item_features[item_features['item_id'].isin(items)]
            if len(item_feats) > 0:
                avg_features = item_feats.drop('item_id', axis=1).mean().values
            else:
                avg_features = np.zeros(item_features.shape[1] - 1)
            session_item_features.append(avg_features)

        # Создаем DataFrame с признаками товаров сессии
        if session_item_features:
            session_item_df = pd.DataFrame(
                session_item_features,
                columns=[f'session_item_{i}' for i in range(len(session_item_features[0]))]
            )

            # 4. Объединяем все сессионные признаки
            final_features = pd.concat([session_features, session_item_df], axis=1)
        else:
            final_features = session_features

        return final_features

# ============================================
# 3. КЛАСС ДЛЯ FEATURE COMBINATION (УПРОЩЕННЫЙ)
# ============================================

class FeatureCombinationRecommender:
    """
    Рекомендательная система с комбинацией признаков
    """

    def __init__(self, combination_method='concatenate'):
        self.combination_method = combination_method
        self.feature_extractor = FeatureExtractor()
        self.models = {}

    def prepare_features(self, user_df, item_df, interactions_df, sessions_df=None):
        """Подготовка и комбинация всех признаков"""

        print("Step 1: Extracting user features...")
        user_features = self.feature_extractor.extract_user_features(user_df)

        print("Step 2: Extracting item features...")
        item_features = self.feature_extractor.extract_item_features(item_df)

        print("Step 3: Extracting interaction features...")
        interaction_features = self.feature_extractor.extract_interaction_features(
            interactions_df, user_features, item_features
        )

        # Комбинируем признаки
        if self.combination_method == 'concatenate':
            combined_features = self._concatenate_features(
                interaction_features, user_features, item_features
            )
        elif self.combination_method == 'weighted':
            combined_features = self._weighted_combination(
                interaction_features, user_features, item_features
            )
        elif self.combination_method == 'attention':
            combined_features = self._attention_combination(
                interaction_features, user_features, item_features
            )
        else:
            raise ValueError(f"Unknown combination method: {self.combination_method}")

        # Добавляем сессионные признаки если есть
        if sessions_df is not None and not sessions_df.empty:
            print("Step 4: Extracting session features...")
            session_features = self.feature_extractor.extract_session_features(
                sessions_df, item_features
            )
            if not session_features.empty and 'user_id' in session_features.columns:
                combined_features = combined_features.merge(
                    session_features, on='user_id', how='left'
                ).fillna(0)

        # Сохраняем информацию о признаках
        self.feature_info = {
            'user_features': list(user_features.columns),
            'item_features': list(item_features.columns),
            'interaction_features': list(interaction_features.columns),
            'combined_features': list(combined_features.columns)
        }

        print(f"\nFeature extraction complete!")
        print(f"  User features: {len(user_features.columns)}")
        print(f"  Item features: {len(item_features.columns)}")
        print(f"  Interaction features: {len(interaction_features.columns)}")
        print(f"  Total combined features: {len(combined_features.columns)}")

        return combined_features

    def _concatenate_features(self, interaction_features, user_features, item_features):
        """Простая конкатенация всех признаков"""

        # Объединяем все признаки
        combined = interaction_features.copy()

        if 'user_id' in combined.columns:
            combined = combined.merge(user_features, on='user_id', how='left')

        if 'item_id' in combined.columns:
            combined = combined.merge(item_features, on='item_id', how='left')

        # Заполняем пропуски
        combined = combined.fillna(0)

        # Удаляем возможные дубликаты колонок
        combined = combined.loc[:, ~combined.columns.duplicated()]

        return combined

    def _weighted_combination(self, interaction_features, user_features, item_features):
        """Взвешенная комбинация признаков"""

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()

        # Получаем признаки отдельно
        interaction_only = interaction_features.drop(['user_id', 'item_id'], axis=1, errors='ignore')
        user_only = user_features.drop(['user_id'], axis=1, errors='ignore')
        item_only = item_features.drop(['item_id'], axis=1, errors='ignore')

        # Определяем веса
        interaction_weight = 0.4
        user_weight = 0.3
        item_weight = 0.3

        # Нормализуем признаки если они есть
        weighted_features = []

        for i in range(len(interaction_features)):
            user_id = interaction_features.iloc[i]['user_id']
            item_id = interaction_features.iloc[i]['item_id']

            # Получаем векторы признаков
            inter_vec = interaction_only.iloc[i].values if not interaction_only.empty else np.array([])

            user_vec = np.array([])
            if not user_only.empty and user_id in user_features['user_id'].values:
                user_idx = user_features[user_features['user_id'] == user_id].index
                if len(user_idx) > 0:
                    user_vec = user_only.iloc[user_idx[0]].values

            item_vec = np.array([])
            if not item_only.empty and item_id in item_features['item_id'].values:
                item_idx = item_features[item_features['item_id'] == item_id].index
                if len(item_idx) > 0:
                    item_vec = item_only.iloc[item_idx[0]].values

            # Нормализуем каждый вектор отдельно
            if len(inter_vec) > 0:
                inter_vec = scaler.fit_transform(inter_vec.reshape(1, -1)).flatten()
            if len(user_vec) > 0:
                user_vec = scaler.fit_transform(user_vec.reshape(1, -1)).flatten()
            if len(item_vec) > 0:
                item_vec = scaler.fit_transform(item_vec.reshape(1, -1)).flatten()

            # Взвешенная комбинация
            if len(inter_vec) > 0 and len(user_vec) > 0 and len(item_vec) > 0:
                # Приводим к одинаковой размерности (берем минимальную)
                min_len = min(len(inter_vec), len(user_vec), len(item_vec))
                combined = (
                    inter_vec[:min_len] * interaction_weight +
                    user_vec[:min_len] * user_weight +
                    item_vec[:min_len] * item_weight
                )
            else:
                # Используем доступные признаки
                available_vecs = []
                weights = []
                if len(inter_vec) > 0:
                    available_vecs.append(inter_vec)
                    weights.append(interaction_weight)
                if len(user_vec) > 0:
                    available_vecs.append(user_vec)
                    weights.append(user_weight)
                if len(item_vec) > 0:
                    available_vecs.append(item_vec)
                    weights.append(item_weight)

                if available_vecs:
                    # Нормализуем веса
                    total_weight = sum(weights)
                    weights = [w/total_weight for w in weights]

                    # Находим минимальную длину
                    min_len = min(len(v) for v in available_vecs)
                    combined = np.zeros(min_len)
                    for vec, weight in zip(available_vecs, weights):
                        combined += vec[:min_len] * weight
                else:
                    combined = np.array([0.5])  # fallback

            weighted_features.append(combined)

        # Создаем DataFrame
        max_len = max(len(f) for f in weighted_features)
        padded_features = []
        for f in weighted_features:
            if len(f) < max_len:
                f_padded = np.pad(f, (0, max_len - len(f)), mode='constant')
            else:
                f_padded = f[:max_len]
            padded_features.append(f_padded)

        weighted_df = pd.DataFrame(
            padded_features,
            columns=[f'weighted_feat_{i}' for i in range(max_len)]
        )

        # Добавляем ID
        weighted_df['user_id'] = interaction_features['user_id'].values
        weighted_df['item_id'] = interaction_features['item_id'].values

        return weighted_df

    def _attention_combination(self, interaction_features, user_features, item_features):
        """Комбинация с упрощенным механизмом внимания"""

        print("Using simplified attention-based feature combination...")

        # Для простоты используем конкатенацию
        combined = self._concatenate_features(interaction_features, user_features, item_features)

        # Вычисляем простые статистики для создания attention-like features
        numeric_cols = combined.select_dtypes(include=[np.number]).columns.tolist()

        if numeric_cols:
            # Вычисляем важность на основе стандартного отклонения
            combined['feature_variance'] = combined[numeric_cols].std(axis=1)
            combined['feature_mean'] = combined[numeric_cols].mean(axis=1)

            # Простой attention weight
            combined['attention_weight'] = (
                combined['feature_variance'] /
                (combined['feature_mean'].abs() + 1e-10)
            )

            # Нормализуем weight
            if combined['attention_weight'].std() > 0:
                combined['attention_weight'] = (
                    (combined['attention_weight'] - combined['attention_weight'].mean()) /
                    combined['attention_weight'].std()
                )

        return combined

    def train_model(self, features, target_column='rating', model_type='random_forest'):
        """Обучение модели на комбинированных признаках"""

        from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error, accuracy_score

        print(f"\nTraining {model_type} model on combined features...")

        # Подготовка данных
        X = features.copy()

        # Удаляем ID колонки
        id_cols = ['user_id', 'item_id', 'session_id']
        id_cols = [col for col in id_cols if col in X.columns]

        if id_cols:
            X = X.drop(columns=id_cols)

        # Проверяем наличие целевой переменной
        if target_column in features.columns:
            y = features[target_column]
            is_classification = False
        else:
            # Создаем бинарную целевую переменную
            y = np.ones(len(features))
            is_classification = True

        # Проверяем что есть данные для обучения
        if len(X) == 0 or len(y) == 0:
            print("Warning: No data for training!")
            return None, 0

        # Разделение на train/test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y if is_classification else None
        )

        # Выбор модели
        if model_type == 'random_forest':
            if is_classification:
                model = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=5)
                y_train = (y_train > 0).astype(int)
                y_test = (y_test > 0).astype(int)
            else:
                model = RandomForestRegressor(n_estimators=50, random_state=42, max_depth=5)
        elif model_type == 'logistic':
            model = LogisticRegression(max_iter=500, random_state=42)
            y_train = (y_train > 3).astype(int) if not is_classification else y_train
            y_test = (y_test > 3).astype(int) if not is_classification else y_test
        else:
            # Fallback to random forest
            model = RandomForestRegressor(n_estimators=50, random_state=42, max_depth=5)

        # Обучение
        try:
            model.fit(X_train, y_train)

            # Предсказание и оценка
            y_pred = model.predict(X_test)

            if is_classification or model_type == 'logistic':
                accuracy = accuracy_score(y_test, y_pred)
                print(f"Accuracy: {accuracy:.4f}")
                metric = accuracy
            else:
                mse = mean_squared_error(y_test, y_pred)
                print(f"MSE: {mse:.4f}")
                metric = mse

            # Сохраняем модель
            self.models[model_type] = {
                'model': model,
                'feature_names': list(X.columns),
                'performance': metric
            }

            return model, metric

        except Exception as e:
            print(f"Error training model: {e}")
            return None, 0

    def plot_feature_importance(self, model_type='random_forest'):
        """Визуализация важности признаков"""

        if model_type not in self.models:
            print(f"Model {model_type} not trained yet!")
            return

        model = self.models[model_type]['model']
        feature_names = self.models[model_type]['feature_names']

        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_

            if len(importances) == 0:
                print("No feature importances available")
                return

            # Сортируем по важности
            indices = np.argsort(importances)[::-1]

            # Берем топ-15 признаков
            top_n = min(15, len(feature_names), len(importances))

            plt.figure(figsize=(10, 6))
            plt.title(f"Top {top_n} Feature Importances ({model_type})")
            bars = plt.bar(range(top_n), importances[indices[:top_n]], align='center', color='skyblue')
            plt.xticks(range(top_n), [feature_names[i] for i in indices[:top_n]], rotation=45, ha='right')
            plt.xlabel('Features')
            plt.ylabel('Importance')

            # Добавляем значения на бары
            for bar, importance in zip(bars, importances[indices[:top_n]]):
                plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                        f'{importance:.3f}', ha='center', va='bottom', fontsize=8)

            plt.tight_layout()
            plt.show()

            # Выводим топ-5 признаков
            print(f"\nTop 5 features for {model_type}:")
            for i in range(min(5, len(feature_names), len(importances))):
                print(f"{i+1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")
        else:
            print(f"Model {model_type} doesn't have feature_importances_ attribute")

# ============================================
# 4. ДЕМОНСТРАЦИЯ РАБОТЫ
# ============================================

def main():
    print("=" * 60)
    print("FEATURE COMBINATION RECOMMENDER SYSTEM DEMO")
    print("=" * 60)

    # Генерация данных
    print("\n1. Generating synthetic data...")
    user_df, item_df, interactions_df, sessions_df = generate_synthetic_data(
        n_users=100, n_items=50, n_interactions=500
    )

    print(f"   Users: {len(user_df)}")
    print(f"   Items: {len(item_df)}")
    print(f"   Interactions: {len(interactions_df)}")
    print(f"   Sessions: {len(sessions_df)}")

    # Покажем данные
    print("\nSample user data:")
    print(user_df.head())

    print("\nSample item data:")
    print(item_df.head())

    print("\nSample interactions:")
    print(interactions_df.head())

    # Тестирование методов комбинации
    methods = ['concatenate', 'weighted', 'attention']

    results = {}

    for method in methods:
        print(f"\n{'='*60}")
        print(f"Testing {method.upper()} feature combination method")
        print('='*60)

        # Создаем рекомендательную систему
        recommender = FeatureCombinationRecommender(combination_method=method)

        # Подготавливаем признаки
        features = recommender.prepare_features(
            user_df, item_df, interactions_df, sessions_df
        )

        print(f"\nSample of combined features ({method}):")
        print(features.head(3))
        print(f"Shape: {features.shape}")

        # Обучаем модель
        if len(features) > 10:  # Минимум данных для обучения
            model, performance = recommender.train_model(
                features, target_column='rating', model_type='random_forest'
            )

            if model is not None:
                results[method] = {
                    'n_features': len(features.columns),
                    'performance': performance,
                    'n_samples': len(features),
                    'feature_sample': list(features.columns[:5])
                }

                # Визуализация важности признаков
                recommender.plot_feature_importance(model_type='random_forest')

    # Сравнение методов
    if results:
        print(f"\n{'='*60}")
        print("COMPARISON OF FEATURE COMBINATION METHODS")
        print('='*60)

        comparison_data = []
        for method in methods:
            if method in results:
                comparison_data.append({
                    'Method': method,
                    'Features': results[method]['n_features'],
                    'Performance': f"{results[method]['performance']:.4f}",
                    'Samples': results[method]['n_samples'],
                    'Sample Features': ', '.join(results[method]['feature_sample'])
                })

        if comparison_data:
            comparison_df = pd.DataFrame(comparison_data)
            print(comparison_df.to_string(index=False))

            # Визуализация сравнения
            fig, axes = plt.subplots(1, 3, figsize=(15, 5))

            methods_list = [d['Method'] for d in comparison_data]
            features_list = [d['Features'] for d in comparison_data]
            performance_list = [float(d['Performance']) for d in comparison_data]
            samples_list = [d['Samples'] for d in comparison_data]

            axes[0].bar(methods_list, features_list, color='skyblue')
            axes[0].set_title('Number of Features')
            axes[0].set_ylabel('Count')

            axes[1].bar(methods_list, performance_list, color='lightcoral')
            axes[1].set_title('Model Performance')
            axes[1].set_ylabel('Score')

            axes[2].bar(methods_list, samples_list, color='lightgreen')
            axes[2].set_title('Number of Samples')
            axes[2].set_ylabel('Count')

            for ax in axes:
                ax.tick_params(axis='x', rotation=45)

            plt.tight_layout()
            plt.show()

    print(f"\n{'='*60}")
    print("DEMO COMPLETE!")
    print("=" * 60)

# Запуск демонстрации
if __name__ == "__main__":
    main()

All imports successful!
FEATURE COMBINATION RECOMMENDER SYSTEM DEMO

1. Generating synthetic data...
   Users: 100
   Items: 50
   Interactions: 500
   Sessions: 267

Sample user data:
   user_id  age gender location  join_date preferred_category  activity_count
0        1   56      M       LA 2020-01-01               tech             2.0
1        2   46      M       SF 2020-01-02             sports             6.0
2        3   32      M       SF 2020-01-03             sports             4.0
3        4   60      F       CH 2020-01-04              music             7.0
4        5   25      F       CH 2020-01-05             sports            10.0

Sample item data:
   item_id     category   price  rating  reviews_count  \
0        1         home  343.09     3.0            148   
1        2  electronics   40.24     1.2             79   
2        3       beauty  162.89     3.2            885   
3        4       beauty  423.21     2.8            212   
4        5       sports   16.52     4.

ValueError: Length of 'prefix' (4) did not match the length of the columns being encoded (1).