In [1]:
import pandas as pd
import dill
import re
import logging

# Configure root logging once for the notebook: INFO and above, rendered as
# "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Shared handle to the root logger used by the cells below.
logger = logging.getLogger()

class ModelInferenceDemo:
    """Emulates model inference over a CSV file, filtered by ``user_id``.

    The serialized pipeline is loaded once at construction time.
    ``predict_for_user`` then reads a CSV, keeps the rows of one user, strips
    the columns that must not reach the model and attaches the predictions
    (plus class probabilities when available) to a copy of those rows.
    """

    # Columns removed before inference when the caller does not pass a list.
    DEFAULT_COLUMNS_TO_DROP = (
        'user_id', 'course_id', 'real_course_progress', 'course_success'
    )
    # Matches per-week columns such as ``risk_status_7_week``; compiled once
    # instead of on every call.
    _WEEK_STATUS_PATTERN = re.compile(r'^risk_status_\d+_week$')

    def __init__(self, model_path, columns_to_drop=None):
        """
        Load the pipeline and remember which columns to strip.

        model_path: path to the serialized pipeline (dill pkl).
        columns_to_drop: iterable of column names to remove before inference.
            ``None`` selects ``DEFAULT_COLUMNS_TO_DROP``; an explicit empty
            list means "drop nothing extra" (it no longer silently falls back
            to the defaults).
        """
        self.model_path = model_path
        self.pipeline = self._load_pipeline(model_path)
        if columns_to_drop is None:
            columns_to_drop = self.DEFAULT_COLUMNS_TO_DROP
        # Copy so later mutation of the caller's list cannot change this object.
        self.columns_to_drop = list(columns_to_drop)

    def _load_pipeline(self, path):
        """Load and return the model pipeline stored in the file at ``path``."""
        # SECURITY: dill (like pickle) can execute arbitrary code while
        # deserializing. Only load model files from a trusted source.
        with open(path, 'rb') as f:
            logger.info(f"Загружаю модель из {path}")
            return dill.load(f)

    def _drop_unused_columns(self, df):
        """
        Return ``df`` without the columns that must not be fed to the model.

        Removed, in this order: every ``risk_status_{n}_week`` column, then
        ``risk_status`` and ``week``, then the names in ``self.columns_to_drop``.
        Names absent from ``df`` are skipped. ``df`` itself is not modified.
        """
        # Non-string labels cannot match the pattern; skip them instead of
        # letting ``re`` raise TypeError.
        cols_to_drop = [
            col for col in df.columns
            if isinstance(col, str) and self._WEEK_STATUS_PATTERN.match(col)
        ]
        for col in ('risk_status', 'week', *self.columns_to_drop):
            if col in df.columns and col not in cols_to_drop:
                cols_to_drop.append(col)
        if cols_to_drop:
            logger.info(f"Удаляю столбцы: {cols_to_drop}")
        return df.drop(columns=cols_to_drop, errors='ignore')

    def predict_for_user(self, csv_path, user_id):
        """
        Run inference for every row of the CSV whose ``user_id`` matches.

        csv_path: path of the CSV file to read; it must have a ``user_id`` column.
        user_id: value compared with ``==`` against the ``user_id`` column.

        Returns a copy of the matching rows with a ``predicted_risk_status``
        column and, when the pipeline can produce them, one ``proba_<class>``
        column per class. Returns ``None`` when no row matches or when no
        feature columns remain after dropping. A short summary of the result
        is also displayed.

        Raises KeyError if the CSV has no ``user_id`` column.
        """
        df = pd.read_csv(csv_path)
        logger.info(f"Загружено {df.shape[0]} строк, {df.shape[1]} столбцов.")
        df_user = df[df['user_id'] == user_id]
        if df_user.empty:
            logger.warning(f"user_id {user_id} не найден в датасете.")
            return None
        logger.info(f"Найдено {df_user.shape[0]} строк для user_id={user_id}")

        X_input = self._drop_unused_columns(df_user)
        if X_input.empty:
            logger.warning("Нет признаков для инференса после удаления столбцов.")
            return None

        y_pred = self.pipeline.predict(X_input)
        y_pred_proba = None
        if hasattr(self.pipeline, 'predict_proba'):
            try:
                y_pred_proba = self.pipeline.predict_proba(X_input)
            except Exception as exc:
                # Probabilities are optional: keep the best-effort behaviour,
                # but record why they are missing instead of hiding the error.
                logger.warning(
                    f"predict_proba завершился с ошибкой, вероятности не будут добавлены: {exc}"
                )
                y_pred_proba = None

        result_df = df_user.copy()
        result_df['predicted_risk_status'] = y_pred
        if y_pred_proba is not None:
            # Fall back to positional labels when the pipeline exposes
            # predict_proba but no ``classes_`` attribute.
            class_labels = getattr(self.pipeline, 'classes_', None)
            if class_labels is None:
                class_labels = range(y_pred_proba.shape[1])
            for i, class_label in enumerate(class_labels):
                result_df[f'proba_{class_label}'] = y_pred_proba[:, i]

        display_cols = ['user_id']
        if 'week' in result_df.columns:
            display_cols.append('week')
        display_cols.append('predicted_risk_status')
        proba_cols = [
            col for col in result_df.columns
            if isinstance(col, str) and col.startswith('proba_')
        ]

        logger.info("Пример результата инференса:")
        # ``display`` is only available inside IPython/Jupyter sessions; use
        # plain ``print`` elsewhere so the method does not end in NameError.
        try:
            show = display
        except NameError:
            show = print
        show(result_df[display_cols + proba_cols])
        return result_df


In [2]:
# Inputs for the demo run: serialized pipeline, source dataset, user to score.
MODEL_PATH = './saved_models/model.pkl'  # path to the model
CSV_PATH = './original_dataset/13_week_bas.csv'
USER_ID = 35916

demo = ModelInferenceDemo(MODEL_PATH)
result = demo.predict_for_user(csv_path=CSV_PATH, user_id=USER_ID)

# predict_for_user returns None when there is nothing to show for this user.
if result is None:
    print(f'Нет данных для user_id={USER_ID}')
else:
    display(result)


2025-07-04 21:26:10,889 - INFO - Загружаю модель из ./saved_models/model.pkl
2025-07-04 21:26:21,122 - INFO - Загружено 2157 строк, 277 столбцов.
2025-07-04 21:26:21,124 - INFO - Найдено 1 строк для user_id=35916
2025-07-04 21:26:21,124 - INFO - Удаляю столбцы: ['risk_status_1_week', 'risk_status_2_week', 'risk_status_3_week', 'risk_status_4_week', 'risk_status_5_week', 'risk_status_6_week', 'risk_status_7_week', 'risk_status_8_week', 'risk_status_9_week', 'risk_status_10_week', 'risk_status_11_week', 'risk_status_12_week', 'risk_status_13_week', 'user_id', 'course_id', 'real_course_progress', 'course_success']
2025-07-04 21:26:21,308 - INFO - Пример результата инференса:


Unnamed: 0,user_id,predicted_risk_status,proba_0,proba_1,proba_2,proba_3,proba_4
163,35916,4,0.115624,0.17008,0.083059,0.311003,0.320234


Unnamed: 0,user_id,course_id,all_activities_delay_1_week,all_activities_delay_2_week,all_activities_delay_3_week,all_activities_delay_4_week,all_activities_delay_5_week,all_activities_delay_6_week,all_activities_delay_7_week,all_activities_delay_8_week,...,risk_status_12_week,risk_status_13_week,real_course_progress,course_success,predicted_risk_status,proba_0,proba_1,proba_2,proba_3,proba_4
163,35916,84,7.0,3.305556,6.139535,8.928058,11.056122,17.035533,22.964467,28.497462,...,3,3,38,0,4,0.115624,0.17008,0.083059,0.311003,0.320234
