# Инициализация данных

In [1]:
import hashlib
import os
from datetime import datetime
from typing import List

import pandas as pd
from catboost import CatBoostClassifier
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel, ConfigDict

# Load settings from a .env file (python-dotenv).
# NOTE(review): presumably this is where POSTGRES_CONN / IS_LMS, read further
# down via os.environ, come from -- confirm against the deployment setup.
load_dotenv()

True

In [2]:
# String appended to the user id before hashing in get_exp_group.
salt = 'get_exp_group'
# Modulus applied to the hash in get_exp_group, i.e. the number of buckets.
num_groups = 2

In [3]:
# Split users into experiment groups
def get_exp_group(user_id: int) -> str:
    """Deterministically map a user id to the 'control' or 'test' group.

    The id is converted to a string, the module-level ``salt`` is appended,
    and the MD5 hex digest of the result is read as a base-16 integer and
    reduced modulo the module-level ``num_groups``. Bucket 0 is 'control';
    any other bucket is 'test'.
    """
    salted_key = (str(user_id) + salt).encode()
    bucket = int(hashlib.md5(salted_key).hexdigest(), 16) % num_groups
    return 'control' if bucket == 0 else 'test'

In [4]:
# Spot check of the split; the recorded output for user 200 is 'control'.
get_exp_group(200)

'control'

In [5]:
class PostGet(BaseModel):
    """A single recommended post as returned to the client."""

    # Post identifier.
    id: int
    # Post text.
    text: str
    # Post topic.
    topic: str

    # pydantic v2 configuration: `from_attributes` is the v2 name of the
    # former `orm_mode` flag, and `model_config` replaces the deprecated
    # class-based `Config`, which is what triggered the rename warning.
    model_config = ConfigDict(from_attributes=True)

class Response(BaseModel):
    """Response payload: the user's experiment group plus the recommended posts."""
    # Experiment group label; the notebook passes the value from get_exp_group.
    exp_group: str
    # Recommended posts.
    recommendations: List[PostGet]

* 'orm_mode' has been renamed to 'from_attributes'


In [6]:
# Model path
def get_model_path(model_name: str) -> str:
    """Return the filesystem path of the model registered as *model_name*.

    When the ``IS_LMS`` environment variable equals ``"1"`` the paths under
    ``/workdir/user_input`` are used; otherwise the hard-coded local paths
    are used.

    Raises:
        ValueError: if *model_name* is neither 'model_control' nor
            'model_test'.
    """
    if os.environ.get("IS_LMS") == "1":
        known_paths = {
            'model_control': '/workdir/user_input/model_control',
            'model_test': '/workdir/user_input/model_test',
        }
    else:
        known_paths = {
            'model_control': '/Users/dmitry/Documents/code/Start_ML/module_2/final_project/model/catboost_ml_1',
            'model_test': '/Users/dmitry/Documents/code/Start_ML/module_2/final_project/model/catboost_dl_3',
        }

    try:
        return known_paths[model_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which the equality-based
        # checks would also have rejected as unknown.
        raise ValueError('unknown name') from None


# Model loading
def load_models(model_name: str):
    """Load the CatBoost classifier registered as *model_name*.

    The file location comes from ``get_model_path`` and the file is read in
    'cbm' format. Returns the value produced by
    ``CatBoostClassifier.load_model``.
    """
    cbm_file = get_model_path(model_name)
    return CatBoostClassifier().load_model(cbm_file, format='cbm')

# Main routine for loading features from the database
def load_features():
    """Read the three feature tables from the database.

    Every table is fetched with ``pd.read_sql`` using the connection taken
    from the ``POSTGRES_CONN`` environment variable.

    Returns:
        list: ``[post features (TF-IDF), post features (DL), user features]``,
        in that order.
    """
    # Engineered post features (TF-IDF)
    tfidf_posts_query = """
        SELECT *
        FROM d_trubitsin_post_feachures_base 
        """
    # Engineered post features (DL)
    dl_posts_query = """
        SELECT *
        FROM d_trubitsin_post_feachures_dl 
        """
    # User features
    users_query = """
        SELECT *
        FROM public.user_data
        """

    sources = (
        ("loading tf-idf post feachures", tfidf_posts_query),
        ("loading dl post feachures", dl_posts_query),
        ("loading user feachures", users_query),
    )

    tables = []
    # Plain loop (not a comprehension) so the log records keep this
    # function as their origin.
    for announcement, query in sources:
        logger.info(announcement)
        tables.append(pd.read_sql(query, con=os.environ["POSTGRES_CONN"]))
    return tables

In [7]:
# Load both classifiers once, up front; the names are the keys that
# get_model_path knows how to resolve.
logger.info("loading model")
model_control = load_models("model_control")
model_test = load_models("model_test")

[32m2024-04-17 21:12:21.147[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mloading model[0m


In [10]:
# Pull the feature tables into memory once; `feachures` is the list returned
# by load_features and is indexed positionally by the cells below.
logger.info("loading feachures")
feachures = load_features()
logger.info("service is up and running")

[32m2024-04-17 21:12:29.236[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mloading feachures[0m
[32m2024-04-17 21:12:29.237[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_features[0m:[36m32[0m - [1mloading tf-idf post feachures[0m
[32m2024-04-17 21:12:33.385[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_features[0m:[36m42[0m - [1mloading dl post feachures[0m
[32m2024-04-17 21:12:37.097[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_features[0m:[36m52[0m - [1mloading user feachures[0m
[32m2024-04-17 21:12:41.350[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mservice is up and running[0m


# Работа endpoint

In [11]:
# Sample request parameters used by the cells below.
# NOTE(review): `id` shadows the builtin of the same name and `time` shares
# its name with the stdlib module; the later cells reference these names.
id = 2001
time = datetime(year=2021, month=11, day=3, hour=14)
# Number of posts to recommend.
limit = 5

In [12]:
def get_recommendations(id, time, model, post_feachures, user_feachures, limit=5):
    """Score every post for one user and return the highest-scoring ones.

    Args:
        id: User identifier, matched against the ``user_id`` column of
            *user_feachures*.
        time: Object exposing ``weekday()`` and ``hour`` (the notebook passes
            a ``datetime``).
        model: Classifier exposing ``predict_proba``.
        post_feachures: DataFrame of post features; must contain the
            ``index``, ``text``, ``post_id`` and ``topic`` columns.
        user_feachures: DataFrame of user features; must contain the
            ``user_id``, ``os`` and ``source`` columns.
        limit: Maximum number of posts to return. A value of zero or less
            yields an empty list.

    Returns:
        list[PostGet]: Up to *limit* posts, ordered by ascending predicted
        probability (the best-scoring post is last).

    Raises:
        IndexError: If *id* is not present in *user_feachures*.
    """
    # User features
    logger.info(f"user_id: {id}")
    logger.info("reading feachures")
    user_feach = user_feachures.loc[user_feachures["user_id"] == id]
    if user_feach.empty:
        # Same exception type the bare `values[0]` lookup used to raise,
        # but with a message that names the actual problem.
        raise IndexError(f"user_id {id} not found in user features")
    user_feach = user_feach.drop(['user_id', 'os', 'source'], axis=1)

    # A slice such as [-0:] would select every post, so handle it up front.
    if limit <= 0:
        return []

    # Post features
    logger.info("dropping columns")
    post_feach = post_feachures.drop(["index", "text"], axis=1)
    # Index the displayable columns by post_id once (first row wins on
    # duplicates) instead of scanning the table for every recommended post.
    content = (
        post_feachures[["post_id", "text", "topic"]]
        .drop_duplicates("post_id")
        .set_index("post_id")
    )

    # Combine features: broadcast the single user's values onto every post row
    logger.info("zipping everything")
    add_user_feachures = dict(
        zip(user_feach.columns, user_feach.values[0]))
    request_data = post_feach.assign(**add_user_feachures)
    request_data = request_data.set_index('post_id')

    # Add date information
    logger.info("adding time info")
    request_data['weekday'] = time.weekday()
    request_data['hour'] = time.hour

    # Predicted probability of class 1 (the user most likely likes the post)
    logger.info("predicting")
    request_data['prediction'] = model.predict_proba(request_data)[:, 1]

    # Post ids with the highest predicted probabilities
    top_ids = request_data.sort_values("prediction").tail(limit).index
    top_content = content.loc[top_ids]

    return [
        PostGet(id=post_id, text=text, topic=topic)
        for post_id, text, topic in zip(
            top_ids, top_content["text"], top_content["topic"])
    ]


In [15]:
def recommended_posts(id, time):
    """Return the A/B-tested recommendations for a user as a JSON string.

    The user is assigned to an experiment group with ``get_exp_group``; the
    'control' group is served by ``model_control`` with ``feachures[0]`` as
    post features, the 'test' group by ``model_test`` with ``feachures[1]``.
    ``feachures[2]`` supplies the user features in both cases.

    Raises:
        ValueError: if the group is neither 'control' nor 'test'.
    """
    # Assign the user to a group
    logger.info('getting exp group')
    exp_group = get_exp_group(id)
    logger.info(f"User '{id}' assigned to '{exp_group}'")

    # Pick the model and the post-feature table for that group
    if exp_group == 'control':
        chosen_model, post_table = model_control, feachures[0]
    elif exp_group == 'test':
        chosen_model, post_table = model_test, feachures[1]
    else:
        raise ValueError('unknown group')

    payload = Response(
        exp_group=exp_group,
        recommendations=get_recommendations(
            id, time, chosen_model, post_table, feachures[2]),
    )
    return payload.model_dump_json()

In [16]:
# Run the handler for the sample user and time; the recorded output is a
# JSON string containing "exp_group" and "recommendations".
recommended_posts(id, time)

[32m2024-04-17 21:13:28.150[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m3[0m - [1mgetting exp group[0m
[32m2024-04-17 21:13:28.151[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m5[0m - [1mUser '2001' assigned to 'control'[0m
[32m2024-04-17 21:13:28.153[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_recommendations[0m:[36m4[0m - [1muser_id: 2001[0m
[32m2024-04-17 21:13:28.154[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_recommendations[0m:[36m5[0m - [1mreading feachures[0m
[32m2024-04-17 21:13:28.156[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_recommendations[0m:[36m10[0m - [1mdropping columns[0m
[32m2024-04-17 21:13:28.159[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_recommendations[0m:[36m15[0m - [1mzipping everything[0m
[32m2024-04-17 21:13:28.165[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_recommendations[0m:[36m22[0m - [1madding time info[0m
[32m2024-

'{"exp_group":"control","recommendations":[{"id":6713,"text":"The Seven-Ups is a good and engrossing film. Its packed with credible performances by Scheider, LaBianco and an effective scary performance by Richard Lynch - although most of the characters are card-board cut-out tough guys. Character development does not evolve at all on the screen. The only thing we know is the good guys are the good guys and the bad guys are bad. Deviating from the crime story norm, The Seven-Ups manage to throw Scheider and crew into the middle of a building plot in a unique writing twist. Onsite locations of New York City and an excellent choreographed car chase highlight the film. The only downside of the film is the slightly confusing plot line in the beginning. They give the viewer little evidence that the men being kidnapped are mob related (until later in the film). Had someone blindly started watching the film may be slightly confused on the story. Otherwise, The Seven-Ups is a gritty, testostero

In [122]:
def recommended_posts(
        id: int,
        time: datetime,
        limit: int = 5):
    """Recommend up to *limit* posts that the user has not liked yet.

    Relies on the module-level ``feachures`` list and a module-level
    ``model``.

    NOTE(review): this cell reads ``feachures[0]`` as a likes table
    (``user_id``, ``post_id``), ``feachures[1]`` as post features and
    ``feachures[2]`` as user features, and ``model`` is not defined in the
    visible part of the notebook -- confirm both before reusing this cell.
    NOTE(review): this definition rebinds the name ``recommended_posts``
    that an earlier cell also defines.

    Args:
        id: User identifier, matched against the ``user_id`` columns.
        time: Request time; its weekday and hour are added as features.
        limit: Maximum number of posts to return.

    Returns:
        list[dict]: Records with keys ``id``, ``text`` and ``topic``, ordered
        by ascending predicted probability (the best-scoring post is last).
    """
    # User features
    logger.info(f"user_id: {id}")
    logger.info("reading feachures")
    user_feachures = feachures[2].loc[feachures[2]["user_id"] == id].copy()
    user_feachures.drop('user_id', axis=1, inplace=True)

    # Post features
    logger.info("dropping columns")
    post_feachures = feachures[1].drop(["index", "text"], axis=1)
    content = feachures[1][["post_id", "text", "topic"]]

    # Combine features: every post row paired with the user's row
    logger.info("zipping everything")
    request_data = pd.merge(
        user_feachures, post_feachures, how='cross')
    request_data = request_data.set_index("post_id")

    # Add date information
    logger.info("adding time info")
    request_data['weekday'] = time.weekday()
    request_data['hour'] = time.hour

    # Predicted probability of class 1 (the user most likely likes the post)
    logger.info("predicting")
    request_data['prediction'] = model.predict_proba(request_data)[:, 1]

    # Remove posts the user has already liked
    logger.info("deleting liked posts")
    liked_posts = feachures[0]
    liked_posts = liked_posts[liked_posts["user_id"] == id]["post_id"].values
    filtered_ = request_data[~request_data.index.isin(liked_posts)]

    # Post ids with the highest predicted probabilities. `tail` (unlike a
    # [-limit:] slice) returns nothing when limit is 0.
    top_ids = filtered_.sort_values("prediction").tail(limit).index

    # Look the posts up by post_id label: the ids are not row positions, so
    # positional `iloc` would pick the wrong rows or run out of range.
    response = (
        content.drop_duplicates("post_id")
        .set_index("post_id")
        .loc[top_ids]
        .reset_index()
    )

    # Format the output according to the response template
    response = response.rename(columns={'post_id': 'id'})
    return response.to_dict(orient='records')

In [123]:
# Run the handler defined in the cell above; the recorded run logged
# "reading feachures" and then ended with
# "IndexError: list index out of range".
recommended_posts(id, time)

[32m2024-04-16 18:09:42.033[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m7[0m - [1muser_id: 200[0m
[32m2024-04-16 18:09:42.034[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m8[0m - [1mreading feachures[0m


IndexError: list index out of range