# Инициализация данных

In [60]:
import os
import pandas as pd
import numpy as np
import catboost

from datetime import datetime
from catboost import CatBoostClassifier
from loguru import logger

from dotenv import load_dotenv


load_dotenv()

True

In [61]:
# Путь модели
def get_model_path(path: str) -> str:
    if os.environ.get("IS_LMS") == "1":
        MODEL_PATH = '/workdir/user_input/model'
    else:
        MODEL_PATH = path
    return MODEL_PATH


# Загрузка модели
def load_models():
    model_path = get_model_path(
        "/Users/dmitry/Documents/code/Start_ML/module_2/final_project/model/catboost_1_1")

    from_file = CatBoostClassifier()
    model = from_file.load_model(model_path, format='cbm')
    return model


# Основная функция загрузки признаков из БД
def load_features():
    # Признаки по постам (созданные)
    logger.info("loading post feachures")
    post_feachures = pd.read_sql(
        """
        SELECT *
        FROM d_trubitsin_post_feachures_base 
        """,
        con=os.environ["POSTGRES_CONN"]
    )

    # Признаки по пользователям
    logger.info("loading user feachures")
    user_feachures = pd.read_sql(
        """
        SELECT *
        FROM public.user_data
        """,
        con=os.environ["POSTGRES_CONN"]
    )
    return [post_feachures, user_feachures]

In [62]:
logger.info("loading model")
model = load_models()

[32m2024-04-14 20:40:35.908[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mloading model[0m


In [63]:
logger.info("loading feachures")
feachures = load_features()
logger.info("service is up and running")

[32m2024-04-14 20:40:36.837[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mloading feachures[0m
[32m2024-04-14 20:40:36.838[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_features[0m:[36m23[0m - [1mloading post feachures[0m
[32m2024-04-14 20:40:43.007[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_features[0m:[36m33[0m - [1mloading user feachures[0m
[32m2024-04-14 20:40:50.218[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mservice is up and running[0m


# Работа endpoint

In [64]:
id = 200
time = datetime(year=2021, month=11, day=3, hour=14)
limit = 5

In [65]:
# Признаки пользователя
logger.info(f"user_id: {id}")
logger.info("reading feachures")
user_feachures = feachures[1].loc[feachures[1]["user_id"] == id].copy()
user_feachures.drop('user_id', axis=1, inplace=True)

# Признаки постов
logger.info("dropping columns")
post_feachures = feachures[0].drop(["index", "text"], axis=1)
content = feachures[0][["post_id", "text", "topic"]]

# Объединение признаков
logger.info("zipping everything")
add_user_feachures = dict(
    zip(user_feachures.columns, user_feachures.values[0]))
request_data = post_feachures.assign(**add_user_feachures)
request_data = request_data.set_index('post_id')

# Добавление даты
logger.info("adding time info")
request_data['weekday'] = time.weekday()
request_data['hour'] = time.hour

# Предсказание вероятности для 1 класса (вероятнее всего понравится)    logger.info("predicting")
probabilities = model.predict_proba(request_data)[:, 1]
request_data['prediction'] = probabilities

# Получение топ-5 индексов вероятностей
recommended_posts = request_data.sort_values("prediction")[-limit:].index

[dict(**{
    'id': i,
    'text': content[content['post_id'] == i]['text'].values[0],
    'topic': content[content['post_id'] == i]['topic'].values[0]
}) for i in recommended_posts
]

[32m2024-04-14 20:40:50.241[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1muser_id: 200[0m
[32m2024-04-14 20:40:50.243[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mreading feachures[0m
[32m2024-04-14 20:40:50.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mdropping columns[0m
[32m2024-04-14 20:40:50.256[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mzipping everything[0m
[32m2024-04-14 20:40:50.270[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1madding time info[0m


[{'id': 315,
  'text': 'US trade gap hits record in 2004\n\nThe gap between US exports and imports hit an all-time high of $671.7bn (£484bn) in 2004, latest figures show.\n\nThe Commerce Department said the trade deficit for all of last year was 24.4% above the previous record - 2003s imbalance of $496.5bn. The deficit with China, up 30.5% at $162bn, was the largest ever recorded with a single country. However, on a monthly basis the US trade gap narrowed by 4.9% in December to £56.4bn. The US consumers appetite for all things from oil to imported cars, and even wine and cheese, reached record levels last year and the figures are likely to spark fresh criticism of President Bushs economic policies.\n\nDemocrats claim the administration has not done enough to clamp down on unfair foreign trade practices. For example, they believe Chinas currency policy - which US manufacturers claim has undervalued the yuan by as much as 40% - has given Chinas rapidly expanding economy an unfair advanta

In [43]:
probabilities = model.predict_proba(request_data)[:, 1]
request_data['prediction'] = probabilities

In [45]:
liked_posts = feachures[0]
liked_posts = liked_posts[liked_posts["user_id"] == id]["post_id"].values
filtered_ = request_data[~request_data.index.isin(liked_posts)]
filtered_

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source,topic,TextCluster,DistanceTo1thCluster,...,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,weekday,hour,prediction
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,34,Russia,Degtyarsk,3,Android,ads,business,4,0.493441,...,0.449122,0.327339,0.495972,0.441822,0.454048,0.519623,0.466507,2,14,0.123177
2,1,34,Russia,Degtyarsk,3,Android,ads,business,4,0.364252,...,0.307292,0.153631,0.368690,0.297829,0.291786,0.251996,0.331836,2,14,0.069526
3,1,34,Russia,Degtyarsk,3,Android,ads,business,4,0.388896,...,0.333451,0.174627,0.383497,0.330572,0.340945,0.462773,0.366570,2,14,0.157958
4,1,34,Russia,Degtyarsk,3,Android,ads,business,4,0.324394,...,0.272601,0.111932,0.344391,0.291741,0.280724,0.409760,0.323763,2,14,0.135661
5,1,34,Russia,Degtyarsk,3,Android,ads,business,4,0.287583,...,0.197349,0.124815,0.288968,0.226114,0.234444,0.372978,0.304076,2,14,0.096985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,1,34,Russia,Degtyarsk,3,Android,ads,movie,5,0.412668,...,0.355920,0.359065,0.123385,0.353078,0.397781,0.479191,0.435138,2,14,0.129874
7316,1,34,Russia,Degtyarsk,3,Android,ads,movie,5,0.372153,...,0.298299,0.307010,0.079972,0.326884,0.346617,0.452269,0.387015,2,14,0.133649
7317,1,34,Russia,Degtyarsk,3,Android,ads,movie,1,0.332732,...,0.254536,0.278108,0.266476,0.285857,0.327182,0.436267,0.373043,2,14,0.145300
7318,1,34,Russia,Degtyarsk,3,Android,ads,movie,1,0.371307,...,0.303001,0.293739,0.271668,0.303696,0.343560,0.432724,0.388639,2,14,0.155689


In [46]:
recommended_posts = filtered_.sort_values("prediction")[-limit:].index
recommended_posts


Index([4633, 6411, 3264, 4505, 4861], dtype='int64', name='post_id')

In [47]:
response = content.iloc[recommended_posts].copy()
response

Unnamed: 0,post_id,text,topic
4633,4811,Boasting some pretty good Rick Baker-esque spe...,movie
6411,6665,"Yet another son who wont grow up flick, and ju...",movie
3264,3407,We went from don’t wear masks to wear masks or...,covid
4505,4681,This was the second of three films that Irving...,movie
4861,5043,Up until the sixth and last episode of the Sta...,movie


In [30]:
def recommended_posts(
        id: int,
        time: datetime,
        limit: int = 5):

    # Признаки пользователя
    logger.info(f"user_id: {id}")
    logger.info("reading feachures")
    user_feachures = feachures[2].loc[feachures[2]["user_id"] == id].copy()
    user_feachures.drop('user_id', axis=1, inplace=True)

    # Признаки постов
    logger.info("dropping columns")
    post_feachures = feachures[1].drop(["index", "text"], axis=1)
    content = feachures[1][["post_id", "text", "topic"]]

    # Объединение признаков
    logger.info("zipping everything")
    request_data = pd.merge(
        user_feachures, post_feachures, how='cross')
    request_data = request_data.set_index("post_id")

    # Добавление даты
    logger.info("adding time info")
    request_data['weekday'] = time.weekday()
    request_data['hour'] = time.hour

    # Предсказание вероятности для 1 класса (вероятнее всего понравится)
    logger.info("predicting")
    probabilities = model.predict_proba(request_data)[:, 1]
    request_data['prediction'] = probabilities

    # Убираем посты, где пользователь уже поставил "like"
    logger.info("deleting liked posts")
    liked_posts = feachures[0]
    liked_posts = liked_posts[liked_posts["user_id"] == id]["post_id"].values
    filtered_ = request_data[~request_data.index.isin(liked_posts)]

    # Получение топ-5 индексов вероятностей
    recommended_posts = filtered_.sort_values("prediction")[-limit:].index
    response = content.iloc[recommended_posts].copy()

    # Форматирование выдачи согласно шаблону
    response.rename(columns={'post_id': 'id'}, inplace=True)
    result_list = response.to_dict(orient='records')

    return result_list

In [31]:
recommended_posts(id, time)

[32m2024-04-14 16:56:05.054[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m7[0m - [1muser_id: 200[0m
[32m2024-04-14 16:56:05.056[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m8[0m - [1mreading feachures[0m
[32m2024-04-14 16:56:05.059[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m13[0m - [1mdropping columns[0m
[32m2024-04-14 16:56:05.062[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m18[0m - [1mzipping everything[0m
[32m2024-04-14 16:56:05.076[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m24[0m - [1madding time info[0m
[32m2024-04-14 16:56:05.078[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m29[0m - [1mpredicting[0m
[32m2024-04-14 16:56:05.108[0m | [1mINFO    [0m | [36m__main__[0m:[36mrecommended_posts[0m:[36m34[0m - [1mdeleting liked posts[0m


[{'id': 4811,
  'text': 'Boasting some pretty good Rick Baker-esque special effects and Deran Serafian in a small role, this pretty lame Italian movie deserves some recognition. Cerchi gets some credit for still making gore flicks while most of the other Italian directors (Ruggero Deodato, Sergio Martino, Lamberto Bava, and Enzo G. Castellari) have moved on to lower-key TV movies. As for plankton, its half Piranha - half The Thing, with people turninging into monsters, raping women, and causing general mayhem. The ultra-grimy, sleazy, and over-sexed feel of the film makes it hard to enjoy. Only available in Italian language work-prints floating around.',
  'topic': 'movie'},
 {'id': 6665,
  'text': 'Yet another son who wont grow up flick, and just the other recent like entries. Heder in another bad wig, channeling Napoleon for, what, the third time? Anna Faris is forgettable, as always; Jeff Daniels phoned this one in from another state, at least; and Diane Keaton...how does one become