# Инициализация данных

In [1]:
import os
import pandas as pd
import numpy as np
import catboost

from sqlalchemy import create_engine
from datetime import datetime
from catboost import CatBoostClassifier
from IPython.display import clear_output

from dotenv import load_dotenv


# Load variables from a .env file (if one is found) into the process
# environment; the helpers below read IS_LMS and POSTGRES_CONN from it.
load_dotenv()

True

In [2]:
# Model path resolution
def get_model_path(path: str) -> str:
    """Return the location the model file should be loaded from.

    When the ``IS_LMS`` environment variable equals ``"1"`` the fixed path
    ``/workdir/user_input/model`` is returned; in every other case the
    caller-supplied ``path`` is returned unchanged.

    Args:
        path: Model location to use when not running with ``IS_LMS=1``.

    Returns:
        The path string to load the model from.
    """
    running_in_lms = os.environ.get("IS_LMS") == "1"
    return '/workdir/user_input/model' if running_in_lms else path

# Model loading
def load_models():
    """Load the trained CatBoost classifier from disk.

    The file location is resolved through ``get_model_path``, which may
    replace the local default path depending on the environment.

    Returns:
        CatBoostClassifier: a classifier instance with the saved model
        (``cbm`` format) loaded into it.
    """
    model_path = get_model_path(
        "/Users/dmitry/Documents/code/Start_ML/module_2/final_project/model/catboost_5_1")
    model = CatBoostClassifier()
    # load_model fills the instance in place, so return the instance itself
    # rather than relying on the method's return value.
    model.load_model(model_path, format='cbm')
    return model

# Chunked data loading
def batch_load_sql(query: str) -> pd.DataFrame:
    """Run ``query`` against the database and return the whole result as one frame.

    Rows are read in chunks of ``CHUNKSIZE`` over a connection opened with
    ``stream_results=True`` and concatenated at the end. The connection
    string is taken from the ``POSTGRES_CONN`` environment variable.

    Args:
        query: SQL text to execute.

    Returns:
        All rows produced by the query, re-indexed from 0.
    """
    CHUNKSIZE = 200000
    engine = create_engine(os.getenv("POSTGRES_CONN"))
    try:
        conn = engine.connect().execution_options(stream_results=True)
        try:
            # read_sql with chunksize is lazy, so the chunks must be consumed
            # while the connection is still open.
            chunks = list(pd.read_sql(query, conn, chunksize=CHUNKSIZE))
        finally:
            # Close the connection even when reading fails part-way through.
            conn.close()
    finally:
        # A new engine is created on every call; release its pool as well.
        engine.dispose()
    return pd.concat(chunks, ignore_index=True)


# Main entry point for loading a feature table from the database
def load_features(table: str = 'd_trubitsin_user_feachures_lesson_22') -> pd.DataFrame:
    """Fetch every row and column of ``table`` via ``batch_load_sql``.

    Args:
        table: Name of the table to read. It is interpolated directly into
            the SQL text, so it must come from trusted code, never from
            user input.

    Returns:
        The full contents of the table as a DataFrame.
    """
    query = f'SELECT * FROM {table}'
    return batch_load_sql(query)

In [3]:
# Load the classifier and the three tables used by the rest of the notebook.
model = load_models()
user_data = load_features(table='d_trubitsin_user_feachures_lesson_22')  # per-user features
post_data = load_features(table='d_trubitsin_post_feachures_lesson_22')  # per-post features
all_posts = load_features(table='public.post_text_df')  # post texts and topics

In [None]:
# NOTE(review): `model` is already assigned by the load cell above (In [3]);
# this cell only reloads it and looks redundant.
model = load_models()

In [5]:
# Show the positions of the features the loaded model treats as categorical.
model.get_cat_feature_indices()

[2, 3, 5, 6, 7]

In [10]:
# Display the user feature table loaded above.
user_data

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic
163201,168549,0,18,Russia,Tula,2,Android,organic
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic
163203,168551,0,38,Russia,Moscow,3,iOS,organic


In [11]:
# Display the post feature table loaded above.
post_data

Unnamed: 0,post_id,topic,vector_knn
0,1,business,0.082276
1,2,business,0.115100
2,3,business,0.065227
3,4,business,0.166756
4,5,business,0.128059
...,...,...,...
7018,7315,movie,0.118923
7019,7316,movie,0.102609
7020,7317,movie,0.119902
7021,7318,movie,0.106095


In [12]:
# Display the table of post texts and topics loaded above.
all_posts

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


# Работа endpoint

In [13]:
# Example request parameters used by the cells below.
# NOTE: `id` shadows the Python builtin of the same name; it is kept because
# later cells refer to it.
id, limit = 200, 5
time = datetime(2021, 11, 3, 14)  # 3 Nov 2021, 14:00

Создание таблицы для пользователя со всеми постами

In [14]:
# Pair the selected user's feature row with every post (cartesian product).
request_data = user_data.loc[user_data['user_id'] == id].merge(post_data, how='cross')
request_data

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,post_id,topic,vector_knn
0,200,1,34,Russia,Degtyarsk,3,Android,ads,1,business,0.082276
1,200,1,34,Russia,Degtyarsk,3,Android,ads,2,business,0.115100
2,200,1,34,Russia,Degtyarsk,3,Android,ads,3,business,0.065227
3,200,1,34,Russia,Degtyarsk,3,Android,ads,4,business,0.166756
4,200,1,34,Russia,Degtyarsk,3,Android,ads,5,business,0.128059
...,...,...,...,...,...,...,...,...,...,...,...
7018,200,1,34,Russia,Degtyarsk,3,Android,ads,7315,movie,0.118923
7019,200,1,34,Russia,Degtyarsk,3,Android,ads,7316,movie,0.102609
7020,200,1,34,Russia,Degtyarsk,3,Android,ads,7317,movie,0.119902
7021,200,1,34,Russia,Degtyarsk,3,Android,ads,7318,movie,0.106095


Добавление признаков времени

In [15]:
# Attach the request time as two constant columns: day of week and hour.
request_data = request_data.assign(weekday=time.weekday(), hour=time.hour)
request_data.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,post_id,topic,vector_knn,weekday,hour
0,200,1,34,Russia,Degtyarsk,3,Android,ads,1,business,0.082276,2,14
1,200,1,34,Russia,Degtyarsk,3,Android,ads,2,business,0.1151,2,14
2,200,1,34,Russia,Degtyarsk,3,Android,ads,3,business,0.065227,2,14
3,200,1,34,Russia,Degtyarsk,3,Android,ads,4,business,0.166756,2,14
4,200,1,34,Russia,Degtyarsk,3,Android,ads,5,business,0.128059,2,14


Подготовка данных для модели

In [16]:
# The identifier columns are not passed to the model; remove them.
request_data = request_data.drop(columns=['user_id', 'post_id'])
request_data.head()

Unnamed: 0,gender,age,country,city,exp_group,os,source,topic,vector_knn,weekday,hour
0,1,34,Russia,Degtyarsk,3,Android,ads,business,0.082276,2,14
1,1,34,Russia,Degtyarsk,3,Android,ads,business,0.1151,2,14
2,1,34,Russia,Degtyarsk,3,Android,ads,business,0.065227,2,14
3,1,34,Russia,Degtyarsk,3,Android,ads,business,0.166756,2,14
4,1,34,Russia,Degtyarsk,3,Android,ads,business,0.128059,2,14


Получение списка вероятностей для класса 1

In [17]:
# Score every (user, post) row; column 1 of predict_proba is the
# probability of class 1.
probabilities = model.predict_proba(request_data)[:, 1]
probabilities

array([0.0776796 , 0.08338456, 0.07913245, ..., 0.12599529, 0.12268114,
       0.12599529])

Получение индексов топ 5 вероятностей

In [None]:
# Row positions ordered from highest to lowest probability, cut to `limit`.
top_5_indices = np.flip(np.argsort(probabilities))[:limit]
top_5_indices

In [None]:
# Probabilities of the selected rows, highest first.
probabilities[top_5_indices]

In [None]:
# Sanity check: the first selected index should hold the maximum probability.
max(probabilities) == probabilities[top_5_indices[0]]

In [None]:
# Look up the selected posts by row position.
# NOTE(review): this assumes the rows of `all_posts` are in the same order as
# the rows of `post_data` that the probabilities were computed from — TODO confirm.
all_posts.iloc[top_5_indices]

In [None]:
# Build the response by post id rather than by row position. The
# probabilities are aligned with the rows of `post_data`; `all_posts` is a
# separate table loaded without an ORDER BY, so its row order is not
# guaranteed to match. A left merge keeps the ranking order of the left side.
response = (
    post_data.iloc[top_5_indices][['post_id']]
    .merge(all_posts, on='post_id', how='left')
    .rename(columns={'post_id': 'id'})
)

In [None]:
# Hitrate@5 estimate over the first 2000 users of `user_data`.
# NOTE(review): `y_req` is not defined anywhere in the visible notebook. It
# must hold the ground-truth labels for the current user, positionally
# aligned with the rows of `post_data`, before this cell can run — TODO
# confirm where it comes from.
eval_users = user_data['user_id'].unique()[:2000]
hitrate = 0

for user in eval_users:

    # Feature table for this user paired with every post. The filter must use
    # the loop variable: filtering by the global `id` scored the same user on
    # every iteration.
    request_data = (
        user_data[user_data['user_id'] == user]
        .merge(post_data, how='cross')
        .assign(weekday=time.weekday(), hour=time.hour)
        .drop(columns=['user_id', 'post_id'])
    )

    probabilities = model.predict_proba(request_data)[:, 1]

    top_5_indices = np.argsort(probabilities)[::-1][:limit]

    # Count a hit when at least one of the top predictions is a positive
    if y_req.iloc[top_5_indices].sum() > 0:
        hitrate += 1

    print(hitrate)

clear_output()

# Normalise by the number of users actually evaluated in the loop above.
print(f'Hitrate@5 \t {hitrate / len(eval_users)}')