In [1]:
!pip install sentence-transformers
!conda install -c conda-forge ipywidgets & echo y
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

%load_ext autoreload
%autoreload 2

y
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
import pandas as pd
from sqlalchemy import create_engine
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

import os
import numpy as np

con = "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml"
engine = create_engine(con)

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
def get_embddings(texts):
    # Tokenize sentences
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings

In [4]:
from tqdm import tqdm

text_embeds_path = os.path.join('data', 'text_embeds.csv')
if not os.path.exists(text_embeds_path):
    texts = pd.read_sql("select distinct * from public.post_text_df", engine)
    matrix = pd.DataFrame()

    for chunk in tqdm(np.array_split(texts, 100)):
        embeds = get_embddings(chunk['text'].values.tolist())
        res = pd.concat([chunk['post_id'].reset_index(drop=True).rename('post_id'), pd.DataFrame(embeds)], axis=1, ignore_index=True)
        res = res.rename(columns={0: 'post_id'})
        matrix = pd.concat([matrix, res], axis=0)

    matrix.to_csv(text_embeds_path)
else:
    matrix = pd.read_csv(text_embeds_path)

In [5]:
c1 = pd.read_sql("select count(*) from public.user_data", engine)
c2 = pd.read_sql("select count(*) from public.post_text_df", engine)
c3 = pd.read_sql("select count(*) from public.feed_data", engine)
c1, c2, c3

(    count
 0  163205,
    count
 0   7023,
       count
 0  76892800)

In [14]:
%%time
query = """
with fd as (
    select
        *
    from public.feed_data
    where 1=1 
    and action = 'view' -- т.к. есть столбец target, то уже на нём можно обучить модель классификации
    and user_id in (select user_id from (select distinct user_id from public.feed_data) tt where random() < 0.02)
    -- and user_id = '113330'
)
select *
from fd
left join (select * from public.post_text_df) ptd on fd.post_id = ptd.post_id
left join public.user_data ud on ud.user_id = fd.user_id
limit 100000
"""
df = pd.read_sql(query, engine).iloc[:, [0,1,2,4,7,9,10,11,12,13,14,15]]
df.shape

CPU times: user 692 ms, sys: 251 ms, total: 943 ms
Wall time: 28.1 s


Unnamed: 0,timestamp,user_id,post_id,target,topic,gender,age,country,city,exp_group,os,source
0,2021-10-01 19:10:22,6203,1191,0,politics,1,21,Russia,Kargasok,3,iOS,ads
1,2021-10-01 19:10:52,6203,5032,0,movie,1,21,Russia,Kargasok,3,iOS,ads
2,2021-10-01 19:13:20,6203,6186,0,movie,1,21,Russia,Kargasok,3,iOS,ads
3,2021-10-01 19:15:34,6203,1885,1,sport,1,21,Russia,Kargasok,3,iOS,ads
4,2021-10-01 19:28:16,6203,2741,0,covid,1,21,Russia,Kargasok,3,iOS,ads
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2021-10-31 14:55:30,15161,6868,1,movie,1,28,Turkey,Istanbul,1,Android,ads
99996,2021-10-31 15:01:22,15161,947,0,politics,1,28,Turkey,Istanbul,1,Android,ads
99997,2021-10-31 15:02:05,15161,3267,0,covid,1,28,Turkey,Istanbul,1,Android,ads
99998,2021-10-31 15:04:43,15161,409,0,business,1,28,Turkey,Istanbul,1,Android,ads


In [28]:
dfm = df.merge(matrix, how='left', on='post_id').drop(columns=['Unnamed: 0'])
dfm = dfm.rename(columns = lambda x: f'emb_{x}' if x.isdigit() else x)

In [34]:
dfm.tail(5)

Unnamed: 0,timestamp,user_id,post_id,target,topic,gender,age,country,city,exp_group,...,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383,emb_384
99995,2021-10-31 14:55:30,15161,6868,1,movie,1,28,Turkey,Istanbul,1,...,0.09137,0.108109,-0.000497,0.009185,-0.015838,-0.031288,0.050329,-0.05057,-0.058027,0.026001
99996,2021-10-31 15:01:22,15161,947,0,politics,1,28,Turkey,Istanbul,1,...,0.082669,-0.004322,-0.017002,-0.025996,-0.114061,0.032967,-0.035295,-0.085248,0.00364,0.046821
99997,2021-10-31 15:02:05,15161,3267,0,covid,1,28,Turkey,Istanbul,1,...,-0.01969,0.02703,-0.017846,-0.020114,-0.068193,0.038093,0.042887,-0.038444,0.026767,0.036529
99998,2021-10-31 15:04:43,15161,409,0,business,1,28,Turkey,Istanbul,1,...,0.010609,-0.010446,0.065055,-0.015263,-0.13557,-0.007048,0.030626,-0.134531,0.060149,0.031814
99999,2021-10-31 15:07:32,15161,2748,0,covid,1,28,Turkey,Istanbul,1,...,-0.01577,0.039664,0.038339,0.058729,0.018482,0.055541,0.011315,0.006218,-0.130534,0.016373


### Тестовая выборка после 12 декабря, обучающая до этого числа
### Фичи
* timestamp для разделения выборки
* user_id, post_id - индексы
* text_embed_xxx - эмбеддинги текстов
* категориальные признаки: topic, gender, country, city, exp_group, os, source
* непрервыные признаки: age

### Таргет
* target - был лайк или нет

In [45]:
user_cols = ['gender', 'age', 'country', 'city', 'os', 'source', 'exp_group']
item_cols = ['topic'] + [x for x in dfm.columns if x.startswith('emb')]

In [56]:
train, test = dfm[dfm['timestamp'] <= '2021-12-12'], dfm[dfm['timestamp'] > '2021-12-12']

X_train, y_train = train.drop(columns=['target', 'user_id', 'post_id', 'timestamp']), train['target']
X_test, y_test = test.drop(columns=['target', 'user_id', 'post_id', 'timestamp']), test['target']

In [59]:
X_train.head(2)

Unnamed: 0,topic,gender,age,country,city,exp_group,os,source,emb_1,emb_2,...,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383,emb_384
0,politics,1,21,Russia,Kargasok,3,iOS,ads,-0.003308,-0.048555,...,0.044759,-0.050147,-0.004343,0.001642,-0.076406,-0.007989,-0.064139,-0.056068,-0.035511,0.06805
1,movie,1,21,Russia,Kargasok,3,iOS,ads,-0.024698,-0.001368,...,0.029774,-0.031479,0.048971,0.026845,-0.051122,0.010619,0.00497,0.019181,-0.014201,-0.024205


In [60]:
from catboost import CatBoostClassifier, Pool, metrics, cv
catboost_model = CatBoostClassifier(learning_rate=0.02)

cat_features = ['topic', 'gender', 'age', 'country', 'city', 'exp_group', 'os', 'source']
catboost_model.fit(X_train, y_train, cat_features=cat_features)

catboost_model.save_model('catboost_model',
                           format="cbm")

from_file = CatBoostClassifier()  # здесь не указываем параметры, которые были при обучении, в дампе модели все есть

from_file.load_model("catboost_model")

from_file.predict(X_train)

0:	learn: 0.6775930	total: 297ms	remaining: 4m 56s
1:	learn: 0.6627713	total: 452ms	remaining: 3m 45s
2:	learn: 0.6486355	total: 590ms	remaining: 3m 16s
3:	learn: 0.6351284	total: 767ms	remaining: 3m 10s
4:	learn: 0.6222672	total: 906ms	remaining: 3m
5:	learn: 0.6100072	total: 1.08s	remaining: 2m 59s
6:	learn: 0.5983227	total: 1.22s	remaining: 2m 53s
7:	learn: 0.5871588	total: 1.39s	remaining: 2m 52s
8:	learn: 0.5765316	total: 1.55s	remaining: 2m 50s
9:	learn: 0.5663948	total: 1.68s	remaining: 2m 46s
10:	learn: 0.5567582	total: 1.91s	remaining: 2m 51s
11:	learn: 0.5475533	total: 2.06s	remaining: 2m 49s
12:	learn: 0.5387900	total: 2.19s	remaining: 2m 46s
13:	learn: 0.5304227	total: 2.36s	remaining: 2m 46s
14:	learn: 0.5224635	total: 2.52s	remaining: 2m 45s
15:	learn: 0.5148633	total: 2.7s	remaining: 2m 46s
16:	learn: 0.5076239	total: 2.89s	remaining: 2m 46s
17:	learn: 0.5007438	total: 3.04s	remaining: 2m 45s
18:	learn: 0.4941643	total: 3.21s	remaining: 2m 46s
19:	learn: 0.4878968	total:

array([0, 0, 0, ..., 0, 0, 0])

'1.4.2'