# Model

In [1]:
import os
import sys

import pathlib

# Put the parent of the notebook's working directory on the import path
# (presumably where project-local modules such as `database` live -- confirm).
notebook_path = pathlib.Path(os.getcwd())
project_root = str(notebook_path.parent)
# Guarded append: re-running this cell must not pile up duplicate entries
# in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd

import pickle

from catboost import CatBoostClassifier

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import decomposition
from sklearn.metrics import roc_auc_score

from sklearn.cluster import KMeans
from category_encoders import TargetEncoder

import matplotlib.pyplot as plt


from IPython.display import HTML

from database import engine

## Precalculated tables

Some calculations take too long to run at request time, so in those cases we prepare precomputed tables in advance. This section contains the code that creates those tables.

**Note:** The precomputed tables are stored in the database so that they can be used in production.

### Text transformation

Every post has a text. This section describes methods that can be used to transform the column containing the post text.

#### tf-idf transformation

#### BERT embeddings

A BERT model was applied to the text of each post (see the notebook `bert_post_proc.ipynb` for details) to obtain an embedding for every text. We can try using these embeddings as features of the model.

### Saving result

## Loading data

- We only load records with the `view` action, because every like starts with a view, and the `target` column marks the views that led to a like.

In [3]:
# User table, indexed by user_id.
user_data = pd.read_sql(
    sql="SELECT * FROM public.user_data;", con=engine, index_col="user_id"
)

# Post feature table, indexed by post_id; the "text" column is discarded
# right after loading.
post_data = pd.read_sql(
    sql="SELECT * FROM public.kobfedsur_post_features_lesson_22;",
    con=engine,
    index_col="post_id",
).drop(columns="text")

# Feed records restricted to the 'view' action and capped at 200000 rows.
feed_data = pd.read_sql(
    sql="""
    SELECT
        timestamp,
        user_id,
        post_id,
        target
    FROM public.feed_data 
    WHERE action='view' 
    LIMIT 200000;
    """,
    con=engine,
)

In [4]:
# Preview the first rows of each loaded table under its own HTML heading.
df_show = dict([
    ("Users data", user_data),
    ("Post data", post_data),
    ("Feed data", feed_data),
])

for title, df in df_show.items():
    # display() renders its arguments one after another: heading, then table.
    display(HTML(f"<h3>{title}</h3>"), df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,topic,post_text_embading 0,post_text_embading 1,post_text_embading 2,post_text_embading 3,post_text_embading 4,post_text_embading 5,post_text_embading 6,post_text_embading 7,post_text_embading 8,...,post_text_embading 10,post_text_embading 11,post_text_embading 12,post_text_embading 13,post_text_embading 14,post_text_embading 15,post_text_embading 16,post_text_embading 17,post_text_embading 18,post_text_embading 19
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,business,3.082143,-0.872354,-1.121495,0.695181,-0.051382,0.234759,-0.309031,0.000415,0.246711,...,-0.191662,0.302546,-0.188467,0.420337,0.113366,0.392565,-0.279607,0.215437,-0.685032,0.158396
2,business,2.298754,-0.771812,-1.484304,0.921823,-0.043767,0.577131,0.102368,0.678323,1.163995,...,-0.136326,0.88824,-0.119025,-0.179916,0.112205,-0.269955,-0.491519,0.256519,0.049968,0.113517
3,business,3.830453,-0.031835,-1.308002,-2.101631,-0.484518,-0.095322,0.206714,-0.808237,-0.210448,...,-0.346497,-0.184492,0.272497,-0.231417,0.705615,-0.840136,0.08865,0.406348,-0.313337,-0.42315
4,business,2.248879,0.231341,-1.637733,-1.68515,-0.175615,-0.363438,0.877843,-0.201348,-0.476309,...,0.29197,-0.052501,-0.209298,-0.44313,-0.339549,-0.019009,-0.244962,-0.009588,0.247985,0.20314
5,business,3.291842,-0.10071,-1.841415,-1.857096,-0.102616,-0.475621,0.359757,-0.000836,-0.667514,...,0.179548,0.311436,-0.000996,-0.448443,-0.254442,-0.079604,-0.401943,-0.198304,0.033986,0.154987


Unnamed: 0,timestamp,user_id,post_id,target
0,2021-11-07 13:52:56,71643,2122,0
1,2021-11-07 13:55:23,71643,772,0
2,2021-11-07 13:57:17,71643,524,0
3,2021-11-07 13:59:34,71643,6519,0
4,2021-11-07 14:02:29,71643,2025,0


In [5]:
# Number of users per value of the "source" column.
user_data.loc[:, "source"].value_counts()

ads        101685
organic     61520
Name: source, dtype: int64

## Data preparation

In [6]:
# Attach user and post columns to every feed record. Both lookup tables are
# indexed by their id, so the feed's id columns are matched against the
# right-hand index (default inner join in both steps).
joined_data = feed_data.merge(
    user_data, left_on="user_id", right_index=True
).merge(
    post_data, left_on="post_id", right_index=True
)

In [7]:
# Label vector and raw view timestamps (the latter drive the time-based split).
y = joined_data["target"]
X_times = joined_data["timestamp"]

# Feature matrix: ids, label and raw timestamp are removed; calendar features
# derived from the timestamp are appended; gender and exp_group are cast to
# object dtype so they are not picked up as numeric columns.
X = (
    joined_data
    .drop(columns=["user_id", "post_id", "target", "timestamp"])
    .assign(
        month=X_times.dt.month,
        year=X_times.dt.year,
        hour=X_times.dt.hour,
    )
    .astype({"gender": "O", "exp_group": "O"})
)

In [8]:
# Time-based split: rows at or before the 0.8 quantile of the view timestamps
# go to the training set, strictly later rows go to the test set.
train_test_tres = X_times.quantile(0.8)
is_train = X_times <= train_test_tres
is_test = X_times > train_test_tres

X_train, X_test = X.loc[is_train], X.loc[is_test]
# Labels are looked up through the index of the corresponding feature frame.
y_train, y_test = y.loc[X_train.index], y.loc[X_test.index]

## Model fitting

In [9]:
# Numeric feature columns; not used further in this cell.
numeric_columns = list(X_train.select_dtypes("number").columns)
# Columns passed to CatBoost as categorical features.
categorical_columns = [
    'gender', 'country', 'city', 'os', 'source', 'topic', 'exp_group'
]

# Fixed random_seed keeps the fit reproducible between runs.
model = CatBoostClassifier(
    cat_features=categorical_columns, random_seed=10
).fit(X_train, y_train)

file_name = "model.pck"
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left open after the dump.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

Learning rate set to 0.089969
0:	learn: 0.6262920	total: 99ms	remaining: 1m 38s
1:	learn: 0.5737384	total: 151ms	remaining: 1m 15s
2:	learn: 0.5320697	total: 201ms	remaining: 1m 6s
3:	learn: 0.4987830	total: 253ms	remaining: 1m 2s
4:	learn: 0.4713993	total: 304ms	remaining: 1m
5:	learn: 0.4496663	total: 359ms	remaining: 59.4s
6:	learn: 0.4317431	total: 396ms	remaining: 56.1s
7:	learn: 0.4173632	total: 454ms	remaining: 56.2s
8:	learn: 0.4061465	total: 506ms	remaining: 55.7s
9:	learn: 0.3971937	total: 539ms	remaining: 53.4s
10:	learn: 0.3897851	total: 595ms	remaining: 53.5s
11:	learn: 0.3827104	total: 644ms	remaining: 53s
12:	learn: 0.3767764	total: 698ms	remaining: 53s
13:	learn: 0.3720954	total: 755ms	remaining: 53.1s
14:	learn: 0.3682013	total: 818ms	remaining: 53.7s
15:	learn: 0.3658555	total: 835ms	remaining: 51.4s
16:	learn: 0.3631643	total: 884ms	remaining: 51.1s
17:	learn: 0.3610944	total: 934ms	remaining: 51s
18:	learn: 0.3595021	total: 963ms	remaining: 49.7s
19:	learn: 0.358488

In [10]:
# ROC AUC on the held-out test set, computed from the second column of
# predict_proba.
test_scores = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, test_scores)

0.6404758323528218