# Model

In [1]:
import os
import sys

import pathlib

notebook_path = pathlib.Path(os.getcwd())
sys.path.append(str(notebook_path.parent))

In [2]:
import numpy as np
import pandas as pd

import pickle

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.pipeline import Pipeline
from sklearn import decomposition
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    train_test_split
) 
from category_encoders import TargetEncoder

from IPython.display import HTML

from database import engine

## Precalculated tables

Sometimes it's too long to do the calculation at runtime. So in some cases we can prepare pre-computed tables. Here is code to create such tables.

**Note** It is stored in the database in order to be used in production.

TF-IDF is an extremely long operation, but fortunately the post data is constant, so we can pre-compute and store TF-IDF vectors.

## Loading data

- We only load records that belong to the `view` category, because every like starts with view, and we have a `target` column that marks those views that lead to likes.

In [3]:
user_data = pd.read_sql(
    "SELECT * FROM public.user_data;",
    con = engine,
    index_col = "user_id"
)
post_data = pd.read_sql(
    "SELECT * FROM public.kobfedsur_post_features_lesson_22;",
    con = engine,
    index_col = "post_id"
)
post_data = post_data.drop("text" , axis = 1)
feed_data = pd.read_sql(
    """
    SELECT
        timestamp,
        user_id,
        post_id,
        target
    FROM public.feed_data 
    WHERE action='view' 
    LIMIT 200000;
    """,
    con = engine
)

In [4]:
df_show = {
    "Users data" : user_data,
    "Post data" : post_data,
    "Feed data" : feed_data
}

for title, df in df_show.items():
    display(HTML(f"<h3>{title}</h3>"))
    display(df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,topic,text tf_idf0,text tf_idf1,text tf_idf2,text tf_idf3,text tf_idf4,text tf_idf5,text tf_idf6,text tf_idf7,text tf_idf8,text tf_idf9
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,business,0.386969,-0.178558,-0.09035,-0.092254,-0.123813,0.026891,-0.090327,0.039118,0.021023,-0.104659
2,business,0.422381,-0.203721,-0.085704,-0.002244,-0.066018,0.031455,0.018915,0.005379,-0.020937,-0.017799
3,business,0.374896,-0.136752,-0.063839,-0.089292,-0.111679,0.020871,-0.058826,0.032294,0.026546,-0.064295
4,business,0.258776,-0.118368,-0.052492,-0.053732,-0.096286,0.022899,-0.034365,0.033169,0.015495,-0.040908
5,business,0.194117,-0.099519,-0.045276,-0.001843,-0.0344,-0.007769,-0.009568,0.018539,0.012742,-0.034889


Unnamed: 0,timestamp,user_id,post_id,target
0,2021-11-26 23:08:22,28346,35,0
1,2021-11-26 23:10:33,28346,908,0
2,2021-11-26 23:11:24,28346,3242,0
3,2021-11-26 23:13:27,28346,6463,0
4,2021-11-26 23:14:25,28346,3528,0


## Data preparation

In [5]:
joined_data = pd.merge(
    left = feed_data,
    right = user_data,
    left_on = "user_id",
    right_index = True
)
joined_data = pd.merge(
    left = joined_data,
    right = post_data,
    left_on = "post_id",
    right_index = True
)

In [6]:
X = joined_data.drop(["user_id", "post_id"], axis = 1).copy()

y = X["target"]
X.drop("target", axis = 1, inplace = True)

X["month"] = X["timestamp"].dt.month
X["year"] = X["timestamp"].dt.year
X["hour"] = X["timestamp"].dt.hour
X.drop("timestamp", axis = 1, inplace = True)

X['gender'] = X['gender'].astype("O")
X['exp_group'] = X['exp_group'].astype("O")

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.8, random_state = 1
)

## Model pipeline

In [8]:
numeric_columns = list(X_train.select_dtypes("number").columns)
poly_columns = ["age", "month", "year", "hour"]

categorical_columns = [
    'gender', 'country', 'city', 'os', 'source', 'topic', 'exp_group'
]
cat_nunique = X_train[categorical_columns].nunique()
MTE_columns = cat_nunique.index[cat_nunique > 5].to_list()
OHE_columns = list(set(categorical_columns) - set(MTE_columns))

In [9]:
data_transformer = ColumnTransformer([
    (
        "numeric_transform",
        Pipeline([
            (
                "poly_columns",
                ColumnTransformer(
                    [("poly_features", PolynomialFeatures(), poly_columns)]
                )
            ),
            ("stand_scaler", StandardScaler())
        ]), 
        numeric_columns
    ),
    (
        "one_hot_encod", 
        OneHotEncoder(
            categories = list(X_train[OHE_columns].apply(
                lambda col: list(col.unique()),
                result_type="reduce"
            ))
        ), 
        OHE_columns
    ),
    (
        "mean_target_encoder", 
        TargetEncoder(min_samples_leaf = 0.5, smoothing = 0.5), 
        MTE_columns
    )
])

In [10]:
pipeline = Pipeline([
    ("transfmer", data_transformer),
    ("model", GradientBoostingClassifier())
])

## Model selection

In [11]:
grid_search_result = pickle.load(open("gs_results.pck", "rb"))
gs_results_frame = pd.DataFrame(grid_search_result.cv_results_["params"])
gs_results_frame["mean_train_score"] = grid_search_result.cv_results_["mean_test_score"]
gs_results_frame["mean_test_score"] = grid_search_result.cv_results_["mean_train_score"]
gs_results_frame.sort_values("mean_train_score", ascending = False)

Unnamed: 0,model__learning_rate,model__max_depth,model__n_estimators,mean_train_score,mean_test_score
18,0.2,4,150,0.660291,0.700637
17,0.2,4,100,0.659999,0.692019
3,0.17,4,150,0.659993,0.697039
16,0.2,4,95,0.659798,0.690896
2,0.17,4,100,0.659729,0.688765
1,0.17,4,95,0.659622,0.687662
15,0.2,4,90,0.659607,0.689742
0,0.17,4,90,0.659546,0.686855
4,0.17,4,250,0.658893,0.708787
19,0.2,4,250,0.658675,0.712825


In [12]:
grid_search_result = pickle.load(open("gs_results.pck", "rb"))

best_model = pipeline.set_params(
    **grid_search_result.best_params_
).fit(X_train, y_train)

file_name = "model.pck"
pickle.dump(best_model, open(file_name, "wb"))

## Hitrage@5 estimation

In [13]:
preds_test = pd.Series(
    best_model.predict_proba(X_test)[:,1],
    index = y_test.index
)

data_for_estimation = pd.DataFrame({
    "preds" : preds_test,
    "y" : y_test,
    "user_id" : joined_data.loc[y_test.index, "user_id"]
})

hit_rages = (
    data_for_estimation.
    groupby("user_id").
    apply(lambda x: x.nlargest(5, "preds")["y"].any() if len(x) >= 5 else None)
).dropna().astype("int32")

print("current hitrage@5", hit_rages.mean())

current hitrage@5 0.6300813008130082
