# Model

In [1]:
import os
import sys

import pathlib

notebook_path = pathlib.Path(os.getcwd())
sys.path.append(str(notebook_path.parent))

In [30]:
import numpy as np
import pandas as pd

import pickle

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder
)
from sklearn.pipeline import Pipeline
from sklearn import decomposition
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    train_test_split
) 
from category_encoders import TargetEncoder

from IPython.display import HTML

from database import engine

## Precalculated tables

Sometimes it's too long to do the calculation at runtime. So in some cases we can prepare pre-computed tables. Here is code to create such tables.

**Note** It is stored in the database in order to be used in production.

TF-IDF is an extremely long operation, but fortunately the post data is constant, so we can pre-compute and store TF-IDF vectors.

In [54]:
post_data = pd.read_sql(
    "SELECT * FROM public.post_text_df;",
    con = engine,
    index_col = "post_id"
)

n_components = 10
tf_idf_frame = pd.DataFrame(
    Pipeline([
        ("vectoriser", TfidfVectorizer()),
        ("trunk_SVD", decomposition.TruncatedSVD(n_components = n_components))
    ]).fit_transform(
        post_data["text"]
    ),
    columns = [f"text tf_idf{i}" for i in range(n_components)],
    index = post_data.index
)

post_data.join(tf_idf_frame).to_sql(
    con = engine,
    name = "kobfedsur_post_features_lesson_22",
    if_exists = "replace"
)

23

## Loading data

- We only load records that belong to the `view` category, because every like starts with view, and we have a `target` column that marks those views that lead to likes.

In [4]:
user_data = pd.read_sql(
    "SELECT * FROM public.user_data;",
    con = engine,
    index_col = "user_id"
)
post_data = pd.read_sql(
    "SELECT * FROM public.kobfedsur_post_features_lesson_22;",
    con = engine,
    index_col = "post_id"
)
post_data = post_data.drop("text" , axis = 1)
feed_data = pd.read_sql(
    """
    SELECT
        timestamp,
        user_id,
        post_id,
        target
    FROM public.feed_data 
    WHERE action='view' 
    LIMIT 200000;
    """,
    con = engine
)

In [5]:
df_show = {
    "Users data" : user_data,
    "Post data" : post_data,
    "Feed data" : feed_data
}

for title, df in df_show.items():
    display(HTML(f"<h3>{title}</h3>"))
    display(df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,topic,text tf_idf0,text tf_idf1,text tf_idf2,text tf_idf3,text tf_idf4,text tf_idf5,text tf_idf6,text tf_idf7,text tf_idf8,...,text tf_idf40,text tf_idf41,text tf_idf42,text tf_idf43,text tf_idf44,text tf_idf45,text tf_idf46,text tf_idf47,text tf_idf48,text tf_idf49
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,business,0.0,0.0,0.041933,0.276226,0.038811,0.036599,0.076387,0.0,0.035688,...,0.093371,0.036455,0.0,0.049809,0.0,0.0,0.142139,0.0,0.0,0.0
2,business,0.037703,0.0,0.066779,0.197953,0.185424,0.029142,0.091236,0.119392,0.056834,...,0.037174,0.029027,0.0,0.118983,0.0,0.108888,0.188632,0.02733,0.03902,0.0
3,business,0.234693,0.0,0.0,0.293383,0.054963,0.259148,0.108176,0.026542,0.02527,...,0.033057,0.077438,0.0,0.176343,0.0,0.0,0.0,0.097214,0.069398,0.0
4,business,0.0,0.0,0.074263,0.146757,0.068734,0.0,0.0,0.0,0.0,...,0.0,0.12912,0.0,0.0,0.0,0.0,0.167816,0.0,0.086786,0.0
5,business,0.0,0.0,0.094665,0.187076,0.175236,0.082623,0.086223,0.084624,0.0,...,0.0,0.0,0.0,0.0,0.106608,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,timestamp,user_id,post_id,target
0,2021-11-18 13:08:41,152872,5410,0
1,2021-11-18 13:10:58,152872,1472,0
2,2021-11-18 13:11:32,152872,3554,0
3,2021-11-18 13:12:00,152872,4963,0
4,2021-11-18 13:13:26,152872,7100,1


## Data preparation

In [6]:
joined_data = pd.merge(
    left = feed_data,
    right = user_data,
    left_on = "user_id",
    right_index = True
)
joined_data = pd.merge(
    left = joined_data,
    right = post_data,
    left_on = "post_id",
    right_index = True
)

In [7]:
X = joined_data.drop(["user_id", "post_id"], axis = 1).copy()

y = X["target"]
X.drop("target", axis = 1, inplace = True)

X["month"] = X["timestamp"].dt.month
X["year"] = X["timestamp"].dt.year
X["hour"] = X["timestamp"].dt.hour
X.drop("timestamp", axis = 1, inplace = True)

X['gender'] = X['gender'].astype("O")

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.8, random_state = 1
)

## Model pipeline

In [13]:
numeric_columns = list(X_train.select_dtypes("number").columns)
categorical_columns = [
    'gender', 'country', 'city', 'os', 'source', 'topic'
]
cat_nunique = X_train[categorical_columns].nunique()
MTE_columns = cat_nunique.index[cat_nunique > 5].to_list()
OHE_columns = list(set(categorical_columns) - set(MTE_columns))

In [23]:
X_train["exp_group"].value_counts()

0    36973
1    35805
3    30127
4    28712
2    28383
Name: exp_group, dtype: int64

In [20]:
numeric_columns

['age',
 'exp_group',
 'text tf_idf0',
 'text tf_idf1',
 'text tf_idf2',
 'text tf_idf3',
 'text tf_idf4',
 'text tf_idf5',
 'text tf_idf6',
 'text tf_idf7',
 'text tf_idf8',
 'text tf_idf9',
 'text tf_idf10',
 'text tf_idf11',
 'text tf_idf12',
 'text tf_idf13',
 'text tf_idf14',
 'text tf_idf15',
 'text tf_idf16',
 'text tf_idf17',
 'text tf_idf18',
 'text tf_idf19',
 'text tf_idf20',
 'text tf_idf21',
 'text tf_idf22',
 'text tf_idf23',
 'text tf_idf24',
 'text tf_idf25',
 'text tf_idf26',
 'text tf_idf27',
 'text tf_idf28',
 'text tf_idf29',
 'text tf_idf30',
 'text tf_idf31',
 'text tf_idf32',
 'text tf_idf33',
 'text tf_idf34',
 'text tf_idf35',
 'text tf_idf36',
 'text tf_idf37',
 'text tf_idf38',
 'text tf_idf39',
 'text tf_idf40',
 'text tf_idf41',
 'text tf_idf42',
 'text tf_idf43',
 'text tf_idf44',
 'text tf_idf45',
 'text tf_idf46',
 'text tf_idf47',
 'text tf_idf48',
 'text tf_idf49',
 'month',
 'year',
 'hour']

In [17]:
from sklearn.preprocessing import PowerTransformer

In [19]:
Pipeline([
    ("polynomial_features", PolynomialFeatures()),
    ("stand_scaler", StandardScaler())
]).fit_transform(X[numeric_columns]).shape

(200000, 1596)

In [9]:
data_transformer = ColumnTransformer([
    (
        "stand_scaler", 
        StandardScaler(), 
        numeric_columns
    ),
    (
        "one_hot_encod", 
        OneHotEncoder(
            categories = list(X_train[OHE_columns].apply(
                lambda col: list(col.unique()),
                result_type="reduce"
            ))
        ), 
        OHE_columns
    ),
    (
        "mean_target_encoder", 
        TargetEncoder(min_samples_leaf = 0.5, smoothing = 0.5), 
        MTE_columns
    )
])

In [10]:
pipeline = Pipeline([
    ("transfmer", data_transformer),
    ("model", GradientBoostingClassifier())
])

## Model selection

In [12]:
grid_search_result = pickle.load(open("gs_results.pck", "rb"))
gs_results_frame = pd.DataFrame(grid_search_result.cv_results_["params"])
gs_results_frame["mean_train_score"] = grid_search_result.cv_results_["mean_test_score"]
gs_results_frame["mean_test_score"] = grid_search_result.cv_results_["mean_train_score"]
gs_results_frame.sort_values("mean_train_score", ascending = False)

Unnamed: 0,model__learning_rate,model__max_depth,model__n_estimators,mean_train_score,mean_test_score
13,0.2,4,90,0.649974,0.706459
14,0.2,4,100,0.649739,0.710523
12,0.2,4,80,0.649569,0.702076
5,0.15,4,100,0.64901,0.700533
4,0.15,4,90,0.648766,0.696825
19,0.3,3,90,0.648259,0.68836
18,0.3,3,80,0.64814,0.685308
3,0.15,4,80,0.648085,0.692487
20,0.3,3,100,0.648066,0.691151
11,0.2,3,100,0.648022,0.681211


In [13]:
grid_search_result = pickle.load(open("gs_results.pck", "rb"))

best_model = pipeline.set_params(
    **grid_search_result.best_params_
).fit(X_train, y_train)

file_name = "model.pck"
pickle.dump(best_model, open(file_name, "wb"))

## Hitrage@5 estimation

In [14]:
preds_test = pd.Series(
    best_model.predict_proba(X_test)[:,1],
    index = y_test.index
)

data_for_estimation = pd.DataFrame({
    "preds" : preds_test,
    "y" : y_test,
    "user_id" : joined_data.loc[y_test.index, "user_id"]
})

hit_rages = (
    data_for_estimation.
    groupby("user_id").
    apply(lambda x: x.nlargest(5, "preds")["y"].any() if len(x) >= 5 else None)
).dropna().astype("int32")

print("current hitrage@5", hit_rages.mean())

current hitrage@5 0.6276595744680851
