# Model

In [1]:
import os
import sys

import pathlib

# Add the parent of the current working directory to the import path.
# The membership check keeps repeated runs of this cell from appending
# the same entry to `sys.path` more than once.
notebook_path = pathlib.Path(os.getcwd())
project_root = str(notebook_path.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd

import pickle

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.pipeline import Pipeline
from sklearn import decomposition
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    train_test_split
)
from sklearn.cluster import KMeans
from category_encoders import TargetEncoder

import matplotlib.pyplot as plt


from IPython.display import HTML

from database import engine
import torch

## Precalculated tables

Some calculations take too long to perform at runtime, so in those cases we prepare precomputed tables in advance. This section contains the code that creates such tables.

**Note:** The precomputed tables are stored in the database so that they can be used in production.

In [3]:
# Load the `public.post_text_df` table, indexed by `post_id`.
post_text_query = "SELECT * FROM public.post_text_df;"
post_data = pd.read_sql(post_text_query, con=engine, index_col="post_id")

### Text transformation

Each post has a text. This section provides methods for transforming the column that holds the post texts.

#### TF-IDF transformation

#### BERT embeddings

A BERT model was applied to the text of every post to obtain an embedding for each text (see the notebook `bert_post_proc.ipynb` for details). We can try using these embeddings as features of the model.

### Saving result

## Loading data

- We only load records whose action is `view`, because every like starts with a view, and the `target` column marks the views that led to a like.

In [4]:
# Users table, indexed by `user_id`.
user_data = pd.read_sql(
    "SELECT * FROM public.user_data;", con=engine, index_col="user_id"
)

# Post feature table, indexed by `post_id`; the `text` column is dropped
# immediately after loading.
post_data = pd.read_sql(
    "SELECT * FROM public.kobfedsur_post_features_lesson_22;",
    con=engine,
    index_col="post_id",
).drop(columns="text")

# Feed rows restricted to the `view` action and capped at 200000 rows.
feed_query = """
    SELECT
        timestamp,
        user_id,
        post_id,
        target
    FROM public.feed_data 
    WHERE action='view' 
    LIMIT 200000;
    """
feed_data = pd.read_sql(feed_query, con=engine)

In [5]:
# Show the first rows of every loaded table under its own heading.
df_show = dict(
    zip(
        ("Users data", "Post data", "Feed data"),
        (user_data, post_data, feed_data),
    )
)

for title, df in df_show.items():
    display(HTML(f"<h3>{title}</h3>"), df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,topic,post_text_embading 0,post_text_embading 1,post_text_embading 2,post_text_embading 3,post_text_embading 4,post_text_embading 5,post_text_embading 6,post_text_embading 7,post_text_embading 8,post_text_embading 9,post_text_embading 10,post_text_embading 11,post_text_embading 12,post_text_embading 13,post_text_embading 14
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,business,5.271511,1.511544,4.652123,5.718925,5.969097,3.074242,3.958694,2.823006,4.346468,5.542856,5.4379,4.845976,6.058847,2.34038,2.937861
2,business,5.028046,2.305419,4.395994,5.565037,5.591251,2.965271,4.023256,3.378634,4.509531,5.480746,5.448599,4.675762,5.686556,3.359244,3.962052
3,business,5.930407,3.353186,5.225781,6.666854,7.069872,5.62847,5.189015,3.348595,5.738086,6.300123,5.877313,6.474744,6.869722,4.594686,1.965199
4,business,4.989741,3.360993,3.772533,5.292347,5.488238,4.753773,4.233974,2.416443,4.861097,5.079409,4.758761,5.24039,5.416757,4.025181,2.541314
5,business,5.652368,3.143436,4.730937,6.158753,6.420747,5.175661,4.956597,2.841509,5.429492,5.914902,5.606469,6.020569,6.364986,4.116338,1.856815


Unnamed: 0,timestamp,user_id,post_id,target
0,2021-10-09 11:39:20,114033,5228,0
1,2021-10-09 11:41:30,114033,1339,0
2,2021-10-14 06:01:18,114033,1377,0
3,2021-10-14 06:04:17,114033,6378,0
4,2021-10-14 06:06:44,114033,4155,0


## Data preparation

In [6]:
# Attach the user attributes and then the post attributes to every feed row.
# Each join matches a feed column against the index of the right-hand table
# and uses pandas' default join type.
joined_data = (
    feed_data
    .merge(user_data, left_on="user_id", right_index=True)
    .merge(post_data, left_on="post_id", right_index=True)
)

In [7]:
# Work on a copy of the joined frame without the identifier columns.
features = joined_data.drop(columns=["user_id", "post_id"]).copy()

# Target vector, read before the column is removed from the features.
y = features["target"]

# Replace the raw timestamp with calendar parts appended as the last columns,
# and store `gender` / `exp_group` as object dtype.
event_time = features["timestamp"]
X = (
    features
    .drop(columns="target")
    .assign(
        month=event_time.dt.month,
        year=event_time.dt.year,
        hour=event_time.dt.hour,
    )
    .drop(columns="timestamp")
    .astype({"gender": "O", "exp_group": "O"})
)

In [8]:
# Display the prepared feature frame as the cell output.
X

Unnamed: 0,gender,age,country,city,exp_group,os,source,topic,post_text_embading 0,post_text_embading 1,...,post_text_embading 8,post_text_embading 9,post_text_embading 10,post_text_embading 11,post_text_embading 12,post_text_embading 13,post_text_embading 14,month,year,hour
0,0,19,Russia,Izhevsk,0,Android,organic,movie,3.952942,5.978668,...,4.180769,5.426963,5.961785,3.454106,2.40809,5.49036,6.498664,10,2021,11
4674,1,20,Finland,Espoo,2,Android,organic,movie,3.952942,5.978668,...,4.180769,5.426963,5.961785,3.454106,2.40809,5.49036,6.498664,12,2021,16
21810,1,48,Ukraine,Selydove,1,iOS,organic,movie,3.952942,5.978668,...,4.180769,5.426963,5.961785,3.454106,2.40809,5.49036,6.498664,12,2021,18
47804,1,24,Russia,Voronezh,0,iOS,organic,movie,3.952942,5.978668,...,4.180769,5.426963,5.961785,3.454106,2.40809,5.49036,6.498664,10,2021,18
50380,1,24,Russia,Magaramkent,2,iOS,organic,movie,3.952942,5.978668,...,4.180769,5.426963,5.961785,3.454106,2.40809,5.49036,6.498664,10,2021,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114985,1,25,Russia,Chelyabinsk,3,Android,organic,movie,,,...,,,,,,,,12,2021,8
115161,0,31,Russia,Zavodoukovsk,0,Android,ads,movie,,,...,,,,,,,,10,2021,8
142148,1,17,Ukraine,Donetsk,4,Android,ads,movie,,,...,,,,,,,,11,2021,18
172881,0,22,Russia,Krasnyy Sulin,1,Android,ads,movie,,,...,,,,,,,,11,2021,11


In [9]:
# 80% of the rows go to training, the rest to the test set; the fixed seed
# keeps the split repeatable.
split_frames = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_test, y_train, y_test = split_frames

## Model pipeline

In [10]:
# Every numeric feature column of the training frame.
numeric_columns = list(X_train.select_dtypes("number").columns)
# Numeric columns that get a polynomial expansion.
poly_columns = ["age", "month", "year", "hour"]

categorical_columns = [
    'gender', 'country', 'city', 'os', 'source', 'topic', 'exp_group'
]
# Cardinality decides the encoding: columns with more than 5 distinct values
# go to `MTE_columns`, the remaining categorical columns to `OHE_columns`.
cat_nunique = X_train[categorical_columns].nunique()
MTE_columns = cat_nunique.index[cat_nunique > 5].to_list()
# Keep the order of `categorical_columns`: a set difference iterates in hash
# order, which can differ between kernel sessions and would reshuffle the
# encoded feature columns from run to run.
OHE_columns = [col for col in categorical_columns if col not in MTE_columns]

In [11]:
# Numeric branch: polynomial expansion of `poly_columns`, then standardisation.
# NOTE(review): the inner ColumnTransformer lists only `poly_columns` and keeps
# the default `remainder`, so the other `numeric_columns` handed to this branch
# do not appear to reach the model -- confirm this is intended.
poly_expansion = ColumnTransformer(
    [("poly_features", PolynomialFeatures(), poly_columns)]
)
numeric_branch = Pipeline([
    ("poly_columns", poly_expansion),
    ("stand_scaler", StandardScaler()),
])

# One-hot branch: the category list of each column is fixed to the values
# present in `X_train`, in order of first appearance.
ohe_categories = [list(X_train[col].unique()) for col in OHE_columns]
one_hot_branch = OneHotEncoder(categories=ohe_categories)

# Mean-target branch for the columns listed in `MTE_columns`.
target_branch = TargetEncoder(min_samples_leaf=0.5, smoothing=0.5)

# Each branch is applied to its own column group.
data_transformer = ColumnTransformer([
    ("numeric_transform", numeric_branch, numeric_columns),
    ("one_hot_encod", one_hot_branch, OHE_columns),
    ("mean_target_encoder", target_branch, MTE_columns),
])

In [14]:
# Full estimator: feature transformation followed by gradient boosting.
# The step names are kept exactly as they are because parameter keys such as
# `model__learning_rate` are built from them.
# A fixed `random_state` makes repeated fits on the same data reproducible.
pipeline = Pipeline([
    ("transfmer", data_transformer),
    ("model", GradientBoostingClassifier(random_state = 1))
])

## Model selection

In [13]:
# Load the pickled grid-search object. The context manager closes the file
# handle as soon as loading is done.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
with open("gs_results.pck", "rb") as gs_file:
    grid_search_result = pickle.load(gs_file)

# One row per parameter combination with its mean test and train scores,
# shown with the best mean test score first.
cv_results = grid_search_result.cv_results_
gs_results_frame = pd.DataFrame(cv_results["params"])
gs_results_frame["mean_test_score"] = cv_results["mean_test_score"]
gs_results_frame["mean_train_score"] = cv_results["mean_train_score"]
gs_results_frame.sort_values("mean_test_score", ascending = False)

Unnamed: 0,model__learning_rate,model__max_depth,model__n_estimators,mean_test_score,mean_train_score
2,0.2,4,100,0.659186,0.690354
7,0.25,3,150,0.658515,0.684513
1,0.2,3,150,0.658496,0.680967
3,0.2,4,150,0.658433,0.698846
8,0.25,4,100,0.658348,0.694325
6,0.25,3,100,0.657904,0.678594
9,0.25,4,150,0.657344,0.703042
0,0.2,3,100,0.656983,0.675124
4,0.2,5,100,0.656337,0.709425
10,0.25,5,100,0.655459,0.715557


In [14]:
# Load the pickled grid-search object; the context manager closes the file.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
with open("gs_results.pck", "rb") as gs_file:
    grid_search_result = pickle.load(gs_file)

# `set_params` updates `pipeline` in place and returns it, so `best_model`
# is the same object as `pipeline`, refitted with the best parameters.
best_model = pipeline.set_params(
    **grid_search_result.best_params_
).fit(X_train, y_train)

# Persist the fitted pipeline; the `with` block guarantees that the file is
# flushed and closed once the dump has finished.
file_name = "model.pck"
with open(file_name, "wb") as model_file:
    pickle.dump(best_model, model_file)