# Model

In [1]:
import os
import sys

import pathlib

# Put the parent of the current working directory on the import path so that
# modules located there can be imported by later cells
# (presumably this is where the local `database` module lives - TODO confirm).
notebook_path = pathlib.Path.cwd()
parent_dir = notebook_path.parent
sys.path.append(str(parent_dir))

In [2]:
import numpy as np
import pandas as pd

import pickle

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV
)
from category_encoders import TargetEncoder

from IPython.display import HTML

from database import engine

## Loading data

- We only load records that belong to the `view` category, because every like starts with a view, and the `target` column marks the views that led to a like.

In [3]:
# Small previews (first 100 rows) of the three source tables.
# The queries are constant strings, so plain literals are used instead of
# f-strings without placeholders.
user_data = pd.read_sql(
    "SELECT * FROM public.user_data LIMIT 100;",
    con = engine,
    index_col = "user_id"
)
post_data = pd.read_sql(
    "SELECT * FROM public.post_text_df LIMIT 100;",
    con = engine,
    index_col = "post_id"
)
# The feed table is read without `index_col`, so it keeps the default index.
feed_data = pd.read_sql(
    "SELECT * FROM public.feed_data limit 100;",
    con = engine
)

Sometimes this frame loads too slowly, so I read it from a temporary local Parquet file instead.

In [4]:
# Read the locally cached joined frame and keep a fixed-size random subsample;
# the fixed `random_state` makes the same rows come back on every run.
joined_data = (
    pd.read_parquet("preloaded_joined_data.parquet")
    .sample(n=100_000, random_state=10)
)

In [5]:
# Section title -> frame to preview.
df_show = {
    "Users data": user_data,
    "Post data": post_data,
    "Feed data": feed_data,
    "Joined data": joined_data,
}

# Render an <h3> heading followed by the first rows of every frame.
for title, df in df_show.items():
    display(HTML(f"<h3>{title}</h3>"), df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,text,topic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,UK economy facing major risks\n\nThe UK manufa...,business
2,Aids and climate top Davos agenda\n\nClimate c...,business
3,Asian quake hits European shares\n\nShares in ...,business
4,India power shares jump on debut\n\nShares in ...,business
5,Lacroix label bought by US firm\n\nLuxury good...,business


Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-10-24 22:16:09,41783,6161,view,0
1,2021-10-24 22:16:48,41783,2644,view,1
2,2021-10-24 22:17:11,41783,2644,like,0
3,2021-12-27 21:29:18,161979,1264,like,0
4,2021-12-27 21:29:20,161979,1352,view,0


Unnamed: 0,user_id,timestamp,target,gender,age,country,city,exp_group,os,source,text,topic
71213,160438,2021-11-25 17:17:56,0,1,23,Russia,Kaluga,3,iOS,organic,Irish company hit by Iraqi report\n\nShares in...,business
381916,104477,2021-10-21 07:04:56,0,0,23,Russia,Bol’shaya Yelkhovka,2,iOS,ads,this is the 4th movie in the Karate Kid series...,movie
423558,161074,2021-11-16 15:14:57,0,1,29,Russia,Kirishi,2,Android,organic,"The whole town of Blackstone is afraid, becaus...",movie
62340,113212,2021-11-26 11:27:00,0,1,18,Russia,Kazan,1,Android,organic,"I grew up watching, and loving this cartoon ev...",movie
466131,19018,2021-12-21 19:31:58,0,0,20,Russia,Perm,1,iOS,ads,The Movie is okay. Meaning that I dont regret ...,movie


## Model pipeline

In [6]:
# Target vector: the `target` column of the joined frame.
y = joined_data["target"].copy()

# Feature matrix, built in one chain from the joined frame:
#   * `user_id` and `target` are removed,
#   * month / year / hour are extracted from `timestamp`, which is then dropped,
#   * `gender` is cast to object dtype so that it is not picked up as a
#     numeric column by dtype-based selection.
X = (
    joined_data
    .drop(columns=["user_id", "target"])
    .assign(
        month=lambda frame: frame["timestamp"].dt.month,
        year=lambda frame: frame["timestamp"].dt.year,
        hour=lambda frame: frame["timestamp"].dt.hour,
        gender=lambda frame: frame["gender"].astype("O"),
    )
    .drop(columns="timestamp")
)

In [7]:
# Categorical columns with more distinct values than this are mean-target
# encoded; the rest are one-hot encoded.
MAX_OHE_CARDINALITY = 5

# Every column with a numeric dtype is standard-scaled.
numeric_columns = list(X.select_dtypes("number").columns)
categorical_columns = [
    'gender', 'country', 'city', 'os', 'source', 'topic'
]
cat_nunique = X[categorical_columns].nunique()
MTE_columns = cat_nunique.index[cat_nunique > MAX_OHE_CARDINALITY].to_list()
# Preserve the order of `categorical_columns`. A set difference would give an
# order that can change between kernel runs (string hash randomization),
# which would reorder the encoded features from run to run.
OHE_columns = [col for col in categorical_columns if col not in MTE_columns]
# A scalar string (not a list): ColumnTransformer then hands the column to the
# transformer as 1-D data, which is what a text vectorizer expects.
text_columns = 'text'

In [8]:
# Explicit category lists for the one-hot encoder: one list of the observed
# unique values per column, in the same order as `OHE_columns`.
ohe_categories = [list(X[col].unique()) for col in OHE_columns]

# One (name, transformer, columns) triple per feature group.
scaler_step = ("stand_scaler", StandardScaler(), numeric_columns)
one_hot_step = (
    "one_hot_encod",
    OneHotEncoder(categories = ohe_categories),
    OHE_columns
)
target_enc_step = (
    "mean_target_encoder",
    TargetEncoder(min_samples_leaf = 0.5, smoothing = 0.5),
    MTE_columns
)
# Only the 10 highest-ranked TF-IDF terms are kept as text features.
tfidf_step = ("tf_idf_vector", TfidfVectorizer(max_features = 10), text_columns)

data_transformer = ColumnTransformer([
    scaler_step,
    one_hot_step,
    target_enc_step,
    tfidf_step
])

In [None]:
# Full model: feature preprocessing followed by gradient boosting.
# The step names are used as prefixes for hyper-parameter names
# (e.g. "model__learning_rate"), so they must stay as they are.
# `random_state` is fixed so that repeated fits on the same data give the same
# model; the value matches the seed used for sampling the data.
pipeline = Pipeline([
    ("transfmer", data_transformer),
    ("model", GradientBoostingClassifier(random_state = 10))
])

## Model selection

In [22]:
# Fit the pipeline on the whole sample. `fit` returns the pipeline itself,
# so `model` and `pipeline` refer to the same fitted object.
model = pipeline.fit(X, y)
file_name = "model.pck"
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

In [25]:
# Class probabilities for the same data the model was fitted on (in-sample),
# so they are not a held-out estimate of model quality.
pred = model.predict_proba(X)

In [29]:
# Wider grid kept for reference; it is disabled in favour of the small grid
# below.
# param_grid = {
#     "model__learning_rate" : [0.05, 0.1, 0.2],
#     "model__max_depth" : np.arange(4, 20, 5),
#     "model__n_estimators" : [10, 20, 30]
# }
param_grid = {
    "model__learning_rate" : [0.05, 0.1]
}

# Cross-validated search over the grid, scored by ROC AUC. Train scores are
# kept as well so that over-fitting can be inspected in `cv_results_`.
grid_search_result = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    verbose = 2,
    scoring = "roc_auc",
    return_train_score = True
).fit(X, y)

# Persist the search result so that it can be reloaded without re-running the
# search; the context manager guarantees the file is closed.
with open("gs_results.pck", "wb") as gs_file:
    pickle.dump(grid_search_result, gs_file)

In [37]:
# Reload the stored grid-search result instead of re-running the search.
# NOTE: unpickling executes arbitrary code - only load files produced by this
# notebook, never files from an untrusted source.
with open("gs_results.pck", "rb") as gs_file:
    grid_search_result = pickle.load(gs_file)

In [14]:
# Refit the pipeline on the whole sample with the best hyper-parameters found
# by the grid search. Requires `grid_search_result` to exist, i.e. the search
# cell or the reload cell must have been run first.
# `set_params` updates `pipeline` in place and returns it, so `best_model` is
# the same object as `pipeline`.
best_model = pipeline.set_params(
    **grid_search_result.best_params_
).fit(X, y)

file_name = "model.pck"
# Overwrites the previously saved model; the context manager closes the file.
with open(file_name, "wb") as model_file:
    pickle.dump(best_model, model_file)

NameError: name 'grid_search_result' is not defined

## Data for checker

The checker needs the features that I generated for the model, so I save the precomputed feature table to the checker database.

In [23]:
# Feature table for the checker: the model features plus the `user_id` column
# from the joined frame (the column is aligned on the frames' shared index).
ready_features = X.assign(user_id = joined_data["user_id"])

# Replace the table if it already exists. The call is the last expression of
# the cell, so its return value is shown as the cell output.
ready_features.to_sql(
    name = "fedor_kobak_features_lesson_22",
    con = engine,
    if_exists = "replace"
)

1000

In [30]:
# Full user table (no LIMIT); the result is displayed as the cell output.
# The query is a constant, so a plain string literal is used rather than an
# f-string without placeholders.
pd.read_sql(
    "SELECT * FROM public.user_data;",
    con = engine
)

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic
163201,168549,0,18,Russia,Tula,2,Android,organic
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic
163203,168551,0,38,Russia,Moscow,3,iOS,organic


In [34]:
# Full post table (no LIMIT), indexed by `post_id`; the result is displayed as
# the cell output. The query is a constant, so a plain string literal is used
# rather than an f-string without placeholders.
pd.read_sql(
    "SELECT * FROM public.post_text_df;",
    con = engine,
    index_col= "post_id"
)

Unnamed: 0_level_0,text,topic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,UK economy facing major risks\n\nThe UK manufa...,business
2,Aids and climate top Davos agenda\n\nClimate c...,business
3,Asian quake hits European shares\n\nShares in ...,business
4,India power shares jump on debut\n\nShares in ...,business
5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...
7315,"OK, I would not normally watch a Farrelly brot...",movie
7316,I give this movie 2 stars purely because of it...,movie
7317,I cant believe this film was allowed to be mad...,movie
7318,The version I saw of this film was the Blockbu...,movie
