# Model

In [1]:
import numpy as np
import pandas as pd
from database import engine

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV
)

from IPython.display import HTML

## Loading data

- We only load records that belong to the `view` category, because every like starts with view, and we have a `target` column that marks those views that lead to likes.

In [2]:
records_count = 1000

user_data = pd.read_sql(
    f"SELECT * FROM public.user_data LIMIT {records_count};",
    con = engine,
    index_col = "user_id"
)
post_data = pd.read_sql(
    f"SELECT * FROM public.post_text_df LIMIT {records_count};",
    con = engine,
    index_col = "post_id"
)
feed_data = pd.read_sql(
    f"SELECT * FROM public.feed_data limit {records_count};",
    con = engine
)

# joined data is where each user 
# is matched with all his actions
query = f"""
SELECT 
    public.feed_data.timestamp,
    public.feed_data.target,
    public.user_data.gender,
    public.user_data.age,
    public.user_data.country,
    public.user_data.city,
    public.user_data.exp_group,
    public.user_data.os,
    public.user_data.source,
    public.post_text_df.text,
    public.post_text_df.topic
FROM public.feed_data
LEFT JOIN public.user_data
    ON public.feed_data.user_id = public.user_data.user_id
LEFT JOIN public.post_text_df
    ON public.feed_data.post_id = public.post_text_df.post_id
WHERE
    public.feed_data.action='view'
LIMIT {records_count};
"""
joined_data = pd.read_sql(query, con = engine)

In [3]:
df_show = {
    "Users data" : user_data,
    "Post data" : post_data,
    "Feed data" : feed_data,
    "Joined data" : joined_data
}

for title, df in df_show.items():
    display(HTML(f"<h3>{title}</h3>"))
    display(df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,text,topic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,UK economy facing major risks\n\nThe UK manufa...,business
2,Aids and climate top Davos agenda\n\nClimate c...,business
3,Asian quake hits European shares\n\nShares in ...,business
4,India power shares jump on debut\n\nShares in ...,business
5,Lacroix label bought by US firm\n\nLuxury good...,business


Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-12-01 14:55:55,1636,1973,view,0
1,2021-12-01 14:57:53,1636,4665,view,0
2,2021-12-01 14:58:35,1636,3804,view,0
3,2021-10-09 15:56:36,99995,6585,view,0
4,2021-10-09 15:57:23,99995,5547,view,0


Unnamed: 0,timestamp,target,gender,age,country,city,exp_group,os,source,text,topic
0,2021-12-01 14:55:55,0,1,15,Russia,Moscow,0,Android,ads,Latest Opera browser gets vocal\n\nNet browser...,tech
1,2021-12-01 14:57:53,0,1,15,Russia,Moscow,0,Android,ads,This one is a very solid Randolph Scott Wester...,movie
2,2021-12-01 14:58:35,0,1,15,Russia,Moscow,0,Android,ads,"Some people are drug peddlers, some people sim...",covid
3,2021-10-09 15:56:36,0,0,59,Russia,Sochi,3,Android,ads,I went to go see this at the Esquire Theatre i...,movie
4,2021-10-09 15:57:23,0,0,59,Russia,Sochi,3,Android,ads,Siskel & Ebert were terrific on this show whet...,movie


## Transformation

In [4]:
X = joined_data.copy()

y = X["target"]
X.drop("target", axis = 1, inplace = True)

X["month"] = X["timestamp"].dt.month
X["year"] = X["timestamp"].dt.year
X["hour"] = X["timestamp"].dt.hour
X.drop("timestamp", axis = 1, inplace = True)

X['gender'] = X['gender'].astype("O")

In [5]:
numeric_columns = list(X.select_dtypes("number").columns)
categorical_columns = [
    'gender', 'country', 'city', 'os', 'source', 'topic'
]
text_columns = 'text'

In [6]:
data_transformer = ColumnTransformer([
    ("stand_scaler", StandardScaler(), numeric_columns),
    (
        "one_hot_encod", 
        OneHotEncoder(
            categories = list(X[categorical_columns].apply(lambda col: list(col.unique())))
        ), 
        categorical_columns
    ),
    ("tf_idf_vector", TfidfVectorizer(max_features = 10), text_columns)
])

## Model pipeline

In [7]:
pipeline = Pipeline([
    ("transfmer", data_transformer),
    ("model", GradientBoostingClassifier())
])

In [8]:
param_grid = {
    "model__learning_rate" : [0.05, 0.1, 0.2],
    "model__max_depth" : np.arange(4, 20, 3)
}

grid_search_result = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    verbose = 2
).fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ......model__learning_rate=0.05, model__max_depth=4; total time=   0.4s
[CV] END ......model__learning_rate=0.05, model__max_depth=4; total time=   0.4s
[CV] END ......model__learning_rate=0.05, model__max_depth=4; total time=   0.4s
[CV] END ......model__learning_rate=0.05, model__max_depth=4; total time=   0.4s
[CV] END ......model__learning_rate=0.05, model__max_depth=4; total time=   0.4s
[CV] END ......model__learning_rate=0.05, model__max_depth=7; total time=   0.6s
[CV] END ......model__learning_rate=0.05, model__max_depth=7; total time=   0.6s
[CV] END ......model__learning_rate=0.05, model__max_depth=7; total time=   0.6s
[CV] END ......model__learning_rate=0.05, model__max_depth=7; total time=   0.6s
[CV] END ......model__learning_rate=0.05, model__max_depth=7; total time=   0.6s
[CV] END .....model__learning_rate=0.05, model__max_depth=10; total time=   1.0s
[CV] END .....model__learning_rate=0.05, model__

In [14]:
roc_auc_score(
    y,
    grid_search_result.best_estimator_.predict_proba(X)[:,1]
)

0.9997522584808259

In [21]:
(
    grid_search_result.best_estimator_.predict(X) == y.to_numpy()
)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [15]:
grid_search_result.best_score_

0.5890000000000001