# Model

In [1]:
import pandas as pd
from database import engine

from IPython.display import HTML

## Loading data

In [25]:
# Denormalised query: every feed event is LEFT JOINed with the attributes of
# the user who produced it and with the text/topic of the post it refers to.
query = """
SELECT 
    public.feed_data.timestamp,
    public.feed_data.user_id,
    public.feed_data.post_id,
    public.feed_data.action,
    public.feed_data.target,
    public.user_data.gender,
    public.user_data.age,
    public.user_data.country,
    public.user_data.city,
    public.user_data.exp_group,
    public.user_data.os,
    public.user_data.source,
    public.post_text_df.text,
    public.post_text_df.topic
FROM public.feed_data
LEFT JOIN public.user_data
    ON public.feed_data.user_id = public.user_data.user_id
LEFT JOIN public.post_text_df
    ON public.feed_data.post_id = public.post_text_df.post_id
LIMIT 10000;
"""

# Raw tables; each statement carries its own 10000-row LIMIT.
# Users and posts are indexed by their id columns.
user_data = pd.read_sql(
    sql="SELECT * FROM public.user_data LIMIT 10000;",
    con=engine,
    index_col="user_id",
)
post_data = pd.read_sql(
    sql="SELECT * FROM public.post_text_df LIMIT 10000;",
    con=engine,
    index_col="post_id",
)
# Feed events are read without index_col.
feed_data = pd.read_sql(sql="SELECT * FROM public.feed_data limit 10000;", con=engine)

# Enriched feed events. This is a separate statement with its own LIMIT, so
# its rows are not derived from the three frames loaded above.
joined_data = pd.read_sql(sql=query, con=engine)

In [32]:
# Heading text -> frame, in the order the previews should be rendered.
df_show = {
    "Users data": user_data,
    "Post data": post_data,
    "Feed data": feed_data,
    "Joined data": joined_data,
}

# For each dataset, render an <h3> heading followed by its first five rows.
for title in df_show:
    df = df_show[title]
    display(HTML(f"<h3>{title}</h3>"), df.head())

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads
201,0,37,Russia,Abakan,0,Android,ads
202,1,17,Russia,Smolensk,4,Android,ads
203,0,18,Russia,Moscow,1,iOS,ads
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


Unnamed: 0_level_0,text,topic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,UK economy facing major risks\n\nThe UK manufa...,business
2,Aids and climate top Davos agenda\n\nClimate c...,business
3,Asian quake hits European shares\n\nShares in ...,business
4,India power shares jump on debut\n\nShares in ...,business
5,Lacroix label bought by US firm\n\nLuxury good...,business


Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-10-08 07:22:16,7776,1851,view,0
1,2021-10-08 07:24:28,7776,1688,view,0
2,2021-10-08 07:27:26,7776,4180,view,0
3,2021-10-08 07:27:43,7776,1471,view,0
4,2021-10-08 07:29:23,7776,6040,view,0


Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic
0,2021-12-27 13:38:59,115963,1300,view,0,0,34,Ukraine,Alchevsk,3,iOS,organic,Brown hits back in Blair rift row\n\nGordon Br...,politics
1,2021-12-27 13:40:06,115963,1960,view,0,0,34,Ukraine,Alchevsk,3,iOS,organic,Apple sues to stop product leaks\n\nComputer f...,tech
2,2021-12-27 13:40:38,115963,1901,view,0,0,34,Ukraine,Alchevsk,3,iOS,organic,Slovakia seal Hopman Cup success\n\nSlovakia c...,sport
3,2021-12-27 13:43:30,115963,1457,view,0,0,34,Ukraine,Alchevsk,3,iOS,organic,Navratilova hits out at critics\n\nMartina Nav...,sport
4,2021-12-27 13:45:58,115963,1662,view,0,0,34,Ukraine,Alchevsk,3,iOS,organic,Desailly backs Blues revenge trip\n\nMarcel De...,sport


## EDA

Action types.

In [4]:
# Number of feed records per action type, shown as a one-column frame
# whose column is labelled "number".
feed_data["action"].value_counts().to_frame(name="number")

Unnamed: 0,number
view,9133
like,867


Let's check whether a "like" record replaces the "view" record for the same user–post pair. The results show that it does not: every pair with a "like" also has a "view" record. So I need to drop the "view" record for any user–post pair that also has a "like" record.

In [5]:
def _action_flags(actions):
    """Return boolean flags telling whether `actions` contains "like" / "view"."""
    return pd.Series({
        "like": actions.eq("like").any(),
        "view": actions.eq("view").any(),
    })


# One row per (user_id, post_id) pair with two boolean columns, "like" and
# "view", saying whether the pair has at least one record of that action.
view_like_combs = (
    feed_data
    .groupby(["user_id", "post_id"])["action"]
    .apply(_action_flags)
    .unstack()
)

# Contingency table of the two flags across all user-post pairs.
pd.crosstab(view_like_combs["like"], view_like_combs["view"])

view,True
like,Unnamed: 1_level_1
False,7932
True,862
