In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, f1_score, roc_auc_score

from heapq import nlargest
import random

In [4]:
engine = create_engine(
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
    "postgres.lab.karpov.courses:6432/startml"
)

with engine.connect() as conn:
    user_df = pd.read_sql(
        sql="SELECT * FROM public.user_data",
        con=conn.connection
    )

with engine.connect() as conn:
    post_df = pd.read_sql(
        sql="SELECT * FROM public.post_text_df",
        con=conn.connection
    )

with engine.connect() as conn:
    feed_df = pd.read_sql(
        sql="SELECT * FROM public.feed_data LIMIT 1000000",
        con=conn.connection
    )



In [5]:
user_df.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [6]:
cols_for_ohe = ['gender', 'country', 'exp_group', 'os','source']
X = user_df[cols_for_ohe]

In [7]:
OHE = OneHotEncoder(drop='first')
X_trans = OHE.fit_transform(X).toarray()

col_names = OHE.get_feature_names_out()
OHE_user = pd.DataFrame(X_trans, columns = col_names)

OHE_user = OHE_user.astype(int)
user_df = pd.concat([user_df['user_id'], user_df['age'], OHE_user], axis=1)

user_df.head()

Unnamed: 0,user_id,age,gender_1,country_Belarus,country_Cyprus,country_Estonia,country_Finland,country_Kazakhstan,country_Latvia,country_Russia,country_Switzerland,country_Turkey,country_Ukraine,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic
0,200,34,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
1,201,37,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,202,17,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,203,18,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
4,204,36,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [8]:
post_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [9]:
len_text = post_df['text'].apply(lambda x: len(x))

post_df['text_len'] = len_text

post_df.head()

Unnamed: 0,post_id,text,topic,text_len
0,1,UK economy facing major risks\n\nThe UK manufa...,business,1967
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,2701
2,3,Asian quake hits European shares\n\nShares in ...,business,3408
3,4,India power shares jump on debut\n\nShares in ...,business,1026
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,889


In [10]:
def tfidf_vectorization(dataset: pd.DataFrame, to_vector: str):
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(dataset[to_vector])

    max_tfidf_values = tfidf_matrix.max(axis=1).toarray().flatten()
    avg_tfidf_values = np.ravel(tfidf_matrix.mean(axis=1).flatten())

    dataset['max_tfidf'] = max_tfidf_values
    # dataset['avg_tfidf'] = avg_tfidf_values

    return dataset

tmp = tfidf_vectorization(post_df, to_vector = 'text')
post_df = tmp.drop('text', axis=1).copy()

post_df.head()

Unnamed: 0,post_id,topic,text_len,max_tfidf
0,1,business,1967,0.439495
1,2,business,2701,0.290946
2,3,business,3408,0.279045
3,4,business,1026,0.525321
4,5,business,889,0.409826


In [11]:
feed_df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-12-03 19:05:08,150004,3460,view,0
1,2021-12-03 19:07:31,150004,6465,view,0
2,2021-12-03 19:09:46,150004,5525,view,0
3,2021-12-03 19:10:52,150004,1322,view,0
4,2021-12-03 19:12:22,150004,1444,view,0


In [None]:
# feed_df = feed_df[feed_df['action'] == 'view']

# feed_df = feed_df.sort_values('timestamp').reset_index(drop=True)

# feed_df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-10-01 06:01:40,1859,1498,view,1
1,2021-10-01 06:02:01,1859,6881,view,0
2,2021-10-01 06:04:29,1859,4873,view,0
3,2021-10-01 06:05:36,1859,1846,view,0
4,2021-10-01 06:05:49,1859,3688,view,0


In [12]:
df = feed_df.merge(user_df, how='left').merge(post_df, how='left')

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,age,gender_1,country_Belarus,country_Cyprus,country_Estonia,...,country_Ukraine,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,topic,text_len,max_tfidf
0,2021-12-03 19:05:08,150004,3460,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,covid,140,0.441784
1,2021-12-03 19:07:31,150004,6465,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,movie,851,0.221904
2,2021-12-03 19:09:46,150004,5525,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,movie,278,0.262718
3,2021-12-03 19:10:52,150004,1322,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,politics,1822,0.511438
4,2021-12-03 19:12:22,150004,1444,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,sport,1522,0.369093


In [13]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,age,gender_1,country_Belarus,country_Cyprus,country_Estonia,...,country_Ukraine,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,topic,text_len,max_tfidf
0,2021-12-03 19:05:08,150004,3460,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,covid,140,0.441784
1,2021-12-03 19:07:31,150004,6465,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,movie,851,0.221904
2,2021-12-03 19:09:46,150004,5525,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,movie,278,0.262718
3,2021-12-03 19:10:52,150004,1322,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,politics,1822,0.511438
4,2021-12-03 19:12:22,150004,1444,view,0,40,1,0,0,0,...,0,0,1,0,0,0,1,sport,1522,0.369093


In [14]:
def parse_time(time, type_):
    if type_ == 'month':
      return time.month
    if type_ == 'day':
      return time.day
    if type_ == 'hour':
      return time.hour
    if type_ == 'minute':
      return time.minute

def parse_date_time(data):
    data['month'] = data['timestamp'].apply(parse_time, type_='month')
    data['day'] = data['timestamp'].apply(parse_time, type_='day')
    data['hour'] = data['timestamp'].apply(parse_time, type_='hour')
    data['minute'] = data['timestamp'].apply(parse_time, type_='minute')

    data['month'] = data['month'].astype (object)
    data['day'] = data['day'].astype (object)
    data['hour'] = data['hour'].astype (object)
    data['minute'] = data['minute'].astype (object)
    return data

In [15]:
df = parse_date_time(df)

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,age,gender_1,country_Belarus,country_Cyprus,country_Estonia,...,exp_group_4,os_iOS,source_organic,topic,text_len,max_tfidf,month,day,hour,minute
0,2021-12-03 19:05:08,150004,3460,view,0,40,1,0,0,0,...,0,0,1,covid,140,0.441784,12,3,19,5
1,2021-12-03 19:07:31,150004,6465,view,0,40,1,0,0,0,...,0,0,1,movie,851,0.221904,12,3,19,7
2,2021-12-03 19:09:46,150004,5525,view,0,40,1,0,0,0,...,0,0,1,movie,278,0.262718,12,3,19,9
3,2021-12-03 19:10:52,150004,1322,view,0,40,1,0,0,0,...,0,0,1,politics,1822,0.511438,12,3,19,10
4,2021-12-03 19:12:22,150004,1444,view,0,40,1,0,0,0,...,0,0,1,sport,1522,0.369093,12,3,19,12


In [51]:
df = df.drop('timestamp', axis=1)

In [18]:
df = df.drop('action', axis=1)

In [19]:
X_train = df[:700000]
X_test = df[700000:]

y_train = X_train['target']
X_train = X_train.drop('target', axis=1)

y_test = X_test['target']
X_test = X_test.drop('target', axis=1)


print(X_train.shape, X_test.shape)

(700000, 28) (300000, 28)


In [20]:
cat_features = ['topic']
catboost_pool = Pool(X_train, y_train, cat_features=cat_features)


task_type="GPU",
model_test = CatBoostClassifier(iterations=500,
                                    learning_rate=0.02,
                                    task_type="GPU",
                                    cat_features=cat_features,
                                    verbose=50)



model_test.fit(catboost_pool)

0:	learn: 0.6761121	total: 22.4ms	remaining: 11.2s
50:	learn: 0.3646749	total: 1.17s	remaining: 10.3s
100:	learn: 0.3338279	total: 2.31s	remaining: 9.11s
150:	learn: 0.3296708	total: 3.47s	remaining: 8.01s
200:	learn: 0.3286579	total: 4.59s	remaining: 6.83s
250:	learn: 0.3281633	total: 5.71s	remaining: 5.67s
300:	learn: 0.3277057	total: 6.82s	remaining: 4.51s
350:	learn: 0.3273081	total: 7.95s	remaining: 3.37s
400:	learn: 0.3269901	total: 9.09s	remaining: 2.24s
450:	learn: 0.3267385	total: 10.2s	remaining: 1.11s
499:	learn: 0.3265201	total: 11.3s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1b14739fac0>

In [21]:
roc_auc_score(y_test, model_test.predict_proba(X_test)[:, 1])

0.6130867069099397

In [22]:
model_test.score(X_test, y_test)

0.89197

In [23]:
model_test.save_model('catboost_model',
                           format="cbm")

from_file = CatBoostClassifier()  # здесь не указываем параметры, которые были при обучении, в дампе модели все есть

from_file.load_model('catboost_model')

from_file.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [87]:
col_to_drop = pd.DataFrame({'feature_importance': model_test.get_feature_importance(),
              'feature_names': X_train.columns}).sort_values(by=['feature_importance'],
                                                           ascending=False)['feature_names'][10:].values

col_to_drop

array(['text_len', 'exp_group_3', 'minute', 'exp_group_4',
       'country_Ukraine', 'country_Turkey', 'day', 'country_Belarus',
       'os_iOS', 'max_tfidf', 'country_Kazakhstan', 'country_Finland',
       'source_organic', 'country_Cyprus', 'country_Latvia',
       'country_Switzerland', 'country_Estonia'], dtype=object)

In [88]:
df.drop(col_to_drop, axis=1, inplace=True)

In [89]:
df.head()

Unnamed: 0,user_id,post_id,target,age,gender_1,country_Russia,exp_group_1,exp_group_2,topic,month,hour
0,41758,1700,0,23,0,1,1,0,sport,10,6
1,58301,1189,0,22,0,1,0,0,politics,10,6
2,20793,174,0,15,1,1,0,1,business,10,6
3,41758,982,0,23,0,1,1,0,politics,10,6
4,41758,3699,0,23,0,1,1,0,covid,10,6


In [24]:
X_train = df[:700000]
X_test = df[700000:]

y_train = X_train['target']
X_train = X_train.drop('target', axis=1)

y_test = X_test['target']
X_test = X_test.drop('target', axis=1)

In [94]:
cat_features = ['topic']
catboost_pool = Pool(X_train, y_train, cat_features=cat_features)


task_type="GPU",
model_test = CatBoostClassifier(iterations=500,
                                    learning_rate=0.02,
                                    task_type="GPU",
                                    cat_features=cat_features,
                                    verbose=50)



model_test.fit(catboost_pool)

0:	learn: 0.6770162	total: 23.6ms	remaining: 11.8s
50:	learn: 0.3834851	total: 1.04s	remaining: 9.19s
100:	learn: 0.3532639	total: 2.12s	remaining: 8.39s
150:	learn: 0.3488448	total: 3.24s	remaining: 7.48s
200:	learn: 0.3477984	total: 4.35s	remaining: 6.47s
250:	learn: 0.3471750	total: 5.48s	remaining: 5.43s
300:	learn: 0.3466544	total: 6.55s	remaining: 4.33s
350:	learn: 0.3463033	total: 7.65s	remaining: 3.25s
400:	learn: 0.3460380	total: 8.74s	remaining: 2.16s
450:	learn: 0.3457941	total: 9.8s	remaining: 1.06s
499:	learn: 0.3455942	total: 10.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1cea953cc40>

In [96]:
roc_auc_score(y_test, model_test.predict_proba(X_test)[:, 1])

0.6162550489798376

In [28]:
df_feat_user = df[df['user_id'] == 150004]

df_feat_user_pred = df_feat_user.drop(['user_id', 'post_id'], axis=1)
df_feat_user['predict'] = from_file.predict_proba(df_feat_user_pred)[:, 1]

df_feat_user = df_feat_user.sort_values('predict', ascending=False)[:5]
rec_posts = df_feat_user['post_id'].to_list()

result_rec_df = post_df[post_df['post_id'].isin(rec_posts)].rename(columns={'post_id': 'id'})
result = result_rec_df.to_dict(orient='records')
result

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=20]="covid": Cannot convert 'b'covid'' to float

In [27]:
df[df['user_id'] == 150004]

Unnamed: 0,timestamp,user_id,post_id,target,age,gender_1,country_Belarus,country_Cyprus,country_Estonia,country_Finland,...,exp_group_4,os_iOS,source_organic,topic,text_len,max_tfidf,month,day,hour,minute
0,2021-12-03 19:05:08,150004,3460,0,40,1,0,0,0,0,...,0,0,1,covid,140,0.441784,12,3,19,5
1,2021-12-03 19:07:31,150004,6465,0,40,1,0,0,0,0,...,0,0,1,movie,851,0.221904,12,3,19,7
2,2021-12-03 19:09:46,150004,5525,0,40,1,0,0,0,0,...,0,0,1,movie,278,0.262718,12,3,19,9
3,2021-12-03 19:10:52,150004,1322,0,40,1,0,0,0,0,...,0,0,1,politics,1822,0.511438,12,3,19,10
4,2021-12-03 19:12:22,150004,1444,0,40,1,0,0,0,0,...,0,0,1,sport,1522,0.369093,12,3,19,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877,2021-12-28 10:54:29,150004,1720,0,40,1,0,0,0,0,...,0,0,1,sport,2296,0.200785,12,28,10,54
1878,2021-12-28 10:55:02,150004,4114,0,40,1,0,0,0,0,...,0,0,1,covid,140,0.394347,12,28,10,55
1879,2021-12-28 10:55:38,150004,3002,0,40,1,0,0,0,0,...,0,0,1,covid,138,0.442798,12,28,10,55
1880,2021-12-28 10:57:42,150004,1603,0,40,1,0,0,0,0,...,0,0,1,sport,1755,0.529655,12,28,10,57
