In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
user_df = pd.read_csv('new_user_data.csv')
post_df = pd.read_csv('new_post_text_df.csv')
feed_df = pd.read_csv('new_feed_data.csv')

In [3]:
user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult
...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult


In [4]:
post_df

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
0,1,business,0.005147,0.194684,0.026514,-0.073551,-0.149517,-0.025243,0.048232,-0.171348,...,0.002770,0.015150,-0.037546,0.014755,-0.030483,0.013588,0.020241,-0.023167,0.032114,0.021335
1,2,business,-0.000803,0.218085,0.067561,0.077333,-0.054572,-0.002843,0.005914,-0.026982,...,0.007674,-0.082471,0.014627,0.017087,0.022628,-0.034949,-0.038008,0.026565,0.010053,-0.009858
2,3,business,-0.005729,0.163478,0.016924,-0.098530,-0.153900,-0.024024,0.039049,-0.133561,...,-0.038628,0.027905,-0.016978,0.010790,0.050866,0.010315,-0.017937,-0.017863,-0.000048,0.023703
3,4,business,0.010938,0.168339,0.025062,-0.063091,-0.153456,-0.016477,0.054518,-0.080653,...,-0.023014,0.098884,0.018685,-0.017327,-0.031135,-0.045588,-0.033993,-0.009839,0.036338,0.011815
4,5,business,0.000350,0.122627,0.010034,-0.040646,-0.059208,-0.006179,-0.003250,-0.012157,...,0.010623,0.015857,0.023957,0.025522,-0.014381,0.012647,0.000772,-0.014178,0.012667,-0.005298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,movie,-0.164127,-0.151603,0.168022,-0.013667,0.019660,-0.006831,0.012408,0.029409,...,-0.024458,0.042370,-0.075578,-0.026011,0.026100,-0.024041,0.041855,-0.008036,0.016779,-0.000262
7019,7316,movie,-0.136358,-0.129865,0.106072,-0.004081,-0.055466,-0.004484,-0.006954,-0.005667,...,-0.027759,0.015210,-0.017865,0.009113,-0.002165,0.023063,-0.011472,0.035578,0.029828,0.028756
7020,7317,movie,-0.102380,-0.070634,-0.168157,0.079847,-0.034367,-0.001904,-0.012952,0.018657,...,-0.061015,0.022039,0.042510,-0.002182,-0.045331,-0.026317,0.020485,-0.027644,0.008355,0.059747
7021,7318,movie,-0.112783,-0.035907,-0.101419,0.058777,0.020218,-0.005854,0.033726,0.004224,...,-0.024241,-0.014465,-0.029016,0.009381,-0.003801,-0.025043,0.042122,0.061133,-0.022629,-0.024155


In [5]:
feed_df

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day
0,2021-12-09 13:36:48,816,3583,0,12,9,48,3,0,Afternoon
1,2021-12-09 13:36:50,816,477,0,12,9,50,3,0,Afternoon
2,2021-12-12 21:17:32,816,6927,0,12,12,32,6,1,Evening
3,2021-12-12 21:20:05,816,3455,0,12,12,5,6,1,Evening
4,2021-12-12 21:22:35,816,964,0,12,12,35,6,1,Evening
...,...,...,...,...,...,...,...,...,...,...
999995,2021-10-28 18:54:43,52191,1075,0,10,28,43,3,0,Evening
999996,2021-10-28 18:56:54,52191,1077,0,10,28,54,3,0,Evening
999997,2021-10-28 18:59:07,52191,1725,1,10,28,7,3,0,Evening
999998,2021-10-28 19:01:24,52191,1725,0,10,28,24,3,0,Evening



CREATING CLUSTERS FOR USER_ID WITH K-MEANS

In [6]:
X = user_df.drop('user_id', axis=1)

In [7]:
cat_cols = ['gender', 'country', 'exp_group', 'os', 'source', 'category_of_age']
cat_cols

['gender', 'country', 'exp_group', 'os', 'source', 'category_of_age']

In [8]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [9]:
city_frequencies = X['city'].value_counts(normalize=True)

city_frequencies

Moscow              0.134028
Saint Petersburg    0.042303
Novosibirsk         0.012175
Yekaterinburg       0.011789
Nizhniy Novgorod    0.010000
                      ...   
Hoshcha             0.000006
Novyy Buyan         0.000006
Tbilisskaya         0.000006
Levaya Rossosh’     0.000006
Kemer               0.000006
Name: city, Length: 3915, dtype: float64

In [10]:
X['city'] = X['city'].map(city_frequencies)

In [11]:
model = KMeans(n_clusters=50).fit(X)



In [12]:
clusters_df = pd.DataFrame(model.labels_, columns=['cluster_feature'])

clusters_df

Unnamed: 0,cluster_feature
0,43
1,3
2,41
3,11
4,48
...,...
163200,11
163201,18
163202,43
163203,37


In [13]:
user_df = pd.concat([user_df, clusters_df], axis=1)

user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,43
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,3
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,41
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,11
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,48
...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,11
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,18
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,43
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,37


MERGE DATAFRAMES

In [15]:
df = pd.merge(feed_df, post_df, on='post_id', how='left')

df

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
0,2021-12-09 13:36:48,816,3583,0,12,9,48,3,0,Afternoon,...,0.028606,0.062033,-0.011754,0.003764,-0.047687,-0.022450,0.019177,0.028945,0.018407,-0.040994
1,2021-12-09 13:36:50,816,477,0,12,9,50,3,0,Afternoon,...,-0.001549,0.061922,0.004632,-0.010501,-0.019731,0.019998,-0.005925,-0.058419,-0.019120,0.024145
2,2021-12-12 21:17:32,816,6927,0,12,12,32,6,1,Evening,...,-0.057207,0.017872,-0.008643,0.016590,-0.017416,-0.014586,0.045796,0.004668,0.012991,-0.018231
3,2021-12-12 21:20:05,816,3455,0,12,12,5,6,1,Evening,...,-0.006210,0.024692,-0.010493,-0.006134,0.000175,0.005935,0.012127,0.003405,-0.003616,-0.004326
4,2021-12-12 21:22:35,816,964,0,12,12,35,6,1,Evening,...,-0.043859,-0.001941,-0.013335,0.007351,-0.036754,-0.037814,-0.001082,0.003323,0.015498,-0.014616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2021-10-28 18:54:43,52191,1075,0,10,28,43,3,0,Evening,...,-0.023502,-0.038471,-0.011976,0.032740,-0.005511,0.021473,0.010941,-0.023156,-0.022425,0.028244
999996,2021-10-28 18:56:54,52191,1077,0,10,28,54,3,0,Evening,...,-0.093124,-0.109188,-0.020613,-0.017821,0.079145,0.078670,-0.035835,-0.026764,-0.044464,-0.026915
999997,2021-10-28 18:59:07,52191,1725,1,10,28,7,3,0,Evening,...,0.043594,-0.020829,0.015224,-0.022932,-0.026483,-0.018893,-0.024991,-0.002659,-0.024553,-0.020046
999998,2021-10-28 19:01:24,52191,1725,0,10,28,24,3,0,Evening,...,0.043594,-0.020829,0.015224,-0.022932,-0.026483,-0.018893,-0.024991,-0.002659,-0.024553,-0.020046


In [16]:
# Create Dataframe with mean of all features for each actions of users
df_to_merge = df.groupby('user_id').mean().drop(['target', 'post_id'], axis=1)

df_to_merge.head()

Unnamed: 0_level_0,month,day,second,weekday,is_weekend,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
816,12.0,18.760274,29.328767,4.082192,0.527397,-0.02143,0.019337,0.007187,0.019173,0.011933,...,-0.001047,-0.001362,-0.003027,-0.001193,0.001818,0.002744,0.002591,-0.001812,0.001164,-0.003858
817,11.125352,13.809859,30.267606,3.184507,0.242254,0.001084,0.019412,-0.005977,0.004509,0.011457,...,-0.000631,0.001039,0.000622,-2.5e-05,0.000229,0.001403,-0.000153,-0.002809,-0.001405,-0.000875
818,10.626582,19.582278,29.71519,3.582278,0.189873,0.015763,0.007445,0.000692,0.008721,0.00981,...,-0.000281,0.003912,0.001353,-0.000915,-0.003726,0.000638,-0.000788,0.001874,0.006334,-0.001002
819,10.923557,14.978159,29.564743,1.792512,0.048362,-0.000446,0.007173,-0.008729,-0.001163,0.007379,...,-0.002109,-0.000236,0.001353,-0.000963,0.000497,0.000687,0.001588,0.000206,-0.001234,0.001174
820,11.375,17.348214,30.232143,3.46875,0.415179,0.003444,0.04206,0.009671,0.004807,0.028665,...,-0.000273,-0.002272,0.000491,-0.003816,-0.004596,-0.002085,-0.003088,0.001216,-0.000941,-0.002418


In [17]:
# for categorical features in this dataframe we'll fill with the most popular values

def compute_mode(x):
    return x.mode().iloc[0]

additional_cat_features = df.groupby('user_id').agg({
    'post_id': compute_mode,
    'part_of_day': compute_mode,
    'topic': compute_mode
}).reset_index()
additional_cat_features

Unnamed: 0,user_id,post_id,part_of_day,topic
0,816,923,Evening,movie
1,817,796,Evening,movie
2,818,174,Morning,movie
3,819,3090,Morning,movie
4,820,1761,Morning,covid
...,...,...,...,...
2142,155069,1369,Afternoon,movie
2143,155070,1723,Morning,movie
2144,155071,1865,Morning,movie
2145,155072,3775,Evening,movie


In [18]:
df_to_merge = df_to_merge.merge(additional_cat_features, on='user_id', how='left')

df_to_merge

Unnamed: 0,user_id,month,day,second,weekday,is_weekend,feature_1,feature_2,feature_3,feature_4,...,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,post_id,part_of_day,topic
0,816,12.000000,18.760274,29.328767,4.082192,0.527397,-0.021430,0.019337,0.007187,0.019173,...,-0.001193,0.001818,0.002744,0.002591,-0.001812,0.001164,-0.003858,923,Evening,movie
1,817,11.125352,13.809859,30.267606,3.184507,0.242254,0.001084,0.019412,-0.005977,0.004509,...,-0.000025,0.000229,0.001403,-0.000153,-0.002809,-0.001405,-0.000875,796,Evening,movie
2,818,10.626582,19.582278,29.715190,3.582278,0.189873,0.015763,0.007445,0.000692,0.008721,...,-0.000915,-0.003726,0.000638,-0.000788,0.001874,0.006334,-0.001002,174,Morning,movie
3,819,10.923557,14.978159,29.564743,1.792512,0.048362,-0.000446,0.007173,-0.008729,-0.001163,...,-0.000963,0.000497,0.000687,0.001588,0.000206,-0.001234,0.001174,3090,Morning,movie
4,820,11.375000,17.348214,30.232143,3.468750,0.415179,0.003444,0.042060,0.009671,0.004807,...,-0.003816,-0.004596,-0.002085,-0.003088,0.001216,-0.000941,-0.002418,1761,Morning,covid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2142,155069,11.080000,14.426207,28.908966,3.542069,0.446897,0.005846,0.007955,0.007598,0.008812,...,-0.000742,-0.001202,-0.001360,-0.000073,-0.002543,0.000814,0.000669,1369,Afternoon,movie
2143,155070,11.047059,20.282353,28.017647,2.941176,0.341176,0.001788,0.009166,0.008694,0.002834,...,-0.000646,-0.001598,0.002009,-0.000206,-0.001317,0.002340,-0.005566,1723,Morning,movie
2144,155071,11.113721,11.734240,28.975278,3.195303,0.341162,0.000009,0.038241,0.011907,0.016539,...,0.003060,0.001031,-0.000454,0.000262,-0.001679,-0.000253,-0.002002,1865,Morning,movie
2145,155072,10.535019,16.754864,29.690661,3.031128,0.336576,0.002490,0.014675,0.008608,0.011936,...,0.000140,-0.001137,0.000793,-0.004527,-0.000750,-0.003252,-0.001754,3775,Evening,movie


In [19]:
features_df = user_df.merge(df_to_merge, on='user_id', how='left')

features_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,...,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,post_id,part_of_day,topic
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,...,,,,,,,,,,
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,...,,,,,,,,,,
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,...,,,,,,,,,,
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,...,,,,,,,,,,
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,...,,,,,,,,,,
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,...,,,,,,,,,,
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,...,,,,,,,,,,
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,...,,,,,,,,,,


In [20]:
train_df = pd.merge(df, user_df, on='user_id', how='left')

train_df

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day,...,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature
0,2021-12-09 13:36:48,816,3583,0,12,9,48,3,0,Afternoon,...,0,19,Russia,Perm,3,Android,ads,623,young,1
1,2021-12-09 13:36:50,816,477,0,12,9,50,3,0,Afternoon,...,0,19,Russia,Perm,3,Android,ads,623,young,1
2,2021-12-12 21:17:32,816,6927,0,12,12,32,6,1,Evening,...,0,19,Russia,Perm,3,Android,ads,623,young,1
3,2021-12-12 21:20:05,816,3455,0,12,12,5,6,1,Evening,...,0,19,Russia,Perm,3,Android,ads,623,young,1
4,2021-12-12 21:22:35,816,964,0,12,12,35,6,1,Evening,...,0,19,Russia,Perm,3,Android,ads,623,young,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2021-10-28 18:54:43,52191,1075,0,10,28,43,3,0,Evening,...,0,15,Russia,Volgograd,1,Android,ads,422,young,2
999996,2021-10-28 18:56:54,52191,1077,0,10,28,54,3,0,Evening,...,0,15,Russia,Volgograd,1,Android,ads,422,young,2
999997,2021-10-28 18:59:07,52191,1725,1,10,28,7,3,0,Evening,...,0,15,Russia,Volgograd,1,Android,ads,422,young,2
999998,2021-10-28 19:01:24,52191,1725,0,10,28,24,3,0,Evening,...,0,15,Russia,Volgograd,1,Android,ads,422,young,2



FILL MISSING VALUES WITH POPULAR VALUES BASED ON CLUSTER

In [21]:
def fill_mode(x):
    mode = x.mode().iloc[0]
    return x.fillna(mode)

In [23]:
missing_cols = ['month',
 'day',
 'second',
 'weekday',
 'is_weekend',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'post_id',
 'part_of_day',
 'topic']

In [24]:
features_df[missing_cols] = features_df.groupby('cluster_feature')[missing_cols].transform(fill_mode)

features_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,...,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,post_id,part_of_day,topic
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,...,,,,,,,,10.0,Evening,movie
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,...,,,,,,,,98.0,Evening,movie
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,...,,,,,,,,37.0,Morning,movie
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,...,,,,,,,,22.0,Evening,movie
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,...,,,,,,,,74.0,Morning,movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,...,,,,,,,,22.0,Evening,movie
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,...,,,,,,,,165.0,Afternoon,movie
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,...,,,,,,,,10.0,Evening,movie
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,...,,,,,,,,1.0,Afternoon,movie


In [25]:
features_df['post_id'] = features_df['post_id'].astype(int)

features_df['post_id']

0          10
1          98
2          37
3          22
4          74
         ... 
163200     22
163201    165
163202     10
163203      1
163204    165
Name: post_id, Length: 163205, dtype: int32

In [26]:
for i in train_df.columns:
    if i not in features_df.columns:
        print(i)

timestamp
target


In [27]:
additional_cols = ['timestamp', 'target']
place_of_feature = [i for i in features_df.columns]
for i in additional_cols:
    place_of_feature.append(i)

len(place_of_feature)

71

In [28]:
place_for_features_columns = ['user_id', 'post_id', 'gender', 'age', 'country', 'city',
                                  'exp_group', 'os', 'source', 'count_actions', 'category_of_age',
                                  'cluster_feature', 'month', 'day', 'second', 'weekday', 'is_weekend',
                                  'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
                                  'part_of_day', 'topic', 'timestamp', 'target']

In [29]:
train_df.columns

Index(['timestamp', 'user_id', 'post_id', 'target', 'month', 'day', 'second',
       'weekday', 'is_weekend', 'part_of_day', 'topic', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11',
       'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16',
       'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21',
       'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26',
       'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31',
       'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36',
       'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41',
       'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46',
       'feature_47', 'feature_48', 'feature_49', 'feature_50', 'gender', 'age',
       'country', 'city', 'exp_group', 'os', 'source', 'count_actions',
       'category

In [30]:
train_df = train_df[place_for_features_columns]

train_df.columns

Index(['user_id', 'post_id', 'gender', 'age', 'country', 'city', 'exp_group',
       'os', 'source', 'count_actions', 'category_of_age', 'cluster_feature',
       'month', 'day', 'second', 'weekday', 'is_weekend', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'part_of_day',
       'topic', 'timestamp', 'target'],
      dtype='object')


SPLITTING 2 TABLES TO FEATURES FOR ALL USER AND FEATURE FOR ALL POSTS

In [31]:
user_features_df = features_df

user_features_df.columns

Index(['user_id', 'gender', 'age', 'country', 'city', 'exp_group', 'os',
       'source', 'count_actions', 'category_of_age', 'cluster_feature',
       'month', 'day', 'second', 'weekday', 'is_weekend', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11',
       'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16',
       'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21',
       'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26',
       'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31',
       'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36',
       'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41',
       'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46',
       'feature_47', 'feature_48', 'feature_49', 'feature_50', 'post_id',
       'part_of_day', 'to

In [32]:
post_features_df = post_df

post_features_df.head()

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
0,1,business,0.005147,0.194684,0.026514,-0.073551,-0.149517,-0.025243,0.048232,-0.171348,...,0.00277,0.01515,-0.037546,0.014755,-0.030483,0.013588,0.020241,-0.023167,0.032114,0.021335
1,2,business,-0.000803,0.218085,0.067561,0.077333,-0.054572,-0.002843,0.005914,-0.026982,...,0.007674,-0.082471,0.014627,0.017087,0.022628,-0.034949,-0.038008,0.026565,0.010053,-0.009858
2,3,business,-0.005729,0.163478,0.016924,-0.09853,-0.1539,-0.024024,0.039049,-0.133561,...,-0.038628,0.027905,-0.016978,0.01079,0.050866,0.010315,-0.017937,-0.017863,-4.8e-05,0.023703
3,4,business,0.010938,0.168339,0.025062,-0.063091,-0.153456,-0.016477,0.054518,-0.080653,...,-0.023014,0.098884,0.018685,-0.017327,-0.031135,-0.045588,-0.033993,-0.009839,0.036338,0.011815
4,5,business,0.00035,0.122627,0.010034,-0.040646,-0.059208,-0.006179,-0.00325,-0.012157,...,0.010623,0.015857,0.023957,0.025522,-0.014381,0.012647,0.000772,-0.014178,0.012667,-0.005298


In [33]:
columns_for_drop = [i for i in post_df.columns]

user_features_df.drop(columns_for_drop, axis=1, inplace=True)

user_features_df.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature,month,day,second,weekday,is_weekend,part_of_day
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,43,10.684211,9.012788,28.239295,2.250627,0.067358,Evening
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,3,10.758953,11.725201,27.727778,2.400821,0.136111,Evening
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,41,10.986357,14.26603,30.418827,2.690314,0.252387,Morning
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,11,10.672775,12.022613,28.010417,2.474227,0.157068,Evening
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,48,11.0,22.045977,27.741379,1.885057,0.0,Morning


In [34]:
fix_dtype_to_int = ['month', 'day', 'weekday', 'is_weekend']
user_features_df[fix_dtype_to_int] = user_features_df[fix_dtype_to_int].astype(int)

In [35]:
user_features_df.columns

Index(['user_id', 'gender', 'age', 'country', 'city', 'exp_group', 'os',
       'source', 'count_actions', 'category_of_age', 'cluster_feature',
       'month', 'day', 'second', 'weekday', 'is_weekend', 'part_of_day'],
      dtype='object')

MODEL EVALUATION

In [36]:
def prepare_data(df):

    train_df = df[df.timestamp < '2021-12-21']
    test_df = df[df.timestamp >= '2021-12-21']

    train_df = train_df.drop(['timestamp'], axis=1)
    test_df = test_df.drop(['timestamp'], axis=1)

    X_train = train_df.drop('target', axis=1).set_index(['user_id', 'post_id'])
    X_test = test_df.drop('target', axis=1).set_index(['user_id', 'post_id'])

    y_train = train_df['target']
    y_test = test_df['target']

    return X_train, y_train, X_test, y_test

In [37]:
X_train, y_train, X_test, y_test = prepare_data(train_df)

In [108]:
categorical_features = ['topic', 'month', 'day', 
                        'weekday', 'part_of_day', 
                        'country', 'city', 'exp_group', 
                        'os', 'source', 'category_of_age', 
                        'cluster_feature']

In [41]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier()
cat_model.fit(X_train, y_train, cat_features=categorical_features, verbose=False)

KeyboardInterrupt: 

In [None]:
cat_model.score(X_test, y_test)

TRAIN MODEL ON ALL DATA IN TRAIN_DF AND SAVE THE MODEL

In [None]:
X = train_df.drop(['timestamp', 'target'], axis=1).set_index(['user_id', 'post_id'])
y = train_df['target']

In [None]:
cat_model.fit(X, y, cat_features=categorical_features, verbose=False)

In [56]:
cat_model.save_model('catboost_model',
                     format="cbm")


SAVE FEATURES DATAFRAME

In [57]:
user_features_df.to_csv('kokh_user_features_df.csv', index=False)

post_features_df.to_csv('kokh_post_features_df.csv', index=False)

CREATE A FUNCTION TO JOIN TWO TABLES WITH FEATURES

In [59]:
def prediction_top_5_posts(user_feature_df, post_features_df, user_id, model):

    ## Save the place for features is important for model
    places_for_features_columns = ['user_id', 'post_id', 'gender', 'age', 'country', 'city',
                                  'exp_group', 'os', 'source', 'count_actions', 'category_of_age',
                                  'cluster_feature', 'month', 'day', 'second', 'weekday', 'is_weekend',
                                  'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 
                                  'part_of_day', 'topic']
    
    # Create copy of dataframes and find the data of this user 
    this_user_data = user_features_df.copy().loc[user_features_df['user_id'] == user_id]
    all_post_features_df = post_features_df.copy()

    # Merge dataframes on key column
    this_user_data['key'] = 1
    all_post_features_df['key'] = 1
    result = this_user_data.merge(all_post_features_df, on='key').drop('key', axis=1)
    result = result[places_for_features_columns].set_index(['user_id', 'post_id'])
    result['prediction'] = model.predict_proba(result)[:, 1]
    top_5_posts = result.sort_values('prediction', ascending=False).head(5).index.get_level_values('post_id').tolist()
    return top_5_posts

TEST RECOMMENDATIONS

In [60]:
prediction_top_5_posts(user_features_df, post_features_df, 599, cat_model)

[6990, 6435, 4235, 6358, 5985]