In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from pprint import pprint
from datetime import datetime

### Описание и загрузка датасета

В данном файле содержится информация о взаимодействиях пользователей с контентом:
* user_id - ID пользователя
* item_id - ID контента
* last_watch_dt - Дата последнего просмотра
* total_dur - Общая продолжительность всех просмотров данного контента в секундах
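* content_type - Тип контента (фильм, сериал); в загруженном ниже файле этот столбец отсутствует
* watched_pct - Процент просмотра контента (от 0 до 100)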

In [2]:
def dateparse(x):
  """Convert a 'YYYY-MM-DD' string into a datetime object."""
  return datetime.strptime(x, '%Y-%m-%d')

In [3]:
# Load the interactions table. The dates are ISO formatted (YYYY-MM-DD), which
# pandas parses natively, so the per-value strptime callback passed through
# `date_parser` (deprecated since pandas 2.0) is not needed.
df = pd.read_csv('/content/drive/MyDrive/Datasets/interactions.csv', parse_dates=['last_watch_dt'])

In [4]:
# Order interactions by user and then by last watch date, so the per-user
# item lists built later follow that order.
df = df.sort_values(['user_id', 'last_watch_dt'])

In [5]:
df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
3590116,0,12192,2021-07-16,89,0.0
620,0,7102,2021-07-19,169,3.0
67070,0,14359,2021-07-19,130,2.0
90113,0,15297,2021-07-19,459,0.0
3103040,0,9728,2021-07-19,4,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5476251 entries, 3590116 to 3167566
Data columns (total 5 columns):
 #   Column         Dtype         
---  ------         -----         
 0   user_id        int64         
 1   item_id        int64         
 2   last_watch_dt  datetime64[ns]
 3   total_dur      int64         
 4   watched_pct    float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 250.7 MB


In [7]:
df.nunique()

user_id          962179
item_id           15706
last_watch_dt       163
total_dur        129788
watched_pct         101
dtype: int64

### Отбор контента (процент просмотра видео не менее 10)

In [8]:
LIMIT_PERCENT_WATCH_ITEM = 10

In [9]:
# Keep only interactions whose watched percentage reaches the threshold
mask_watched_enough = df['watched_pct'].ge(LIMIT_PERCENT_WATCH_ITEM)
df = df.loc[mask_watched_enough]

In [10]:
df.shape

(3721317, 5)

### Отбор 10 пользователей для построения рекомендаций (участвуют только пользователи, которые уже успели посмотреть не менее 5 видео)

In [11]:
LIMIT_COUNT_ITEMS = 5
COUNT_TEST_USERS = 10

In [12]:
# One row per user: all item ids of that user collected into a list
df_group_list_items = (
    df.groupby('user_id')['item_id']
      .apply(list)
      .reset_index(name='list_items')
)

In [13]:
df_group_list_items.shape

(762727, 2)

In [14]:
df_group_list_items.head()

Unnamed: 0,user_id,list_items
0,1,"[10440, 3669]"
1,2,"[7571, 11577, 16166, 4436, 4475, 6774, 11689, ..."
2,3,"[3734, 9728, 10440, 9550, 4151, 4436, 13789, 8..."
3,4,[4700]
4,5,"[12466, 3145, 6167, 8450, 7043]"


In [15]:
# Number of items in each user's list
df_group_list_items['len_list_items'] = df_group_list_items['list_items'].map(len)

In [16]:
df_group_list_items.head()

Unnamed: 0,user_id,list_items,len_list_items
0,1,"[10440, 3669]",2
1,2,"[7571, 11577, 16166, 4436, 4475, 6774, 11689, ...",45
2,3,"[3734, 9728, 10440, 9550, 4151, 4436, 13789, 8...",23
3,4,[4700],1
4,5,"[12466, 3145, 6167, 8450, 7043]",5


In [17]:
# Users with at least LIMIT_COUNT_ITEMS items in their list
has_enough_items = df_group_list_items['len_list_items'].ge(LIMIT_COUNT_ITEMS)
df_group_list_items_limit = df_group_list_items.loc[has_enough_items]

In [18]:
df_group_list_items_limit.shape

(213093, 3)

In [19]:
# Draw the test users with a fixed seed so that Restart & Run All reproduces
# the same sample; .copy() makes the sample an independent frame before new
# columns are added to it in later cells.
df_10_users = df_group_list_items_limit.sample(n=COUNT_TEST_USERS, random_state=42).copy()

In [20]:
df_10_users.head(10)

Unnamed: 0,user_id,list_items,len_list_items
225002,323749,"[7107, 8980, 4151, 12335, 8252, 12259, 8694, 1...",30
440307,633489,"[13865, 14526, 8636, 6809, 1244, 7793, 3734]",7
540570,777812,"[14741, 4151, 12609, 4457, 6945, 15297]",6
236016,339533,"[1926, 7547, 7959, 12132, 3734]",5
636679,915998,"[10440, 4151, 9982, 13865, 14901, 8636]",6
181546,261356,"[4151, 9679, 4946, 7571, 8346]",5
590613,849450,"[9728, 14901, 3734, 14476, 7582, 15297, 334]",7
641998,923558,"[5982, 3509, 13865, 10440, 1785, 4457, 2657]",7
621833,894673,"[15297, 211, 3888, 5434, 11985, 10323, 15266]",7
577482,830656,"[3034, 4762, 13492, 9808, 11505]",5


### Выделение списка фильмов, по которым будет происходить сопоставление (последнее видео отводится на тест)

In [21]:
# Split each test user's list: everything but the final element is used for
# matching, the final element is held out as the test target.
df_10_users['list_items_without_last_item'] = df_10_users['list_items'].map(lambda items: list(items)[:-1])
df_10_users['last_item'] = df_10_users['list_items'].map(lambda items: list(items)[-1])

In [22]:
df_10_users.head(10)

Unnamed: 0,user_id,list_items,len_list_items,list_items_without_last_item,last_item
225002,323749,"[7107, 8980, 4151, 12335, 8252, 12259, 8694, 1...",30,"[7107, 8980, 4151, 12335, 8252, 12259, 8694, 1...",747
440307,633489,"[13865, 14526, 8636, 6809, 1244, 7793, 3734]",7,"[13865, 14526, 8636, 6809, 1244, 7793]",3734
540570,777812,"[14741, 4151, 12609, 4457, 6945, 15297]",6,"[14741, 4151, 12609, 4457, 6945]",15297
236016,339533,"[1926, 7547, 7959, 12132, 3734]",5,"[1926, 7547, 7959, 12132]",3734
636679,915998,"[10440, 4151, 9982, 13865, 14901, 8636]",6,"[10440, 4151, 9982, 13865, 14901]",8636
181546,261356,"[4151, 9679, 4946, 7571, 8346]",5,"[4151, 9679, 4946, 7571]",8346
590613,849450,"[9728, 14901, 3734, 14476, 7582, 15297, 334]",7,"[9728, 14901, 3734, 14476, 7582, 15297]",334
641998,923558,"[5982, 3509, 13865, 10440, 1785, 4457, 2657]",7,"[5982, 3509, 13865, 10440, 1785, 4457]",2657
621833,894673,"[15297, 211, 3888, 5434, 11985, 10323, 15266]",7,"[15297, 211, 3888, 5434, 11985, 10323]",15266
577482,830656,"[3034, 4762, 13492, 9808, 11505]",5,"[3034, 4762, 13492, 9808]",11505


### Подбор похожих пользователей для каждого тестового юзера (по просмотренным фильмам)

In [23]:
PERCENT_INTERSECTION = 0.5

In [24]:
data_list_user_id = []
data_list_items = []
data_cluster_id = []

In [25]:
# Reset the accumulators here so that re-running this cell does not append
# the same matches a second time.
data_list_user_id = []
data_list_items = []
data_cluster_id = []

# Materialise the candidate users once; this avoids calling iterrows() over
# the whole candidate frame again for every test user.
candidate_users = list(zip(df_group_list_items_limit['user_id'],
                           df_group_list_items_limit['list_items']))

test_users = zip(df_10_users['user_id'], df_10_users['list_items_without_last_item'])
for test_user_id, test_items in tqdm(test_users, total=len(df_10_users)):
  print(test_user_id, test_items)
  # The test user's item set does not change inside the inner loop
  test_items_set = set(test_items)
  for candidate_id, candidate_items in candidate_users:
    # A test user is never matched with themselves
    if candidate_id == test_user_id:
      continue
    counter_intersection = len(test_items_set.intersection(candidate_items))
    # A candidate is "similar" when the shared items make up at least
    # PERCENT_INTERSECTION of either the candidate's or the test user's list
    if (counter_intersection/len(candidate_items))>=PERCENT_INTERSECTION or \
       (counter_intersection/len(test_items))>=PERCENT_INTERSECTION:
      data_list_user_id.append(candidate_id)
      data_list_items.append(candidate_items)
      data_cluster_id.append(test_user_id)

0it [00:00, ?it/s]

323749 [7107, 8980, 4151, 12335, 8252, 12259, 8694, 1785, 6155, 14741, 5287, 1287, 9728, 7731, 13865, 14362, 3734, 916, 7977, 3606, 14526, 5171, 13475, 734, 3935, 7626, 5087, 8437, 7417]
633489 [13865, 14526, 8636, 6809, 1244, 7793]
777812 [14741, 4151, 12609, 4457, 6945]
339533 [1926, 7547, 7959, 12132]
915998 [10440, 4151, 9982, 13865, 14901]
261356 [4151, 9679, 4946, 7571]
849450 [9728, 14901, 3734, 14476, 7582, 15297]
923558 [5982, 3509, 13865, 10440, 1785, 4457]
894673 [15297, 211, 3888, 5434, 11985, 10323]
830656 [3034, 4762, 13492, 9808]


In [26]:
# Column-wise layout of the matches collected by the search loop
d_similar_users = dict(
    user_id=data_list_user_id,
    list_items=data_list_items,
    cluster_id=data_cluster_id,
)

In [27]:
df_similar_users = pd.DataFrame(data=d_similar_users)

In [28]:
df_similar_users.head()

Unnamed: 0,user_id,list_items,cluster_id
0,166,"[9728, 15297, 8636, 13865, 4151]",323749
1,537,"[3734, 7626, 1785, 9728, 12173, 5732]",323749
2,627,"[11237, 13865, 9728, 3734, 10811]",323749
3,834,"[11778, 9728, 3734, 15297, 10440, 12259]",323749
4,975,"[7571, 3734, 1819, 13865, 10440, 9728]",323749


In [29]:
df_similar_users['cluster_id'].value_counts()

915998    9775
849450    8413
261356    4125
923558    3772
323749    3613
633489    2016
777812     480
894673     368
339533      82
830656      26
Name: cluster_id, dtype: int64

### Описание и загрузка датасета

В данном файле содержится информация о пользователях:
* user_id - ID пользователя
* age - возрастная группа пользователя, строка вида "M_N" (данный признак - результат работы модели)
> * 18_24 - от 18 до 24 лет включительно
> * 25_34 - от 25 до 34 лет включительно
> * 35_44 - от 35 до 44 лет включительно
> * 45_54 - от 45 до 54 лет включительно
> * 55_64 - от 55 до 64 лет включительно
> * 65_inf - от 65 и старше
* sex - пол пользователя (данный признак - результат работы модели)
> * М - мужчина
> * Ж - женщина
* income - доход пользователя, строка вида "M_N" (данный признак - результат работы модели)
> * income_0_20
> * income_20_40
> * income_40_60
> * income_60_90  
> * income_90_150
> * income_150_inf
* kids_flg - флаг "наличие ребенка" (данный признак - результат работы модели)

In [30]:
# Load the user profile table (user_id, age, income, sex, kids_flg)
df_users = pd.read_csv('/content/drive/MyDrive/Datasets/users.csv')

In [31]:
df_users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [32]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   840197 non-null  int64 
 1   age       826102 non-null  object
 2   income    825421 non-null  object
 3   sex       826366 non-null  object
 4   kids_flg  840197 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 32.1+ MB


In [33]:
df_users.nunique()

user_id     840197
age              6
income           6
sex              2
kids_flg         2
dtype: int64

In [34]:
dict_gender = {'М':'m','Ж':'f'}

In [35]:
# Recode the sex labels through dict_gender; values that are not keys of the
# mapping (including missing ones) become NaN.
df_users['sex'] = df_users['sex'].map(dict_gender)

In [36]:
# Rename by column name rather than by position: a positional assignment
# would silently mislabel columns if the CSV column order ever changed, and
# this form is safe to re-run.
df_users = df_users.rename(columns={'sex': 'gender'})

In [37]:
df_users.head()

Unnamed: 0,user_id,age,income,gender,kids_flg
0,973171,age_25_34,income_60_90,m,1
1,962099,age_18_24,income_20_40,m,0
2,1047345,age_45_54,income_40_60,f,0
3,721985,age_45_54,income_20_40,f,0
4,704055,age_35_44,income_60_90,f,0


### Обогащение датафрейма с похожими пользователями дополнительной информацией

In [38]:
# Attach the profile of every matched user (left join keeps users without a profile)
df_similar_users_info = df_similar_users.merge(
    df_users,
    how='left',
    on='user_id',
    suffixes=[None, '_u'],
)

In [39]:
df_similar_users_info.head()

Unnamed: 0,user_id,list_items,cluster_id,age,income,gender,kids_flg
0,166,"[9728, 15297, 8636, 13865, 4151]",323749,age_35_44,income_20_40,m,0.0
1,537,"[3734, 7626, 1785, 9728, 12173, 5732]",323749,age_45_54,income_20_40,f,1.0
2,627,"[11237, 13865, 9728, 3734, 10811]",323749,age_35_44,income_20_40,f,1.0
3,834,"[11778, 9728, 3734, 15297, 10440, 12259]",323749,,,,
4,975,"[7571, 3734, 1819, 13865, 10440, 9728]",323749,age_25_34,income_40_60,f,1.0


In [40]:
# Attach the profile of the cluster's test user; overlapping columns get the
# suffix '_u' for the matched user and '_c' for the cluster user.
df_similar_users_info = df_similar_users_info.merge(
    df_users,
    how='left',
    left_on='cluster_id',
    right_on='user_id',
    suffixes=['_u', '_c'],
)

In [41]:
df_similar_users_info.head()

Unnamed: 0,user_id_u,list_items,cluster_id,age_u,income_u,gender_u,kids_flg_u,user_id_c,age_c,income_c,gender_c,kids_flg_c
0,166,"[9728, 15297, 8636, 13865, 4151]",323749,age_35_44,income_20_40,m,0.0,323749.0,age_35_44,income_60_90,m,1.0
1,537,"[3734, 7626, 1785, 9728, 12173, 5732]",323749,age_45_54,income_20_40,f,1.0,323749.0,age_35_44,income_60_90,m,1.0
2,627,"[11237, 13865, 9728, 3734, 10811]",323749,age_35_44,income_20_40,f,1.0,323749.0,age_35_44,income_60_90,m,1.0
3,834,"[11778, 9728, 3734, 15297, 10440, 12259]",323749,,,,,323749.0,age_35_44,income_60_90,m,1.0
4,975,"[7571, 3734, 1819, 13865, 10440, 9728]",323749,age_25_34,income_40_60,f,1.0,323749.0,age_35_44,income_60_90,m,1.0


### Подбор похожих пользователей для каждого тестового юзера (по доп. информации)

In [42]:
def test_parameters(row):
  """Функция проверяет степень соответствия параметров подобранного пользователя
  параметрам, указанного кластера (тестовый юзер)"""
  # Проверяем только строки, в которых заданы все значения
  # Для прохождения проверки необходимо набрать минимум 2 балла соответствия
  is_test = 1
  is_check = (not pd.isnull(row['age_u'])) + \
            (not pd.isnull(row['income_u'])) + \
            (not pd.isnull(row['gender_u'])) + \
            (not pd.isnull(row['age_c'])) + \
            (not pd.isnull(row['income_c'])) + \
            (not pd.isnull(row['gender_c']))
  if is_check == 6:
    value_comparison_age = (row['age_u'] == row['age_c'])
    value_comparison_income = (row['income_u'] == row['income_c'])
    value_comparison_gender = (row['gender_u'] == row['gender_c'])
    val_comparison = value_comparison_age + value_comparison_income + \
                                            value_comparison_gender
    if val_comparison>=2:
      is_test = 1
    else:
      is_test = 0
  return is_test

In [43]:
df_similar_users_info['test_parametres'] = df_similar_users_info.apply(test_parameters, axis=1)

In [44]:
df_similar_users_info.head()

Unnamed: 0,user_id_u,list_items,cluster_id,age_u,income_u,gender_u,kids_flg_u,user_id_c,age_c,income_c,gender_c,kids_flg_c,test_parametres
0,166,"[9728, 15297, 8636, 13865, 4151]",323749,age_35_44,income_20_40,m,0.0,323749.0,age_35_44,income_60_90,m,1.0,1
1,537,"[3734, 7626, 1785, 9728, 12173, 5732]",323749,age_45_54,income_20_40,f,1.0,323749.0,age_35_44,income_60_90,m,1.0,0
2,627,"[11237, 13865, 9728, 3734, 10811]",323749,age_35_44,income_20_40,f,1.0,323749.0,age_35_44,income_60_90,m,1.0,0
3,834,"[11778, 9728, 3734, 15297, 10440, 12259]",323749,,,,,323749.0,age_35_44,income_60_90,m,1.0,1
4,975,"[7571, 3734, 1819, 13865, 10440, 9728]",323749,age_25_34,income_40_60,f,1.0,323749.0,age_35_44,income_60_90,m,1.0,0


In [45]:
df_similar_users_info.shape

(32670, 13)

In [46]:
# Keep only the matched users that passed the attribute check
passed_check = df_similar_users_info['test_parametres'].eq(1)
df_similar_users_info = df_similar_users_info.loc[passed_check]

In [47]:
df_similar_users_info.shape

(24767, 13)

In [48]:
df_similar_users_ = df_similar_users_info[['user_id_u','list_items','cluster_id']]

In [49]:
# Drop the '_u' suffix from the matched user's id column
df_similar_users_ = df_similar_users_.rename(columns={'user_id_u': 'user_id'})

In [50]:
df_similar_users_.head()

Unnamed: 0,user_id,list_items,cluster_id
0,166,"[9728, 15297, 8636, 13865, 4151]",323749
3,834,"[11778, 9728, 3734, 15297, 10440, 12259]",323749
6,1780,"[7107, 7417, 1844, 14741, 4457, 9728, 15297]",323749
7,2024,"[15297, 4880, 7417, 9728, 2657, 3734]",323749
9,2182,"[4151, 7107, 4718, 10440, 3734]",323749


### Ранжирование фильмов в кластерах

In [51]:
# One row per (matched user, item): unpack every item list into separate rows
df_similar_users_rank = df_similar_users_.explode('list_items')

In [52]:
df_similar_users_rank = df_similar_users_rank.rename(columns={"list_items": "item_id"})

In [53]:
df_similar_users_rank.head()

Unnamed: 0,user_id,item_id,cluster_id
0,166,9728,323749
0,166,15297,323749
0,166,8636,323749
0,166,13865,323749
0,166,4151,323749


In [54]:
# Rank of an item inside a cluster = number of rows for that (cluster, item)
# pair; the result is ordered by cluster and then by descending rank.
df_group_similar_users_rank = (
    df_similar_users_rank
    .groupby(['cluster_id', 'item_id'], as_index=False)
    .size()
    .rename(columns={'size': 'rank'})
    .sort_values(['cluster_id', 'rank'], ascending=[True, False])
)

In [55]:
df_group_similar_users_rank.head()

Unnamed: 0,cluster_id,item_id,rank
1460,261356,4151,3925
2631,261356,7571,3374
5372,261356,15297,2453
3630,261356,10440,1907
1311,261356,3734,1688


### Удаление из кластеров уже просмотренных фильмов, которые не должны попасть в рекомендации

In [56]:
df_10_users.head(10)

Unnamed: 0,user_id,list_items,len_list_items,list_items_without_last_item,last_item
225002,323749,"[7107, 8980, 4151, 12335, 8252, 12259, 8694, 1...",30,"[7107, 8980, 4151, 12335, 8252, 12259, 8694, 1...",747
440307,633489,"[13865, 14526, 8636, 6809, 1244, 7793, 3734]",7,"[13865, 14526, 8636, 6809, 1244, 7793]",3734
540570,777812,"[14741, 4151, 12609, 4457, 6945, 15297]",6,"[14741, 4151, 12609, 4457, 6945]",15297
236016,339533,"[1926, 7547, 7959, 12132, 3734]",5,"[1926, 7547, 7959, 12132]",3734
636679,915998,"[10440, 4151, 9982, 13865, 14901, 8636]",6,"[10440, 4151, 9982, 13865, 14901]",8636
181546,261356,"[4151, 9679, 4946, 7571, 8346]",5,"[4151, 9679, 4946, 7571]",8346
590613,849450,"[9728, 14901, 3734, 14476, 7582, 15297, 334]",7,"[9728, 14901, 3734, 14476, 7582, 15297]",334
641998,923558,"[5982, 3509, 13865, 10440, 1785, 4457, 2657]",7,"[5982, 3509, 13865, 10440, 1785, 4457]",2657
621833,894673,"[15297, 211, 3888, 5434, 11985, 10323, 15266]",7,"[15297, 211, 3888, 5434, 11985, 10323]",15266
577482,830656,"[3034, 4762, 13492, 9808, 11505]",5,"[3034, 4762, 13492, 9808]",11505


In [57]:
# Attach each test user's viewed list and held-out item to the ranked items of their cluster
df_result = df_group_similar_users_rank.merge(
    df_10_users,
    how='left',
    left_on='cluster_id',
    right_on='user_id',
)

In [58]:
df_result = df_result[['cluster_id','item_id','rank','list_items_without_last_item','last_item']]

In [59]:
df_result = df_result.rename(columns={"list_items_without_last_item": "list_items_already_viewed",
                                      "item_id": "item_id_rec",
                                      "last_item":"item_y"})

In [60]:
df_result.head()

Unnamed: 0,cluster_id,item_id_rec,rank,list_items_already_viewed,item_y
0,261356,4151,3925,"[4151, 9679, 4946, 7571]",8346
1,261356,7571,3374,"[4151, 9679, 4946, 7571]",8346
2,261356,15297,2453,"[4151, 9679, 4946, 7571]",8346
3,261356,10440,1907,"[4151, 9679, 4946, 7571]",8346
4,261356,3734,1688,"[4151, 9679, 4946, 7571]",8346


In [61]:
def test_item_already_viewed(row)->bool:
  """Функция проверяет просматривался ли текущий фильм 
  тестовым пользователем или нет"""
  if row['item_id_rec'] in row['list_items_already_viewed']:
    return True
  else:
    return False

In [62]:
df_result['viewed'] = df_result.apply(test_item_already_viewed, axis=1)

In [63]:
# Drop the items the test user has already viewed
not_viewed = df_result['viewed'].eq(False)
df_result = df_result.loc[not_viewed]

In [64]:
df_result.head()

Unnamed: 0,cluster_id,item_id_rec,rank,list_items_already_viewed,item_y,viewed
2,261356,15297,2453,"[4151, 9679, 4946, 7571]",8346,False
3,261356,10440,1907,"[4151, 9679, 4946, 7571]",8346,False
4,261356,3734,1688,"[4151, 9679, 4946, 7571]",8346,False
5,261356,13865,1524,"[4151, 9679, 4946, 7571]",8346,False
6,261356,9728,1369,"[4151, 9679, 4946, 7571]",8346,False


### Отбор N самых рейтинговых строк по каждому кластеру для проверки работы системы рекомендаций

In [65]:
TOP_N_RECOMMENDATIONS = 7

In [66]:
# First TOP_N_RECOMMENDATIONS rows of every cluster, in the current row order
# (assumes df_result is still ordered by descending rank within each cluster).
report_columns = ['cluster_id', 'item_id_rec', 'rank', 'item_y']
df_report = (
    df_result
    .groupby('cluster_id')[report_columns]
    .head(TOP_N_RECOMMENDATIONS)
    .reset_index(drop=True)
)

In [67]:
# A recommendation is a hit when it equals the held-out item of the test user
df_report['is_match'] = df_report['item_id_rec'].eq(df_report['item_y'])

In [68]:
df_report.head()

Unnamed: 0,cluster_id,item_id_rec,rank,item_y,is_match
0,261356,15297,2453,8346,False
1,261356,10440,1907,8346,False
2,261356,3734,1688,8346,False
3,261356,13865,1524,8346,False
4,261356,9728,1369,8346,False


In [69]:
df_report['is_match'].value_counts()

False    64
True      6
Name: is_match, dtype: int64