In [107]:
!pip install -r local.requirements.txt 2>/dev/null



In [119]:
from pprint import pprint
import copy

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics, NDCG
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter

In [109]:
%%time
!wget -q https://files.grouplens.org/datasets/movielens/ml-1m.zip -O ml-1m.zip
!unzip -o ml-1m.zip
!rm ml-1m.zip

Archive:  ml-1m.zip
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         
CPU times: user 88 ms, sys: 20.3 ms, total: 108 ms
Wall time: 5.37 s


In [110]:
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
)

ratings["datetime"] = pd.to_datetime(ratings["datetime"] * 10 ** 9)
print("Time period")
ratings["datetime"].min(), ratings["datetime"].max()

movies = pd.read_csv(
    "ml-1m/movies.dat", 
    sep="::",
    engine="python",
    header=None,
    names=[Columns.Item, "title", "genres"],
    encoding_errors="ignore",
)

ratings.head()

Time period


Unnamed: 0,user_id,item_id,weight,datetime
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [111]:
interactions = Interactions(ratings)
del ratings
interactions.df.head()

Unnamed: 0,user_id,item_id,weight,datetime
0,1,1193,5.0,2000-12-31 22:12:40
1,1,661,3.0,2000-12-31 22:35:09
2,1,914,3.0,2000-12-31 22:32:48
3,1,3408,4.0,2000-12-31 22:04:35
4,1,2355,5.0,2001-01-06 23:38:11


In [112]:
RANDOM_STATE=32
K_RECOS=10
N_SPLITS=3

In [113]:
n_splits = N_SPLITS

splitter = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

splitter.get_test_fold_borders(interactions)


[(Timestamp('2003-01-18 00:00:00', freq='14D'),
  Timestamp('2003-02-01 00:00:00', freq='14D')),
 (Timestamp('2003-02-01 00:00:00', freq='14D'),
  Timestamp('2003-02-15 00:00:00', freq='14D')),
 (Timestamp('2003-02-15 00:00:00', freq='14D'),
  Timestamp('2003-03-01 00:00:00', freq='14D'))]

In [114]:
selected_models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "most_raited": PopularModel(popularity="sum_weight"),
    "tfidf_k=5": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5)),
    "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10)),
    "bm25_k=10_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=5, K1=0.05, B=0.1)),
}

metrics = {
    "miuf-1": MeanInvUserFreq(k=1),
    "miuf-5": MeanInvUserFreq(k=5),
    "miuf-10": MeanInvUserFreq(k=10),
    "ndcg-1": NDCG(k=1),
    "ndcg-5": NDCG(k=5),
    "ndcg-10": NDCG(k=10),
    "recall-1": Recall(k=1),
    "recall-5": Recall(k=5),
    "recall-10": Recall(k=10),
    "map-1": MAP(k=1),
    "map-5": MAP(k=5),
    "map-10": MAP(k=10),
    "serendipity-1": Serendipity(k=1),
    "serendipity-5": Serendipity(k=5),
    "serendipity-10": Serendipity(k=10),
    "precision-1": Precision(k=1),
    "precision-5": Precision(k=5),
    "precision-10": Precision(k=10),
}


In [115]:
def compute_metrics(models, metrics, splitter, top_k):
    results = []
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=N_SPLITS):
        print(f"\n==================== Fold {fold_info['i_split']}")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            current_model = copy.deepcopy(model)

            current_model.fit(dataset)
            recos = current_model.recommend(
                users=test_users,
                dataset=dataset,
                k=top_k,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            results.append(res)

    return results

In [116]:
compute_metrics(selected_models, metrics, splitter, K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2003-02-01 00:00:00', freq='14D'),
 'i_split': 0,
 'start': Timestamp('2003-01-18 00:00:00', freq='14D'),
 'test': 630,
 'test_items': 540,
 'test_users': 75,
 'train': 998083,
 'train_items': 3706,
 'train_users': 6040}


 33%|███▎      | 1/3 [00:04<00:08,  4.50s/it]


{'end': Timestamp('2003-02-15 00:00:00', freq='14D'),
 'i_split': 1,
 'start': Timestamp('2003-02-01 00:00:00', freq='14D'),
 'test': 899,
 'test_items': 704,
 'test_users': 57,
 'train': 998713,
 'train_items': 3706,
 'train_users': 6040}


 67%|██████▋   | 2/3 [00:07<00:03,  3.86s/it]


{'end': Timestamp('2003-03-01 00:00:00', freq='14D'),
 'i_split': 2,
 'start': Timestamp('2003-02-15 00:00:00', freq='14D'),
 'test': 597,
 'test_items': 501,
 'test_users': 66,
 'train': 999612,
 'train_items': 3706,
 'train_users': 6040}


100%|██████████| 3/3 [00:11<00:00,  3.88s/it]


[{'fold': 0,
  'model': 'random',
  'recall-1': 0.0,
  'precision-1': 0.0,
  'recall-5': 0.0,
  'precision-5': 0.0,
  'recall-10': 0.0,
  'precision-10': 0.0,
  'ndcg-1': 0.0,
  'ndcg-5': 0.0,
  'ndcg-10': 0.0,
  'map-1': 0.0,
  'map-5': 0.0,
  'map-10': 0.0,
  'miuf-1': 6.792791444243799,
  'miuf-5': 6.452971462924063,
  'miuf-10': 6.5396220635395075,
  'serendipity-1': 0.0,
  'serendipity-5': 0.0,
  'serendipity-10': 0.0},
 {'fold': 0,
  'model': 'popular',
  'recall-1': 0.023333333333333334,
  'precision-1': 0.05333333333333334,
  'recall-5': 0.03003439887621201,
  'precision-5': 0.026666666666666672,
  'recall-10': 0.03740973721177236,
  'precision-10': 0.023999999999999997,
  'ndcg-1': 0.05333333333333334,
  'ndcg-5': 0.02988932534152588,
  'ndcg-10': 0.027093409440312886,
  'map-1': 0.023333333333333334,
  'map-5': 0.024995873162709424,
  'map-10': 0.02644635258358663,
  'miuf-1': 1.2287521179461294,
  'miuf-5': 1.4426186132151129,
  'miuf-10': 1.5807359233879539,
  'serendipity-

### Визуализируем

In [117]:
THRESHOLD=4

def render_stats(model, dataset, user_ids, item_data):
    recommendations = model.recommend(
        k=K_RECOS,
        filter_viewed=True,
        dataset=dataset,
        users=user_ids,
    )

    ### Считаем взаимодействия с фильмами
    dataset.interactions.df["interactions"] = dataset.interactions.df.groupby('item_id')['item_id'].transform('count')
    
    views = dataset.interactions.df[dataset.interactions.df.user_id.isin(
        dataset.user_id_map.convert_to_internal(user_ids))].merge(item_data, on="item_id")
    views.user_id = dataset.user_id_map.convert_to_external(views.user_id)

    recommendations.item_id = dataset.item_id_map.convert_to_internal(recommendations.item_id.values)
    recommendations = recommendations.merge(dataset.interactions.df[["item_id", "interactions"]].drop_duplicates(), on="item_id")
    recommendations.item_id = dataset.item_id_map.convert_to_external(recommendations.item_id.values)

    recos_for_user = recommendations.merge(item_data, on="item_id").drop(columns=["score"])

    for user_id in user_ids:
        print('ID', user_id)
        print('Рекомендации для пользователя:')
        display(recos_for_user[recos_for_user.user_id == user_id].drop("user_id", axis=1))

        print("Просмотренные пользователем фильмы:")
        display(views[(views.user_id == user_id) & (views.weight >= THRESHOLD)].tail(K_RECOS))

In [118]:
dataset = Dataset.construct(interactions.df)
model = PopularModel()
model.fit(dataset)
user_ids = [1, 2, 3]
render_stats(model, dataset, user_ids, movies)

ID 1
Рекомендации для пользователя:


Unnamed: 0,item_id,rank,interactions,title,genres
0,2858,1,3428,American Beauty (1999),Comedy|Drama
1,1196,2,2990,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
2,1210,3,2883,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
3,480,4,2672,Jurassic Park (1993),Action|Adventure|Sci-Fi
4,589,5,2649,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
6,2571,6,2590,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8,593,7,2578,"Silence of the Lambs, The (1991)",Drama|Thriller
9,1580,8,2538,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
11,1198,9,2514,Raiders of the Lost Ark (1981),Action|Adventure
12,110,10,2443,Braveheart (1995),Action|Drama|War


Просмотренные пользователем фильмы:


Unnamed: 0,user_id,item_id,weight,datetime,interactions,title,genres
49,1,43,4.0,2000-12-31 22:26:10,1072,Restoration (1995),Drama
50,1,44,4.0,2000-12-31 22:12:40,2991,Mortal Kombat (1995),Action|Adventure
52,1,45,5.0,2000-12-31 22:29:37,1011,To Die For (1995),Comedy|Drama
53,1,46,5.0,2000-12-31 22:36:45,568,How to Make an American Quilt (1995),Drama|Romance
54,1,47,4.0,2000-12-31 22:11:59,928,Seven (Se7en) (1995),Crime|Thriller
56,1,48,5.0,2000-12-31 22:26:59,2653,Pocahontas (1995),Animation|Children's|Musical|Romance
58,1,49,4.0,2000-12-31 22:35:49,338,When Night Is Falling (1995),Drama|Romance
59,1,50,4.0,2000-12-31 22:36:14,1585,"Usual Suspects, The (1995)",Crime|Thriller
61,1,51,4.0,2000-12-31 22:23:18,2513,Guardian Angel (1994),Action|Drama|Thriller
62,1,52,4.0,2000-12-31 22:34:51,855,Mighty Aphrodite (1995),Comedy


ID 2
Рекомендации для пользователя:


Unnamed: 0,item_id,rank,interactions,title,genres
10,1580,3,2538,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
14,260,1,2991,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
15,1270,2,2583,Back to the Future (1985),Comedy|Sci-Fi
16,608,4,2513,Fargo (1996),Crime|Drama|Thriller
18,2762,5,2459,"Sixth Sense, The (1999)",Thriller
20,1197,6,2318,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
21,527,7,2304,Schindler's List (1993),Drama|War
23,1617,8,2288,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
25,1097,9,2269,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
27,2997,10,2241,Being John Malkovich (1999),Comedy


Просмотренные пользователем фильмы:


Unnamed: 0,user_id,item_id,weight,datetime,interactions,title,genres
175,2,155,4.0,2000-12-31 21:57:19,1377,Beyond Rangoon (1995),Drama|War
176,2,156,4.0,2000-12-31 21:56:49,1417,Blue in the Face (1995),Comedy
178,2,158,4.0,2000-12-31 21:42:04,350,Casper (1995),Adventure|Children's
179,2,159,4.0,2000-12-31 21:35:17,521,Clockers (1995),Drama
183,2,163,5.0,2000-12-31 22:01:40,802,Desperado (1995),Action|Romance|Thriller
184,2,164,5.0,2000-12-31 21:40:41,1424,Devil in a Blue Dress (1995),Crime|Film-Noir|Mystery|Thriller
185,2,165,4.0,2000-12-31 21:32:52,160,Die Hard: With a Vengeance (1995),Action|Thriller
188,2,167,5.0,2000-12-31 21:33:33,2227,Feast of July (1995),Drama
193,2,171,5.0,2000-12-31 21:54:46,2194,Jeffrey (1995),Comedy
196,2,174,5.0,2000-12-31 21:37:32,1261,Jury Duty (1995),Comedy


ID 3
Рекомендации для пользователя:


Unnamed: 0,item_id,rank,interactions,title,genres
5,589,2,2649,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
7,2571,3,2590,"Matrix, The (1999)",Action|Sci-Fi|Thriller
13,110,6,2443,Braveheart (1995),Action|Drama|War
17,608,4,2513,Fargo (1996),Crime|Drama|Thriller
19,2762,5,2459,"Sixth Sense, The (1999)",Thriller
22,527,8,2304,Schindler's List (1993),Drama|War
24,1617,9,2288,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
26,1097,10,2269,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
28,2028,1,2653,Saving Private Ryan (1998),Action|Drama|War
29,2396,7,2369,Shakespeare in Love (1998),Comedy|Romance


Просмотренные пользователем фильмы:


Unnamed: 0,user_id,item_id,weight,datetime,interactions,title,genres
219,3,197,4.0,2000-12-31 21:16:59,369,"Stars Fell on Henrietta, The (1995)",Drama
220,3,198,5.0,2000-12-31 21:34:19,967,Strange Days (1995),Action|Crime|Sci-Fi
221,3,199,5.0,2000-12-31 21:29:26,1419,"Umbrellas of Cherbourg, The (Parapluies de Che...",Drama|Musical
222,3,200,4.0,2000-12-31 21:22:57,912,"Tie That Binds, The (1995)",Thriller
223,3,201,4.0,2000-12-31 21:19:30,485,Three Wishes (1995),Drama
225,3,203,5.0,2000-12-31 21:16:59,1119,"To Wong Foo, Thanks for Everything! Julie Newm...",Comedy
226,3,204,4.0,2000-12-31 21:18:59,729,Under Siege 2: Dark Territory (1995),Action
227,3,205,4.0,2000-12-31 21:22:57,1127,Unstrung Heroes (1995),Comedy|Drama
228,3,206,5.0,2000-12-31 21:27:59,1599,Unzipped (1995),Documentary
229,3,207,4.0,2000-12-31 21:35:04,1035,"Walk in the Clouds, A (1995)",Drama|Romance
