In [1]:
# считывает MovieLens
from os import path
import pandas as pd

data_dir = "./ml-latest-small"

def read_csv(filename: str):
    data = pd.read_csv(path.join(data_dir, filename + ".csv"))
    return data



In [12]:
movies = read_csv("movies")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
movies.describe()

Unnamed: 0,movieId
count,9125.0
mean,31123.291836
std,40782.633604
min,1.0
25%,2850.0
50%,6290.0
75%,56274.0
max,164979.0


In [8]:
ratings = read_csv("ratings")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [14]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [9]:
tags = read_csv("tags")
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [15]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,1296.0,1296.0,1296.0
mean,417.026235,42278.949846,1324337000.0
std,142.18344,44628.345568,109388600.0
min,15.0,1.0,1137217000.0
25%,346.0,2988.0,1243455000.0
50%,431.0,26958.5,1342849000.0
75%,547.0,72268.25,1440380000.0
max,663.0,164979.0,1476651000.0


In [10]:
links = read_csv("links")
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [16]:
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,9125.0,9125.0,9112.0
mean,31123.291836,479824.4,39104.545544
std,40782.633604,743177.4,62814.519801
min,1.0,417.0,2.0
25%,2850.0,88846.0,9451.75
50%,6290.0,119778.0,15852.0
75%,56274.0,428441.0,39160.5
max,164979.0,5794766.0,416437.0


In [5]:
# перекодируем id с пробелами в плотные
ratings["movie_id"] = ratings["movieId"].astype("category").cat.codes.copy()
ratings["user_id"] = ratings["userId"].astype("category").cat.codes.copy()

In [20]:
tags["movieId"].max()

NameError: name 'tag' is not defined

In [6]:
# переводим матрицу взаимодействий пользователей и объектов к разреженному виду
from scipy.sparse import csr_matrix
import numpy as np

last_movie_id = movies["movieId"].max()
last_tag_id = tag["movieId"].max()
def sparse_info(sparse_matrix: csr_matrix):
    print("Размерности матрицы: {}".format(sparse_matrix.shape))
    print("Ненулевых элементов в матрице: {}".format(sparse_matrix.nnz))
    print("Доля ненулевых элементов: {}"
          .format(sparse_matrix.nnz / sparse_matrix.shape[0] / sparse_matrix.shape[1])
    )

user_x_item = ratings[["user_id", "movie_id"]].as_matrix()
user_x_item
user_item_matrix = csr_matrix(
    (
        ratings["rating"].tolist(),
        (
            [pair[0] for pair in user_x_item],
            [pair[1] for pair in user_x_item],
        )
    ),
    shape=(last_user_id + 1, last_movie_id + 1),
    dtype=np.float32
)

sparse_info(user_item_matrix)

Размерности матрицы: (671, 9066)
Ненулевых элементов в матрице: 100004
Доля ненулевых элементов: 0.016439141608663475


In [10]:
# разобьём наблюдения на тестовую и обучающую выборки
np.random.seed(0)
test_indices = np.random.choice(
    range(user_item_matrix.nnz),
    replace=False,
    size=int(user_item_matrix.nnz * 0.2)
).tolist()
train_data = user_item_matrix.copy()
train_data.data[test_indices] = 0
train_data.eliminate_zeros()
print("размер обучающей выборки: {}".format(train_data.nnz))
test_data = user_item_matrix.copy()
test_data.data[:] = 0
test_data.data[test_indices] = user_item_matrix.data[test_indices]
test_data.eliminate_zeros()
print("размер тестовой выборки: {}".format(test_data.nnz))

размер обучающей выборки: 80004
размер тестовой выборки: 20000


In [9]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, reciprocal_rank

# обучаем абы какую модель
model = LightFM(loss="bpr")
model.fit(
    train_data,
    num_threads=4
)

train_mrr = reciprocal_rank(model, train_data).mean()
test_mrr = reciprocal_rank(model, test_data).mean()
print('MRR: train %.2f, test %.2f.' % (train_mrr, test_mrr))
for k in [5, 10, 15, 20]:
    train_precision = precision_at_k(model, train_data, k=k).mean()
    test_precision = precision_at_k(model, test_data, k=k).mean()
    print('Precision@%d: train %.2f, test %.2f.' % (k, train_precision, test_precision))

MRR: train 0.51, test 0.20.
Precision@5: train 0.34, test 0.08.
Precision@10: train 0.32, test 0.08.
Precision@15: train 0.31, test 0.07.
Precision@20: train 0.29, test 0.07.


In [11]:
# выберем информацию по жанрам фильмов
movies_genres = ratings[["movieId", "movie_id"]].drop_duplicates().join(
    movies,
    on="movieId",
    rsuffix="codes",
    lsuffix="movies",
    sort=True
).fillna("None")[["movie_id", "genres"]]
movies_genres["genres_set"] = movies_genres["genres"].apply(lambda x: set(x.split("|")))
movies_genres.head()

Unnamed: 0,movie_id,genres,genres_set
495,0,Adventure|Children|Fantasy,"{Adventure, Children, Fantasy}"
963,1,Comedy|Romance,"{Romance, Comedy}"
351,2,Comedy|Drama|Romance,"{Romance, Comedy, Drama}"
3108,3,Comedy,{Comedy}
964,4,Action|Crime|Thriller,"{Thriller, Action, Crime}"


In [12]:
# все возможные жанры
from functools import reduce

reduce(lambda acc, ele: acc.union(ele), movies_genres["genres_set"].tolist(), set())

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'None',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# приравняем None и (no genres listed)
movies_genres.loc[movies_genres["genres"] == "(no genres listed)", "genres"] = "None"
# уберём все спецсимволы, кроме |
movies_genres["genres"] = movies_genres["genres"].apply(
    lambda x: x.replace("-", "")
)

movies_features = CountVectorizer().fit_transform(movies_genres["genres"])
movies_features

<9066x20 sparse matrix of type '<class 'numpy.int64'>'
	with 15786 stored elements in Compressed Sparse Row format>

In [16]:
print(movies_genres["genres"][:1])
print(movies_features[0].todense())

495    Adventure|Children|Fantasy
Name: genres, dtype: object
[[0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]]


In [17]:
# добавляем к movie_id ещё и информацию о жанрах
from scipy.sparse import hstack, identity

features = hstack([
    identity(movies_genres.shape[0]),
    movies_features
])

In [21]:
features

<9066x9086 sparse matrix of type '<class 'numpy.float64'>'
	with 24852 stored elements in COOrdinate format>

In [22]:
# обучаем модель с жанрами
hybrid = LightFM(loss="bpr")
hybrid.fit(
    train_data,
    num_threads=4,
    item_features=features
)

train_mrr = auc_score(hybrid, train_data, item_features=features).mean()
test_mrr = auc_score(hybrid, test_data, item_features=features).mean()
print('MRR: train %.2f, test %.2f.' % (train_mrr, test_mrr))
#for k in [5, 10, 15, 20]:
    #train_precision = precision_at_k(hybrid, train_data, item_features=features, k=k).mean()
    #test_precision = precision_at_k(hybrid, test_data, item_features=features, k=k).mean()
#print('Precision@%d: train %.2f, test %.2f.' % (k, train_precision, test_precision))

NameError: name 'auc_score' is not defined