In [24]:
import pandas as pd

# Input tables (presumably the MovieLens "ml-latest" dump — confirm the source).
MOVIES_CSV = "/data/ml-latest/movies.csv"
TAGS_CSV = "/data/ml-latest/tags.csv"

movies = pd.read_csv(MOVIES_CSV)
tags = pd.read_csv(TAGS_CSV)

In [25]:
# Build one long (genre, movieId) table that holds both the catalogue genres
# and the user-assigned tags, so that tags can act as extra item features.

# One row per (movie, genre) pair: `genres` is a "|"-separated string, so split
# it into a list and explode the list into rows (vectorised, no Python loop).
movies_genres = (
    movies.assign(genre=movies["genres"].str.split("|"))
    .explode("genre")
    .loc[:, ["genre", "movieId"]]
    .reset_index(drop=True)
)

# Derive the tag rows from the raw `tags` frame WITHOUT mutating or rebinding
# it. The previous version dropped rows in place and renamed `tag` -> `genre`
# on `tags` itself, so running the cell a second time failed on the missing
# `tag` column.
tags_as_genres = (
    tags.dropna()                      # same rows as before: drop any row with a missing value
    .loc[:, ["movieId", "tag"]]
    .drop_duplicates()                 # keep each (movie, tag) pair once
    .rename(columns={"tag": "genre"})  # align the column name with movies_genres
)

# ignore_index gives the combined table a clean 0..n-1 index instead of
# repeating the index values of the two inputs.
movies_genres_tags = pd.concat([movies_genres, tags_as_genres], ignore_index=True)

# Integer code for every distinct genre/tag string (codes follow the sorted
# category order).
movies_genres_tags["genre_id"] = movies_genres_tags["genre"].astype("category").cat.codes
movies_genres_tags.head()

Unnamed: 0,genre,movieId,genre_id
0,Adventure,1,1164
1,Animation,1,1969
2,Children,1,4515
3,Comedy,1,5071
4,Fantasy,1,7971


In [3]:
# Genres only (no tags): one row per (movie, genre) pair.
# `genres` is a "|"-separated string; split it into a list and explode the list
# into rows instead of looping over the frame with iterrows().
movies_genres = (
    movies.assign(genre=movies["genres"].str.split("|"))
    .explode("genre")
    .loc[:, ["genre", "movieId"]]
    .reset_index(drop=True)
)

# Integer code for every distinct genre string (codes follow the sorted
# category order).
movies_genres["genre_id"] = movies_genres["genre"].astype("category").cat.codes
movies_genres.head()

Unnamed: 0,genre,movieId,genre_id
0,Adventure,1,2
1,Animation,1,3
2,Children,1,4
3,Comedy,1,5
4,Fantasy,1,9


In [4]:
# Ratings table; the cells below read its userId, movieId and rating columns.
RATINGS_CSV = "/data/ml-latest/ratings.csv"
ratings = pd.read_csv(RATINGS_CSV)

In [5]:
from scipy.sparse import coo_matrix
import numpy as np

# Binary "liked" matrix: an entry is 1.0 when the user rated the movie 4 or
# higher. Raw ids are used directly as row/column indices, hence the "+ 1" on
# the largest id when sizing the matrix.
liked = (ratings["rating"] >= 4).astype(np.float32)
n_users = ratings["userId"].max() + 1
n_items = movies["movieId"].max() + 1

user_item_matrix = coo_matrix(
    (liked, (ratings["userId"], ratings["movieId"])),
    shape=(n_users, n_items),
)
# Ratings below the threshold were stored as explicit 0.0 entries; drop them so
# that only the positive interactions remain stored.
user_item_matrix.eliminate_zeros()

In [6]:
import numpy as np

# Split the stored entries of the sparse matrix into train (80%) and test (20%).
total_len = user_item_matrix.data.size
train_len = int(total_len * 0.8)
all_indices = np.arange(total_len)
np.random.seed(42)  # fixed seed so the split is the same on every run
train_indices = np.random.choice(all_indices, train_len, replace=False)

# Boolean selector over the stored entries: True -> train, False -> test.
# Setting the sampled positions directly yields the same mask as
# np.in1d(all_indices, train_indices), without the deprecated call and without
# a membership test over every index.
train_mask = np.zeros(total_len, dtype=bool)
train_mask[train_indices] = True

In [7]:
from scipy.sparse import coo_matrix

def get_masked(arr, mask):
    """Return a COO matrix with only the stored entries of ``arr`` picked by ``mask``.

    Parameters
    ----------
    arr : scipy.sparse matrix
        Source matrix. Any sparse format is accepted; it is viewed as COO so
        that ``mask`` can index its stored entries. For non-COO input the
        entry order is the one produced by ``arr.tocoo()``.
    mask : array-like
        Boolean mask (or index array) applied to the ``data``, ``row`` and
        ``col`` arrays of the COO view.

    Returns
    -------
    scipy.sparse.coo_matrix
        float32 matrix with the same shape as ``arr`` that contains only the
        selected entries.
    """
    coo = arr.tocoo()
    return coo_matrix(
        (
            # Vectorised cast; avoids building a Python list with one
            # np.float32 object per stored value and keeps the float32 dtype
            # even when nothing is selected.
            coo.data[mask].astype(np.float32),
            (coo.row[mask], coo.col[mask]),
        ),
        shape=coo.shape,
    )

In [8]:
# Materialise both halves of the split: entries selected by the mask form the
# training matrix, the remaining entries form the test matrix.
train, test = (
    get_masked(user_item_matrix, selector)
    for selector in (train_mask, ~train_mask)
)

In [9]:
from scipy.sparse import save_npz

# Persist both halves of the split to disk.
for npz_path, split in (
    ("/data/other/lightfm_train.npz", train),
    ("/data/other/lightfm_test.npz", test),
):
    save_npz(npz_path, split)

In [2]:
from scipy.sparse import load_npz

# Reload the persisted train/test split.
train, test = (
    load_npz(npz_path)
    for npz_path in (
        "/data/other/lightfm_train.npz",
        "/data/other/lightfm_test.npz",
    )
)

In [1]:
from lightfm import LightFM

# Seed the model so that repeated runs start from the same initial state; the
# value matches the seed used for the train/test split.
# NOTE(review): training with num_threads > 1 may still not be bit-for-bit
# reproducible — TODO confirm against the LightFM docs.
# NOTE(review): the default loss is used although the interaction matrix only
# stores positive entries (zeros were eliminated) — check whether a ranking
# loss would suit this data better.
fm = LightFM(random_state=42)

# Сначала обучим модель без признаков (фичей) объектов.

In [None]:
%%time
# Fit on the training interactions only, without any item features.
fit_kwargs = {
    "interactions": train,
    "epochs": 100,
    "num_threads": 4,
    "verbose": True,
}
fm.fit(**fit_kwargs)

Epoch 0


In [12]:
%%time
from lightfm.evaluation import reciprocal_rank

# Reciprocal-rank scores on the test interactions; the training interactions
# are passed as well so the evaluation can take them into account.
eval_kwargs = {
    "model": fm,
    "test_interactions": test,
    "train_interactions": train,
    "num_threads": 4,
}
rr = reciprocal_rank(**eval_kwargs)

CPU times: user 46min 2s, sys: 15.2 s, total: 46min 17s
Wall time: 11min 39s


In [None]:
# Считаем метрику качества — средний reciprocal rank (MRR); чем выше, тем лучше

In [13]:
# Average of the reciprocal-rank scores (no item features).
np.mean(rr)

0.22554931

# Обучаем модель с признаками объектов: только жанры

In [10]:
from scipy.sparse import identity, hstack

# Item features = [genre indicator columns | identity block].
# The identity block gives every movie row its own indicator column in
# addition to the genre columns.
n_items = user_item_matrix.shape[1]
n_genres = movies_genres["genre_id"].nunique()

genre_indicators = coo_matrix(
    (
        # len() is the row count. The previous `.count()[0]` indexed a
        # label-indexed Series by position (deprecated in pandas) and counted
        # non-null values rather than rows.
        np.ones(len(movies_genres), dtype=np.float32),
        (movies_genres["movieId"], movies_genres["genre_id"]),
    ),
    shape=(n_items, n_genres),
)

item_feature_matrix = hstack([
    genre_indicators,
    # float32 like the genre block, so stacking does not upcast to float64.
    identity(n_items, dtype=np.float32),
])
item_feature_matrix.shape

(176280, 176300)

In [27]:
%%time
# Fit on the training interactions with the genre-based item features.
fit_kwargs = {
    "interactions": train,
    "epochs": 100,
    "item_features": item_feature_matrix,
    "num_threads": 4,
    "verbose": True,
}
fm.fit(**fit_kwargs)

NameError: name 'fm' is not defined

In [25]:
%%time
# Reciprocal-rank scores on the test interactions, using the same genre-based
# item features that were passed to fit().
eval_kwargs = {
    "model": fm,
    "test_interactions": test,
    "train_interactions": train,
    "item_features": item_feature_matrix,
    "num_threads": 4,
}
rr = reciprocal_rank(**eval_kwargs)

CPU times: user 1h 6min 23s, sys: 56.5 s, total: 1h 7min 20s
Wall time: 17min


## Считаем метрику качества — средний reciprocal rank (MRR)

In [26]:
# Average of the reciprocal-rank scores (genre features).
np.mean(rr)

0.0030269846

# Обучаем модель с признаками объектов: жанры и теги

In [28]:
from scipy.sparse import identity, hstack

# Item features = [genre/tag indicator columns | identity block].
# The identity block gives every movie row its own indicator column in
# addition to the genre/tag columns.
n_items = user_item_matrix.shape[1]
n_genres_tags = movies_genres_tags["genre_id"].nunique()

genre_tag_indicators = coo_matrix(
    (
        # len() is the row count. The previous `.count()[0]` indexed a
        # label-indexed Series by position (deprecated in pandas) and counted
        # non-null values rather than rows.
        np.ones(len(movies_genres_tags), dtype=np.float32),
        (movies_genres_tags["movieId"], movies_genres_tags["genre_id"]),
    ),
    shape=(n_items, n_genres_tags),
)

item_feature_tags_matrix = hstack([
    genre_tag_indicators,
    # float32 like the indicator block, so stacking does not upcast to float64.
    identity(n_items, dtype=np.float32),
])
item_feature_tags_matrix.shape

(176280, 229789)

In [None]:
%%time
# Fit on the training interactions with the genre + tag item features.
fit_kwargs = {
    "interactions": train,
    "epochs": 100,
    "item_features": item_feature_tags_matrix,
    "num_threads": 4,
    "verbose": True,
}
fm.fit(**fit_kwargs)

In [None]:
%%time
# Reciprocal-rank scores on the test interactions for the genre + tag model.
# The features passed here must be the ones the model was fitted with in the
# previous cell (item_feature_tags_matrix). The genres-only
# item_feature_matrix has a different number of feature columns.
rr = reciprocal_rank(
    model=fm,
    test_interactions=test,
    train_interactions=train,
    item_features=item_feature_tags_matrix,
    num_threads=4
)

# Считаем метрику качества — средний reciprocal rank (MRR)

In [24]:
# Average of the reciprocal-rank scores (genre + tag features).
np.mean(rr)

NameError: name 'rr' is not defined