# Библиотека LightFM

### Ноутбук подготовлен на основе материала к вебинару "Обзор библиотеки LightFM" (youtube-канал компании Mindset)

In [1]:
%%capture
%%bash
pip install lightfm

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

### Тестовый пример

In [3]:
data = fetch_movielens(min_rating=5.0)

In [4]:
for k,v in data.items():
  print(k)

train
test
item_features
item_feature_labels
item_labels


In [5]:
print(data)

{'train': <943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 19048 stored elements in COOrdinate format>, 'test': <943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 2153 stored elements in COOrdinate format>, 'item_features': <1682x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 1682 stored elements in Compressed Sparse Row format>, 'item_feature_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
       'Sliding Doors (1998)', 'You So Crazy (1994)',
       'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object), 'item_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
       'Sliding Doors (1998)', 'You So Crazy (1994)',
       'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)}


In [6]:
model = LightFM(loss = 'warp')
model.fit(data['train'], epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fb43e25b710>

In [7]:
#Рекомендательная функция
def sample_recommendation(model, data, user_ids):
    #Число пользователей и фильмов в обучающем наборе
    n_users, n_items = data['train'].shape
    for user_id in user_ids:
    	#Фильмы, которые уже понравились пользователям
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
        #Предсказание фильмов, которые им могут понравяться
        scores = model.predict(user_id, np.arange(n_items))
        #Сортировка результата по оценке
        top_items = data['item_labels'][np.argsort(-scores)]
        #Отображение результатов
        print("User %s" % user_id)
        print("   Known positives:")

        for x in known_positives[:3]:
            print("%s" % x)

        print("   Recommended:")

        for x in top_items[:3]:
            print("%s" % x)

In [8]:
sample_recommendation(model, data, [3, 25, 451])

User 3
   Known positives:
Contact (1997)
Air Force One (1997)
In & Out (1997)
   Recommended:
Chasing Amy (1997)
Good Will Hunting (1997)
Kiss the Girls (1997)
User 25
   Known positives:
Fargo (1996)
Godfather, The (1972)
L.A. Confidential (1997)
   Recommended:
Fargo (1996)
Star Wars (1977)
Godfather, The (1972)
User 451
   Known positives:
Twelve Monkeys (1995)
Braveheart (1995)
Bad Boys (1995)
   Recommended:
Star Wars (1977)
Fargo (1996)
Princess Bride, The (1987)


### Демонстрационный пример

In [9]:
movie_metadata = pd.read_csv('datasets/movies_metadata.csv', low_memory=False)[['id','original_title','overview','genres']].set_index('original_title').dropna()

In [10]:
movie_metadata.head(5)

Unnamed: 0_level_0,id,overview,genres
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story,862,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
Jumanji,8844,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
Grumpier Old Men,15602,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
Waiting to Exhale,31357,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
Father of the Bride Part II,11862,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]"


In [11]:
rating_dataset = pd.read_csv('datasets/ratings.csv', low_memory=False)[['userId','movieId','rating']].dropna()

In [12]:
rating_dataset.head(5)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [13]:
# Убираем фильмы и пользователей с малым количеством отзывов
filter_movies = (rating_dataset['movieId'].value_counts()>100)
filter_movies = filter_movies[filter_movies].index.tolist()

filter_users = (rating_dataset['userId'].value_counts()>200)
filter_users = filter_users[filter_users].index.tolist()

rating_dataset_filtered = rating_dataset[(rating_dataset['movieId'].isin(filter_movies)) & (rating_dataset['userId'].isin(filter_users))]
del filter_movies, filter_users
print('Shape User-Ratings unfiltered:\t{}'.format(rating_dataset.shape))
print('Shape User-Ratings filtered:\t{}'.format(rating_dataset_filtered.shape))

Shape User-Ratings unfiltered:	(100004, 3)
Shape User-Ratings filtered:	(10549, 3)


In [14]:
rating_dataset_filtered.head(5)

Unnamed: 0,userId,movieId,rating
147,4,10,4.0
148,4,34,5.0
151,4,153,4.0
153,4,185,3.0
154,4,260,5.0


In [15]:
rating_dataset_filtered_shuffled = rating_dataset_filtered.sample(frac=1).reset_index(drop=True)
rating_dataset_filtered_shuffled.head(5)

Unnamed: 0,userId,movieId,rating
0,23,316,3.5
1,88,2329,2.5
2,665,2628,3.0
3,452,1721,1.0
4,615,608,4.0


In [16]:
n = 1500
rating_dataset_train = rating_dataset_filtered_shuffled[:-n]
rating_dataset_test = rating_dataset_filtered_shuffled[-n:]

In [17]:
rating_dataset_train.head(5)

Unnamed: 0,userId,movieId,rating
0,23,316,3.5
1,88,2329,2.5
2,665,2628,3.0
3,452,1721,1.0
4,615,608,4.0


In [18]:
#Создадим User-Movie-matrix
user_movie_matrix = rating_dataset_train.pivot_table(index='userId', columns='movieId', values='rating')
print('Shape User-Movie-Matrix:\t{}'.format(user_movie_matrix.shape))
user_movie_matrix.sample(3)

Shape User-Movie-Matrix:	(137, 149)


movieId,1,2,6,10,25,32,34,36,39,47,50,110,111,150,153,165,185,208,223,231,253,260,288,292,293,296,316,318,329,339,344,349,356,357,364,367,377,380,434,454,...,2571,2628,2683,2706,2716,2762,2791,2797,2858,2916,2918,2959,2987,2997,3114,3147,3578,3793,3996,4226,4306,4886,4896,4963,4973,4993,4995,5349,5445,5952,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
544,,,4.5,,,,,,,5.0,,4.0,4.5,5.0,,,,,5.0,,,4.0,,,4.5,4.5,,,,,,,4.0,,,,,,,,...,4.0,,,,4.5,,,,4.0,,5.0,,,,,,4.0,,,5.0,,,,4.5,3.5,,,,,4.0,,,4.5,4.0,,4.5,,4.5,4.5,4.0
501,5.0,,,,,,,4.5,,5.0,3.5,,0.5,,,,,,,,,,,,,5.0,,5.0,,,,,,,3.5,3.0,,,,,...,4.0,,,,,4.0,,,,,,5.0,,3.5,5.0,,4.5,,,4.5,4.5,4.0,4.0,4.5,4.0,4.5,3.5,,,4.0,4.0,,,4.5,5.0,,4.5,,3.5,
587,,,,,4.0,,3.5,,,3.5,4.5,,4.0,4.5,,,,,,0.5,,,,,,3.5,,4.5,,,,,,,,,3.5,,,,...,3.5,,,,,4.0,3.5,,4.5,,,,,4.0,,,,,5.0,3.5,4.0,,,,3.5,,3.5,2.5,,4.0,,3.5,,,4.0,,,,,


In [19]:
#Создадим маппинг для пользователей и фильмов
user_id_mapping = {id:i for i, id in enumerate(rating_dataset_filtered['userId'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(rating_dataset_filtered['movieId'].unique())}

In [20]:
#Применим его к обучающему и тренировочному набору
train_user_data = rating_dataset_train['userId'].map(user_id_mapping)
train_movie_data = rating_dataset_train['movieId'].map(movie_id_mapping)
test_user_data = rating_dataset_test['userId'].map(user_id_mapping)
test_movie_data = rating_dataset_test['movieId'].map(movie_id_mapping)

In [21]:
train_user_data.head(5)

0      5
1     13
2    136
3     89
4    130
Name: userId, dtype: int64

In [22]:
#Создадим разреженную матрицу рейтинга
shape = (len(user_id_mapping), len(movie_id_mapping))
train_matrix = coo_matrix((rating_dataset_train['rating'].values, (train_user_data.astype(int), train_movie_data.astype(int))), shape=shape)
test_matrix = coo_matrix((rating_dataset_test['rating'].values, (test_user_data.astype(int), test_movie_data.astype(int))), shape=shape)

In [23]:
#Создадим модель LightFM и обучим ее
model = LightFM(loss='warp')
model.fit(train_matrix, epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fb434e97d90>

In [24]:
k = 20
print('Train precision at k={}:\t{:.4f}'.format(k, precision_at_k(model, train_matrix, k=k).mean()))
print('Test precision at k={}:\t\t{:.4f}'.format(k, precision_at_k(model, test_matrix, k=k).mean()))

Train precision at k=20:	0.8573
Test precision at k=20:		0.0372
