In [1]:
import pandas as pd 
import numpy as np
import joblib
from ast import literal_eval

In [None]:
DUMP = True

In [2]:
df_items = pd.read_csv('./data/items.csv', names = ['id', 'name', 'russian', 'kind', 'item_episodes', 'item_aired', 'item_score'])
df = pd.read_csv('./data/interactions.csv', names = ['user_id', 'user_name', 'rating', 'timestamp', 'item_id', 'item_name', 'item_russian'])
df_users = pd.read_csv('./data/users.csv', names = ['user_id', 'sex', 'age'])

In [3]:
animes = pd.read_csv('./data/anime_preprocessong.csv')
genres_map = pd.Series(animes['genres'].values, index=animes['id']).to_dict()
df_items['genres'] = df_items['id'].map(genres_map.get)

In [4]:
df_items['genres'] = df_items['genres'].fillna('genres_unknown')
df_items['kind'] = df_items['kind'].fillna('kind_unknown')
genres = df_items['genres'].unique()
kind = df_items['kind'].unique()
item_features = np.append(genres, kind)
item_features

array(['Action,Drama,Mystery,Supernatural,Police',
       'Comedy,Drama,Romance,Josei', 'Sports,School,Shounen', ..., 'ona',
       'music', 'kind_unknown'], dtype=object)

In [5]:
df_users['age'] = pd.qcut(df_users['age'], 4, labels=['6_24', '24_26', '26_29', '29_35']).astype('category')
df_users['age'] = df_users['age'].cat.add_categories('age_unknown')
df_users['age'] = df_users['age'].fillna('age_unknown')
age_features = df_users['age'].unique()
age_features

['age_unknown', '26_29', '6_24', '29_35', '24_26']
Categories (5, object): ['6_24' < '24_26' < '26_29' < '29_35' < 'age_unknown']

In [6]:
df_users['sex'] = np.array(df_users['sex'].astype(str))
df_users['sex'] = df_users['sex'].fillna('sex_unknown')
sex_features = df_users['sex'].unique()
sex_features

array(['male', 'nan', 'female'], dtype=object)

In [7]:
user_features = np.append(age_features, sex_features)
user_features

array(['age_unknown', '26_29', '6_24', '29_35', '24_26', 'male', 'nan',
       'female'], dtype=object)

In [8]:
df_users['features'] = df_users[['age', 'sex']].astype(str).apply(lambda x: list(x), axis=1)

In [9]:
df_items['features'] = df_items[['genres', 'kind']].astype(str).apply(lambda x: list(x), axis=1)

In [10]:
from lightfm.data import Dataset
from lightfm import LightFM
from sklearn.model_selection import KFold
from tqdm import tqdm

In [11]:
dataset = Dataset()
dataset.fit(df['user_id'].unique(), df['item_id'].unique())

In [12]:
dataset.fit_partial(user_features=user_features)
dataset.fit_partial(item_features=item_features)

In [13]:
num_users, num_items = dataset.interactions_shape()
num_users, num_items

(31526, 14717)

In [14]:
lightfm_mapping = dataset.mapping()
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'user_features_mapping': lightfm_mapping[1],
    'items_mapping': lightfm_mapping[2],
    'item_features_mapping': lightfm_mapping[3],
}
print('users_mapping len - ', len(lightfm_mapping['users_mapping']))
print('user_features_mapping len - ', len(lightfm_mapping['user_features_mapping']))
print('items_mapping len - ', len(lightfm_mapping['items_mapping']))
print('Users item_features_mapping len - ', len(lightfm_mapping['item_features_mapping']))

users_mapping len -  31526
user_features_mapping len -  31534
items_mapping len -  14717
Users item_features_mapping len -  17497


In [15]:
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

In [16]:
num_user_features = dataset.user_features_shape()
num_show_features = dataset.item_features_shape()
print('Num user features: {} -> {}\nnum item features: {} -> {}.'.format(
    num_user_features[1] - num_users, num_user_features[1], 
    num_show_features[1] - num_items, num_show_features[1]))

Num user features: 8 -> 31534
num item features: 2780 -> 17497.


In [17]:
def df_to_tuple_iterator(df):
    return zip(*df.values.T)

def concat_last_to_list(t):
    return (t[0], list(t[1:])[0])

def df_to_tuple_list_iterator(df):
    return map(concat_last_to_list, zip(*df.values.T))

In [18]:
train_mat, train_mat_weights = dataset.build_interactions(df_to_tuple_iterator(df[['user_id', 'item_id']]))
train_mat

<31526x14717 sparse matrix of type '<class 'numpy.int32'>'
	with 5828721 stored elements in COOrdinate format>

In [19]:
known_users_filter = df_users['user_id'].isin(df['user_id'].unique())
train_user_features = dataset.build_user_features(
    df_to_tuple_list_iterator(
        df_users.loc[known_users_filter, ['user_id', 'features']]
    )
)
train_user_features

<31526x31534 sparse matrix of type '<class 'numpy.float32'>'
	with 93364 stored elements in Compressed Sparse Row format>

In [20]:
known_items_filter = df_items['id'].isin(df['item_id'].unique())
train_items_features = dataset.build_item_features(
    df_to_tuple_list_iterator(
        df_items.loc[known_items_filter, ['id', 'features']]
    )
)
train_items_features

<14717x17497 sparse matrix of type '<class 'numpy.float32'>'
	with 44151 stored elements in Compressed Sparse Row format>

In [21]:
lfm_model = LightFM(no_components=64, learning_rate=0.05, loss='warp', random_state=23)

In [22]:
num_epochs = 15
for _ in tqdm(range(num_epochs), total=num_epochs):
    lfm_model.fit_partial(
        train_mat, 
        user_features=train_user_features,
        item_features=train_items_features,
        num_threads=4
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [06:25<00:00, 25.67s/it]


In [23]:
all_cols = list(lightfm_mapping['items_mapping'].values())
len(all_cols)

14717

In [24]:
top_N = 10
user_id = df['user_id'].iloc[0]
row_id = lightfm_mapping['users_mapping'][user_id]
print(f'Рекомендации для пользователя {user_id}, номер строки - {row_id}')

Рекомендации для пользователя 1, номер строки - 0


In [25]:
pred = lfm_model.predict(row_id, all_cols, user_features=train_user_features, item_features=train_items_features, num_threads=4)
pred, pred.shape

(array([-185.76512, -185.5178 , -187.2735 , ..., -197.6853 , -196.62538,
        -197.7492 ], dtype=float32),
 (14717,))

In [26]:
top_cols = np.argpartition(pred, -np.arange(top_N))[-top_N:][::-1]
top_cols

array([139, 148, 533,  55,  11, 132, 133, 164, 225,  27])

In [27]:
pred[top_cols]

array([-183.99722, -184.10017, -184.17398, -184.32202, -184.32478,
       -184.47348, -184.47845, -184.48257, -184.48944, -184.49306],
      dtype=float32)

In [29]:
item_titles = pd.Series(df_items['name'].values, index=df_items['id']).to_dict()

In [30]:
recs = pd.DataFrame({'col_id': top_cols})
recs['item_id'] = recs['col_id'].map(lightfm_mapping['items_inv_mapping'].get)
recs['title'] = recs['item_id'].map(item_titles.get)
recs

Unnamed: 0,col_id,item_id,title
0,139,1535,Death Note
1,148,1575,Code Geass: Hangyaku no Lelouch
2,533,226,Elfen Lied
3,55,245,Great Teacher Onizuka
4,11,71,Full Metal Panic!
5,132,1195,Zero no Tsukaima
6,133,1210,NHK ni Youkoso!
7,164,1840,Zero no Tsukaima: Futatsuki no Kishi
8,225,3712,Zero no Tsukaima: Princesses no Rondo
9,27,121,Fullmetal Alchemist


In [31]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, user_features, item_features, N, user_mapping, item_inv_mapping, num_threads=4):
    def _recs_mapper(user):
        user_id = user_mapping[user]
        recs = model.predict(user_id, item_ids, user_features=user_features, item_features=item_features, num_threads=num_threads)
        
        additional_N = len(known_items[user_id]) if user_id in known_items else 0
        total_N = N + additional_N
        top_cols = np.argpartition(recs, -np.arange(total_N))[-total_N:][::-1]
        
        final_recs = [item_inv_mapping[item] for item in top_cols]
        if additional_N > 0:
            filter_items = known_items[user_id]
            final_recs = [item for item in final_recs if item not in filter_items]
        return final_recs[:N]
    return _recs_mapper

In [32]:
known_items = df.groupby('user_id')['item_id'].apply(list).to_dict()
len(known_items)

31526

In [33]:
mapper = generate_lightfm_recs_mapper(
    lfm_model, 
    item_ids=all_cols, 
    known_items=known_items,
    N=top_N,
    user_features=train_user_features, 
    item_features=train_items_features, 
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=4
)

In [34]:
recs = pd.DataFrame({
    'user_id': df['user_id'].unique()
})

In [35]:
%%time
recs['item_id'] = recs['user_id'].map(mapper)

CPU times: user 10min 49s, sys: 935 ms, total: 10min 50s
Wall time: 3min 2s


In [36]:
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1

In [37]:
recs['title'] = recs['item_id'].map(item_titles.get)

In [43]:
recs.tail(10)

Unnamed: 0,user_id,item_id,rank,title
31525,99999,1535,1,Death Note
31525,99999,10408,2,Hotarubi no Mori e
31525,99999,431,3,Howl no Ugoku Shiro
31525,99999,199,4,Sen to Chihiro no Kamikakushi
31525,99999,523,5,Tonari no Totoro
31525,99999,9253,6,Steins;Gate
31525,99999,10620,7,Mirai Nikki (TV)
31525,99999,22319,8,Tokyo Ghoul
31525,99999,4898,9,Kuroshitsuji
31525,99999,3588,10,Soul Eater


In [None]:
if DUMP:
    import os 
    os.system('mkdir models')
    joblib.dump(lfm_model, './models/lightfm.model')