In [1]:
import pandas as pd 
import numpy as np
from ast import literal_eval
from sklearn.model_selection import KFold
from tqdm import tqdm

from lightfm.data import Dataset
from lightfm import LightFM

In [2]:
df_items = pd.read_pickle('./test_items.pickle')
df = pd.read_pickle('./test_df.pickle')
df_users = pd.read_pickle('./test_df_users.pickle')

In [39]:
df_items

Unnamed: 0,id,name,russian,kind,score,aired_on,rating,episodes,fansubbers,fandubbers,genres,wr,features
0,1,Cowboy Bebop,Ковбой Бибоп,tv,8.75,1998-04-03,r_plus,26,"Dragon'Drop,Max Skuratov,Сергей Светличный,Suz...","Amazing Dubbing,Е. Лурье,SHIZA Project,SkyFy,D...","Action,Sci-Fi,Space",8.382060,"[Action,Sci-Fi,Space, tv]"
1,5,Cowboy Bebop: Tengoku no Tobira,Ковбой Бибоп: Достучаться до небес,movie,8.38,2001-09-01,r_plus,1,"А. Лапшин,UR-Goliath,5viN,Faddeich","SHIZA Project,3df voice,ВидеоСервис,НТВ Плюс,R...","Action,Sci-Fi,Space",7.602360,"[Action,Sci-Fi,Space, movie]"
2,6,Trigun,Триган,tv,8.22,1998-04-01,pg_13,26,"Schtirlitz & Nomad,Мега-Аниме","Digital Force,Мега-Аниме,QTV","Action,Adventure,Sci-Fi,Shounen",7.676009,"[Action,Adventure,Sci-Fi,Shounen, tv]"
3,7,Witch Hunter Robin,Робин — охотница на ведьм,tv,7.25,2002-07-03,pg_13,26,Svart,"AniDUB,Е. Лурье","Action,Drama,Mystery,Supernatural,Police",6.284465,"[Action,Drama,Mystery,Supernatural,Police, tv]"
4,8,Bouken Ou Beet,Приключения короля Бита,tv,6.94,2004-09-30,pg,52,,Get Smart Group,"Shounen,Adventure,Fantasy,Supernatural",5.935496,"[Shounen,Adventure,Fantasy,Supernatural, tv]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9388,25983,Private Eye Dol,Личный айдол,special,0.00,1995-08-12,pg_13,3,,,"Comedy,Mystery",5.912322,"[Comedy,Mystery, special]"
9389,25985,Contact,Контакт,special,4.84,2004-06-21,g,1,,,"Kids,Comedy,Sci-Fi",5.912062,"[Kids,Comedy,Sci-Fi, special]"
9390,25987,Tensai Bakavon: Yomigaeru Flanders no Inu,"Прости нас, Патраш!",movie,0.00,2015-05-23,g,1,,,"Comedy,Parody",5.916128,"[Comedy,Parody, movie]"
9391,25991,Bishoujo Mobage: Mobami-chan,Волшебница из мобильной игры — Мобами,ona,5.35,2014-08-14,pg_13,1,,,"Game,Music",5.907781,"[Game,Music, ona]"


In [3]:
dataset = Dataset()
dataset.fit(df['user_id'].unique(), df['item_id'].unique())

In [4]:
genres = df_items['genres'].unique()
kind = df_items['kind'].unique()
item_features = np.append(genres, kind)
item_features

array(['Action,Sci-Fi,Space', 'Action,Adventure,Sci-Fi,Shounen',
       'Action,Drama,Mystery,Supernatural,Police', ..., 'ona', 'music',
       nan], dtype=object)

In [5]:
age = df_users['age'].unique()
sex = df_users['sex'].unique()
user_features = np.append(age, sex)
user_features

array(['age_unknown', '26_29', '6_24', '29_35', '24_26', 'male', 'nan',
       'female'], dtype=object)

In [41]:
user_features

array(['age_unknown', '26_29', '6_24', '29_35', '24_26', 'male', 'nan',
       'female'], dtype=object)

In [42]:
item_features

array(['Action,Sci-Fi,Space', 'Action,Adventure,Sci-Fi,Shounen',
       'Action,Drama,Mystery,Supernatural,Police', ..., 'ona', 'music',
       nan], dtype=object)

In [6]:
df_users['features'] = df_users[['age', 'sex']].astype(str).apply(lambda x: list(x), axis=1)

In [7]:
df_items['features'] = df_items[['genres', 'kind']].astype(str).apply(lambda x: list(x), axis=1)

In [8]:
dataset.fit_partial(user_features=user_features)
dataset.fit_partial(item_features=item_features)

In [9]:
num_users, num_items = dataset.interactions_shape()
num_users, num_items

(207, 5720)

In [10]:
lightfm_mapping = dataset.mapping()
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'user_features_mapping': lightfm_mapping[1],
    'items_mapping': lightfm_mapping[2],
    'item_features_mapping': lightfm_mapping[3],
}
print('users_mapping len - ', len(lightfm_mapping['users_mapping']))
print('user_features_mapping len - ', len(lightfm_mapping['user_features_mapping']))
print('items_mapping len - ', len(lightfm_mapping['items_mapping']))
print('Users item_features_mapping len - ', len(lightfm_mapping['item_features_mapping']))

users_mapping len -  207
user_features_mapping len -  215
items_mapping len -  5720
Users item_features_mapping len -  8582


In [11]:
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

In [12]:
num_user_features = dataset.user_features_shape()
num_show_features = dataset.item_features_shape()
print('Num user features: {} -> {}\nnum item features: {} -> {}.'.format(
    num_user_features[1] - num_users, num_user_features[1], 
    num_show_features[1] - num_items, num_show_features[1]))

Num user features: 8 -> 215
num item features: 2862 -> 8582.


In [13]:
def df_to_tuple_iterator(df):
    return zip(*df.values.T)

def concat_last_to_list(t):
    return (t[0], list(t[1:])[0])

def df_to_tuple_list_iterator(df):
    return map(concat_last_to_list, zip(*df.values.T))

In [14]:
train_mat, train_mat_weights = dataset.build_interactions(df_to_tuple_iterator(df[['user_id', 'item_id']]))
train_mat

<207x5720 sparse matrix of type '<class 'numpy.int32'>'
	with 50078 stored elements in COOrdinate format>

In [15]:
known_users_filter = df_users['user_id'].isin(df['user_id'].unique())
train_user_features = dataset.build_user_features(
    df_to_tuple_list_iterator(
        df_users.loc[known_users_filter, ['user_id', 'features']]
    )
)
train_user_features

<207x215 sparse matrix of type '<class 'numpy.float32'>'
	with 611 stored elements in Compressed Sparse Row format>

In [16]:
known_items_filter = df_items['id'].isin(df['item_id'].unique())
train_items_features = dataset.build_item_features(
    df_to_tuple_list_iterator(
        df_items.loc[known_items_filter, ['id', 'features']]
    )
)
train_items_features

<5720x8582 sparse matrix of type '<class 'numpy.float32'>'
	with 14556 stored elements in Compressed Sparse Row format>

In [17]:
lfm_model = LightFM(no_components=64, learning_rate=0.05, loss='warp', random_state=23)

In [18]:
num_epochs = 15
for _ in tqdm(range(num_epochs), total=num_epochs):
    lfm_model.fit_partial(
        train_mat, 
        user_features=train_user_features,
        item_features=train_items_features,
        num_threads=4
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  4.05it/s]


In [19]:
all_cols = list(lightfm_mapping['items_mapping'].values())
len(all_cols)

5720

In [20]:
top_N = 10
user_id = df['user_id'].iloc[0]
row_id = lightfm_mapping['users_mapping'][user_id]
print(f'Рекомендации для пользователя {user_id}, номер строки - {row_id}')

Рекомендации для пользователя 1, номер строки - 0


In [21]:
pred = lfm_model.predict(row_id, all_cols, user_features=train_user_features, item_features=train_items_features, num_threads=4)
pred, pred.shape

(array([-27.947248, -27.71055 , -28.320185, ..., -34.127213, -34.68726 ,
        -33.90762 ], dtype=float32),
 (5720,))

In [22]:
top_cols = np.argpartition(pred, -np.arange(top_N))[-top_N:][::-1]
top_cols

array([113, 139, 243,  55, 132, 168, 312, 178, 200, 148])

In [23]:
pred[top_cols]

array([-24.440556, -24.522242, -24.555445, -24.771917, -24.8423  ,
       -25.076168, -25.079773, -25.142256, -25.16427 , -25.259272],
      dtype=float32)

In [24]:
np.max(pred)

-24.440556

In [25]:
#items = pd.read_csv('./test.csv', names=['id', 'name', 'russian', 'score', 'aired', 'duration', 'genres', 'rates'])
item_titles = pd.Series(df_items['name'].values, index=df_items['id']).to_dict()

In [26]:
recs = pd.DataFrame({'col_id': top_cols})
recs['item_id'] = recs['col_id'].map(lightfm_mapping['items_inv_mapping'].get)
recs['title'] = recs['item_id'].map(item_titles.get)
recs

Unnamed: 0,col_id,item_id,title
0,113,849,Suzumiya Haruhi no Yuuutsu
1,139,1535,Death Note
2,243,4224,Toradora!
3,55,245,Great Teacher Onizuka
4,132,1195,Zero no Tsukaima
5,168,2025,Darker than Black: Kuro no Keiyakusha
6,312,6547,Angel Beats!
7,178,2167,Clannad
8,200,2966,Ookami to Koushinryou
9,148,1575,Code Geass: Hangyaku no Lelouch


In [27]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, user_features, item_features, N, user_mapping, item_inv_mapping, num_threads=4):
    def _recs_mapper(user):
        user_id = user_mapping[user]
        recs = model.predict(user_id, item_ids, user_features=user_features, item_features=item_features, num_threads=num_threads)
        
        additional_N = len(known_items[user_id]) if user_id in known_items else 0
        total_N = N + additional_N
        top_cols = np.argpartition(recs, -np.arange(total_N))[-total_N:][::-1]
        
        final_recs = [item_inv_mapping[item] for item in top_cols]
        if additional_N > 0:
            filter_items = known_items[user_id]
            final_recs = [item for item in final_recs if item not in filter_items]
        return final_recs[:N]
    return _recs_mapper

In [28]:
known_items = df.groupby('user_id')['item_id'].apply(list).to_dict()
len(known_items)

207

In [29]:
mapper = generate_lightfm_recs_mapper(
    lfm_model, 
    item_ids=all_cols, 
    known_items=known_items,
    N=top_N,
    user_features=train_user_features, 
    item_features=train_items_features, 
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=4
)

In [30]:
recs = pd.DataFrame({
    'user_id': df['user_id'].unique()
})

In [31]:
%%time
recs['item_id'] = recs['user_id'].map(mapper)

CPU times: user 2.62 s, sys: 7.48 ms, total: 2.63 s
Wall time: 907 ms


In [32]:
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1

In [33]:
recs['title'] = recs['item_id'].map(item_titles.get)

In [34]:
recs

Unnamed: 0,user_id,item_id,rank,title
0,1,849,1,Suzumiya Haruhi no Yuuutsu
0,1,1535,2,Death Note
0,1,4224,3,Toradora!
0,1,245,4,Great Teacher Onizuka
0,1,1195,5,Zero no Tsukaima
...,...,...,...,...
206,250056,11757,6,Sword Art Online
206,250056,29803,7,
206,250056,29786,8,
206,250056,32937,9,


In [35]:
demid_recs = recs.tail(10)

In [36]:
demid_recs

Unnamed: 0,user_id,item_id,rank,title
206,250056,31240,1,
206,250056,28121,2,
206,250056,30831,3,
206,250056,31043,4,
206,250056,26243,5,
206,250056,11757,6,Sword Art Online
206,250056,29803,7,
206,250056,29786,8,
206,250056,32937,9,
206,250056,28171,10,


In [37]:
mapper = df[['item_id', 'item_name']].drop_duplicates()

In [38]:
pd.merge(demid_recs, mapper, left_on='item_id', right_on='item_id',  how='left')

Unnamed: 0,user_id,item_id,rank,title,item_name
0,250056,31240,1,,Re:Zero kara Hajimeru Isekai Seikatsu
1,250056,28121,2,,Dungeon ni Deai wo Motomeru no wa Machigatteir...
2,250056,30831,3,,Kono Subarashii Sekai ni Shukufuku wo!
3,250056,31043,4,,Boku dake ga Inai Machi
4,250056,26243,5,,Owari no Seraph
5,250056,11757,6,Sword Art Online,Sword Art Online
6,250056,29803,7,,Overlord
7,250056,29786,8,,Shimoneta to Iu Gainen ga Sonzai Shinai Taikut...
8,250056,32937,9,,Kono Subarashii Sekai ni Shukufuku wo! 2
9,250056,28171,10,,Shokugeki no Souma
