In [2]:
from lightfm import LightFM
from scipy.sparse import coo_matrix
import pandas as pd
import ml_metrics as metrics

# validate

In [72]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')

user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

test_1 = user_hist_df[user_hist_df['month'] == 6]
test_2 = user_hist_df[user_hist_df['month'] == 7]
user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]
train = user_hist_df[~user_hist_df['month'].isin([6, 7])]

In [73]:
correct_1 = test_1.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()
correct_2 = test_2.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()


In [74]:
train['event'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [75]:
train = train.drop_duplicates(['user_id', 'movie_id', 'event'])

In [76]:
model = LightFM(loss='logistic', no_components=10, random_state = 42)

train_df = train[['user_id', 'movie_id', 'event']]

In [77]:
user_encode = {u: i for i, u in enumerate(train['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train['movie_id'].unique())}

user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(2055, 4063)

In [78]:
train['user_id'] = train['user_id'].apply(lambda x: user_encode[x])
train['movie_id'] = train['movie_id'].apply(lambda x: item_encode[x])

In [79]:
train_coo = coo_matrix(
    (train['event'], (train['user_id'], train['movie_id'])), shape=(n_users, n_items))

In [87]:
movies_database = pd.read_csv('../data/raw/movies.csv')
new_movies = movies_database[(movies_database['year']>= 2010) & 
                             (movies_database['id'].isin(list(item_encode.keys()))) &
                             (movies_database['imdb_rating']>= 6)]['id'].values

moveis_views = train[train['movie_id'].isin([item_encode[x] for x in new_movies])]['movie_id'].value_counts()

items_to_recom = moveis_views[moveis_views>20].index
print(len(items_to_recom))

792


In [88]:
users_to_predict = [user_encode[x] for x in list(test_1['user_id'])]
items_to_predict = set(items_to_recom.copy())

In [89]:
model.fit(train_coo, epochs=20)

<lightfm.lightfm.LightFM at 0x11cb54a10>

In [90]:
user_hist = train_df[train_df['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()
user_hist = {user_encode[k]: v for k, v in user_hist.items()}

In [96]:
recoms = {}
num_to_recom = 5

reco = []
corr = []


for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    #items_to_score = list(items_to_predict)
    #print(len(items_to_score))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]
    
    reco.append(recoms[user_decode[user]])
    corr.append(correct_1[user_decode[user]])

metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.008411392405063292

0.009470141150922911

In [92]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

fav_serials = user_hist_df[(user_hist_df['episode_id'] != 0) & 
                           (user_hist_df['month'] == 5)].groupby(['user_id', 'movie_id'])['movie_id'].count()
fav_serials = fav_serials[fav_serials>=2].reset_index(name='cnt')

g = fav_serials.groupby(["user_id"]).apply(lambda x: x.sort_values(["cnt"], ascending = False)).reset_index(drop=True)

user_top_5_serials = g.groupby(["user_id"])['movie_id'].apply(lambda x: list(x[:5])).to_dict()



In [93]:
recoms_f = {}
num_to_recom = 5
for user in users_to_predict:
    user = user_decode[user]
    if user_top_5_serials.get(user):
    
        pref_serials = user_top_5_serials[user][:1]
        num_of_serials = len(pref_serials)

        if num_of_serials < 5:
            additional = 5 - num_of_serials
            light_fm_pred = recoms[user][:additional]

            pred = pref_serials + light_fm_pred
            
        else:
            pred = pref_serials
    else:
         pred = recoms[user]
            
    recoms_f[user] = pred

metrics.mapk(list(recoms_f.values()), list(correct_1.values()), 5)

0.00840084388185654

In [27]:
users_to_predict = [user_encode[x] for x in list(test_2['user_id'])]

In [28]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [29]:
metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.009548317046688381

on LB 0.00786

after tuning 0.00619