In [262]:
from lightfm import LightFM
from scipy.sparse import coo_matrix
import pandas as pd

In [263]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
sbmt = pd.read_csv('../data/raw/submission.csv')

In [264]:
user_hist_df['event'] = 1

In [80]:
user_hist_df.shape

(346408, 5)

In [265]:
user_hist_df = user_hist_df.drop_duplicates(['user_id', 'movie_id', 'event'])

In [266]:
user_hist_df.shape

(241336, 5)

In [270]:
params = {'learning_schedule': 'adadelta',
 'no_components': 16,
 'loss': 'warp-kos',
 'num_epochs': 8,
 'learning_rate': 0.32177834477114553}

In [271]:
num_epochs = params.pop('num_epochs')

model = LightFM(**params)

train_df = user_hist_df[['user_id', 'movie_id', 'event']]

In [272]:
user_encode = {u: i for i, u in enumerate(train_df['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train_df['movie_id'].unique())}

In [273]:
user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

In [274]:
n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(2055, 4843)

In [275]:
train_df['user_id'] = train_df['user_id'].apply(lambda x: user_encode[x])

train_df['movie_id'] = train_df['movie_id'].apply(lambda x: item_encode[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [276]:
train_coo = coo_matrix(
    (train_df['event'], (train_df['user_id'], train_df['movie_id'])), shape=(n_users, n_items))

In [278]:
moveis_views = user_hist_df['movie_id'].value_counts()

items_to_recom = moveis_views[moveis_views>100].index
print(len(items_to_recom))

717


In [279]:
users_to_predict = [user_encode[x] for x in list(sbmt['user_id'])]
items_to_predict = [item_encode[x] for x in items_to_recom]

In [280]:
model.fit(train_coo, epochs=num_epochs)

<lightfm.lightfm.LightFM at 0x127cb97d0>

In [281]:
user_hist = train_df[train_df['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()

In [282]:
items_to_predict = set(items_to_predict)

In [283]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [284]:
import sys

PATH = '/Users/danil/Documents/github/sweet_RS/'
sys.path.append(str(PATH))

In [285]:
from src.utils import save_to_pickle

In [286]:
save_to_pickle(recoms, '../data/processed/light_fm_tuned.pickle')

# validate

In [215]:
import ml_metrics as metrics

In [216]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

test_1 = user_hist_df[user_hist_df['month'] == 6]
test_2 = user_hist_df[user_hist_df['month'] == 7]
train = user_hist_df[~user_hist_df['month'].isin([6, 7])]

In [217]:
correct_1 = test_1.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()
correct_2 = test_2.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()


In [218]:
train['event'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [219]:
train = train.drop_duplicates(['user_id', 'movie_id', 'event'])

In [220]:
model = LightFM(no_components=10)

train_df = train[['user_id', 'movie_id', 'event']]

In [221]:
user_encode = {u: i for i, u in enumerate(train_df['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train_df['movie_id'].unique())}

In [222]:
user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

In [223]:
n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(2055, 4732)

In [224]:
train_df['user_id'] = train_df['user_id'].apply(lambda x: user_encode[x])
train_df['movie_id'] = train_df['movie_id'].apply(lambda x: item_encode[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [225]:
train_coo = coo_matrix(
    (train_df['event'], (train_df['user_id'], train_df['movie_id'])), shape=(n_users, n_items))

In [257]:
moveis_views = train_df['movie_id'].value_counts()

items_to_recom = moveis_views[moveis_views>100].index
print(len(items_to_recom))

583


In [249]:
users_to_predict = [user_encode[x] for x in list(test_1['user_id'])]
items_to_predict = set(items_to_recom.copy())

In [228]:
model.fit(train_coo, epochs=20)

<lightfm.lightfm.LightFM at 0x11d091610>

In [229]:
user_hist = train[train['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()
user_hist = {user_encode[k]: v for k, v in user_hist.items()}

In [251]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    #print(len(items_to_score))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.007202531645569621

In [158]:
users_to_predict = [user_encode[x] for x in list(test_2['user_id'])]

In [159]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [160]:
metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.007573839662447258

on LB 0.00786

after tuning 0.00619

# hyperparam

In [193]:
import optuna

In [212]:
def validate_cv(params):
    num_epochs = params.pop('num_epochs')
    model = LightFM(**params)
    model.fit(train_coo, epochs = num_epochs)
    
    recoms = {}
    num_to_recom = 5
    for user in users_to_predict:
        #print(user)
        items_to_score = list(items_to_predict.difference(user_hist[user]))
        predict = model.predict(user, items_to_score, num_threads=-1)
        top_recoms_id = sorted(range(len(predict)),
                               key=lambda i: predict[i])[-num_to_recom:]
        top_recoms_id.reverse()
        recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                     for i in top_recoms_id]

    score = metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)
    return score


def objective(trial):    
    params = {'learning_schedule': trial.suggest_categorical('learning_schedule', ["adagrad", "adadelta"]),
              'no_components': trial.suggest_int('no_components', 5, 30),
              'loss': trial.suggest_categorical('loss', ["bpr", "warp", "warp-kos"]),
              'num_epochs': trial.suggest_int('num_epochs', 5, 30),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.5)}
    
    
    return validate_cv(params)

In [259]:
#study_1 = optuna.create_study(direction="maximize")
# study = joblib.load('../optuna_checkpoints/study.pkl')
study_1.optimize(objective, n_trials=100)

[32m[I 2020-11-14 22:13:55,314][0m Trial 20 finished with value: 0.006713080168776372 and parameters: {'learning_schedule': 'adadelta', 'no_components': 19, 'loss': 'warp', 'num_epochs': 13, 'learning_rate': 0.3027783986159787}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:14:13,182][0m Trial 21 finished with value: 0.007050632911392405 and parameters: {'learning_schedule': 'adadelta', 'no_components': 25, 'loss': 'warp-kos', 'num_epochs': 20, 'learning_rate': 0.3681681528849433}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:14:32,219][0m Trial 22 finished with value: 0.007267932489451477 and parameters: {'learning_schedule': 'adadelta', 'no_components': 28, 'loss': 'warp-kos', 'num_epochs': 21, 'learning_rate': 0.35545793662831393}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:14:48,621][0m Trial 23 finished with value: 0.006869198312236287 and parameters: {'learning_schedule': 'adadel

[32m[I 2020-11-14 22:21:39,063][0m Trial 49 finished with value: 0.007261603375527426 and parameters: {'learning_schedule': 'adadelta', 'no_components': 24, 'loss': 'warp-kos', 'num_epochs': 23, 'learning_rate': 0.22476530047205406}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:21:49,126][0m Trial 50 finished with value: 0.006867088607594937 and parameters: {'learning_schedule': 'adadelta', 'no_components': 5, 'loss': 'warp', 'num_epochs': 20, 'learning_rate': 0.3952403717656194}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:22:07,539][0m Trial 51 finished with value: 0.007061181434599156 and parameters: {'learning_schedule': 'adadelta', 'no_components': 24, 'loss': 'warp-kos', 'num_epochs': 22, 'learning_rate': 0.22067113771684008}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:22:25,844][0m Trial 52 finished with value: 0.00689240506329114 and parameters: {'learning_schedule': 'adadelt

[32m[I 2020-11-14 22:29:50,353][0m Trial 78 finished with value: 0.006715189873417721 and parameters: {'learning_schedule': 'adadelta', 'no_components': 25, 'loss': 'warp-kos', 'num_epochs': 10, 'learning_rate': 0.2071015532819187}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:30:04,569][0m Trial 79 finished with value: 0.0069113924050632925 and parameters: {'learning_schedule': 'adadelta', 'no_components': 26, 'loss': 'bpr', 'num_epochs': 17, 'learning_rate': 0.26124426034264914}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:30:21,443][0m Trial 80 finished with value: 0.006845991561181435 and parameters: {'learning_schedule': 'adagrad', 'no_components': 21, 'loss': 'warp-kos', 'num_epochs': 21, 'learning_rate': 0.4303745845123337}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:30:30,095][0m Trial 81 finished with value: 0.0074620253164556965 and parameters: {'learning_schedule': 'adadel

[32m[I 2020-11-14 22:34:58,156][0m Trial 107 finished with value: 0.006972573839662447 and parameters: {'learning_schedule': 'adadelta', 'no_components': 12, 'loss': 'warp-kos', 'num_epochs': 7, 'learning_rate': 0.25483377068292384}. Best is trial 90 with value: 0.007824894514767932.[0m
[32m[I 2020-11-14 22:35:06,946][0m Trial 108 finished with value: 0.007236286919831224 and parameters: {'learning_schedule': 'adadelta', 'no_components': 13, 'loss': 'bpr', 'num_epochs': 8, 'learning_rate': 0.28378460442606446}. Best is trial 90 with value: 0.007824894514767932.[0m
[32m[I 2020-11-14 22:35:17,688][0m Trial 109 finished with value: 0.006780590717299578 and parameters: {'learning_schedule': 'adagrad', 'no_components': 15, 'loss': 'warp-kos', 'num_epochs': 10, 'learning_rate': 0.38844235154959333}. Best is trial 90 with value: 0.007824894514767932.[0m
[32m[I 2020-11-14 22:35:26,606][0m Trial 110 finished with value: 0.006999999999999999 and parameters: {'learning_schedule': 'adad

In [260]:
print("Number of finished trials: {}".format(len(study_1.trials)))
print("Best trial:")
trial = study_1.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 120
Best trial:
  Value: 0.007824894514767932
  Params: 
    learning_schedule: adadelta
    no_components: 16
    loss: warp-kos
    num_epochs: 8
    learning_rate: 0.32177834477114553


In [261]:
users_to_predict = [user_encode[x] for x in list(sbmt['user_id'])]
items_to_predict = [item_encode[x] for x in items_to_recom]

KeyError: 1816

In [91]:
model.fit(train_coo, epochs=20)

<lightfm.lightfm.LightFM at 0x11d89f1d0>

In [143]:
user_hist = train_df[train_df['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()

In [93]:
items_to_predict = set(items_to_predict)

In [94]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [96]:
import sys

PATH = '/Users/danil/Documents/github/sweet_RS/'
sys.path.append(str(PATH))

In [98]:
from src.utils import save_to_pickle

In [99]:
save_to_pickle(recoms, '../data/processed/light_fm.pickle')