In [193]:
from lightfm import LightFM
from scipy.sparse import coo_matrix
import pandas as pd

In [194]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
sbmt = pd.read_csv('../data/raw/submission.csv')

In [196]:
user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]

In [197]:
user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month
user_hist_df = user_hist_df[~user_hist_df['month'].isin([6, 7])]

In [198]:
user_hist_df['event'] = 1

In [199]:
user_hist_df.shape

(184263, 6)

In [200]:
user_hist_df = user_hist_df.drop_duplicates(['user_id', 'movie_id', 'event'])
train_df = user_hist_df[['user_id', 'movie_id', 'event']]

In [201]:
user_hist_df.shape

(184263, 6)

In [344]:
params = {'learning_schedule': 'adadelta',
 'no_components': 16,
 'loss': 'warp-kos',
 'num_epochs': 8,
 'learning_rate': 0.32177834477114553}

num_epochs = params.pop('num_epochs')
model = LightFM(**params)

In [202]:
model = LightFM(no_components = 10)

In [203]:
user_encode = {u: i for i, u in enumerate(train_df['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train_df['movie_id'].unique())}

In [204]:
user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

In [205]:
n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(2055, 4063)

In [206]:
train_df['user_id'] = train_df['user_id'].apply(lambda x: user_encode[x])

train_df['movie_id'] = train_df['movie_id'].apply(lambda x: item_encode[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [207]:
train_coo = coo_matrix(
    (train_df['event'], (train_df['user_id'], train_df['movie_id'])), shape=(n_users, n_items))

In [208]:
moveis_views = user_hist_df['movie_id'].value_counts()

items_to_recom = moveis_views[moveis_views>100].index
print(len(items_to_recom))

550


In [209]:
users_to_predict = [user_encode[x] for x in list(sbmt['user_id'])]
items_to_predict = [item_encode[x] for x in items_to_recom]

In [211]:
model.fit(train_coo, epochs=20)

<lightfm.lightfm.LightFM at 0x125148190>

In [212]:
user_hist = train_df[train_df['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()

In [213]:
items_to_predict = set(items_to_predict)

In [215]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [357]:
import sys

PATH = '/Users/danil/Documents/github/sweet_RS/'
sys.path.append(str(PATH))

from src.utils import save_to_pickle

In [358]:
save_to_pickle(recoms, '../data/processed/light_fm_tuned_validate.pickle')

# validate

In [214]:
import ml_metrics as metrics

In [216]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]
user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

test_1 = user_hist_df[user_hist_df['month'] == 6]
test_2 = user_hist_df[user_hist_df['month'] == 7]
train = user_hist_df[~user_hist_df['month'].isin([6, 7])]

In [217]:
correct_1 = test_1.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()
correct_2 = test_2.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()


In [218]:
train['event'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [219]:
train = train.drop_duplicates(['user_id', 'movie_id', 'event'])

In [220]:
model = LightFM(item_alpha = 0.1, no_components=10)

train_df = train[['user_id', 'movie_id', 'event']]

In [221]:
user_encode = {u: i for i, u in enumerate(train['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train['movie_id'].unique())}

In [222]:
user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

In [223]:
n_users, n_items = len(user_encode), len(item_encode)

n_users, n_items

(2055, 4063)

In [224]:
train['user_id'] = train['user_id'].apply(lambda x: user_encode[x])
train['movie_id'] = train['movie_id'].apply(lambda x: item_encode[x])

In [225]:
train_coo = coo_matrix(
    (train['event'], (train['user_id'], train['movie_id'])), shape=(n_users, n_items))

In [226]:
moveis_views = train['movie_id'].value_counts()

items_to_recom = moveis_views[moveis_views>100].index
print(len(items_to_recom))

550


In [227]:
users_to_predict = [user_encode[x] for x in list(test_1['user_id'])]
items_to_predict = set(items_to_recom.copy())

In [188]:
model = LightFM(item_alpha = 0.2, no_components=5)

In [228]:
model.fit(train_coo, epochs=20)

<lightfm.lightfm.LightFM at 0x124b327d0>

In [165]:
movie_features_f = movie_features.astype(np.float32)

In [170]:
from scipy.sparse import csr_matrix

In [172]:
movie_features_csr = csr_matrix(movie_features, dtype = np.float32)

In [189]:
model.fit(train_coo, item_features = movie_features_csr, epochs=20)

<lightfm.lightfm.LightFM at 0x12252d890>

In [174]:
user_hist = train_df[train_df['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()
user_hist = {user_encode[k]: v for k, v in user_hist.items()}

In [229]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    #print(len(items_to_score))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.0019391965255157438

In [251]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    #print(len(items_to_score))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.007202531645569621

In [158]:
users_to_predict = [user_encode[x] for x in list(test_2['user_id'])]

In [159]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [160]:
metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)

0.007573839662447258

on LB 0.00786

after tuning 0.00619

# hyperparam

In [193]:
import optuna

In [212]:
def validate_cv(params):
    num_epochs = params.pop('num_epochs')
    model = LightFM(**params)
    model.fit(train_coo, epochs = num_epochs)
    
    recoms = {}
    num_to_recom = 5
    for user in users_to_predict:
        #print(user)
        items_to_score = list(items_to_predict.difference(user_hist[user]))
        predict = model.predict(user, items_to_score, num_threads=-1)
        top_recoms_id = sorted(range(len(predict)),
                               key=lambda i: predict[i])[-num_to_recom:]
        top_recoms_id.reverse()
        recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                     for i in top_recoms_id]

    score = metrics.mapk(list(recoms.values()), list(correct_1.values()), 5)
    return score


def objective(trial):    
    params = {'learning_schedule': trial.suggest_categorical('learning_schedule', ["adagrad", "adadelta"]),
              'no_components': trial.suggest_int('no_components', 5, 30),
              'loss': trial.suggest_categorical('loss', ["bpr", "warp", "warp-kos"]),
              'num_epochs': trial.suggest_int('num_epochs', 5, 30),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.5)}
    
    
    return validate_cv(params)

In [259]:
#study_1 = optuna.create_study(direction="maximize")
# study = joblib.load('../optuna_checkpoints/study.pkl')
study_1.optimize(objective, n_trials=100)

[32m[I 2020-11-14 22:13:55,314][0m Trial 20 finished with value: 0.006713080168776372 and parameters: {'learning_schedule': 'adadelta', 'no_components': 19, 'loss': 'warp', 'num_epochs': 13, 'learning_rate': 0.3027783986159787}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:14:13,182][0m Trial 21 finished with value: 0.007050632911392405 and parameters: {'learning_schedule': 'adadelta', 'no_components': 25, 'loss': 'warp-kos', 'num_epochs': 20, 'learning_rate': 0.3681681528849433}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:14:32,219][0m Trial 22 finished with value: 0.007267932489451477 and parameters: {'learning_schedule': 'adadelta', 'no_components': 28, 'loss': 'warp-kos', 'num_epochs': 21, 'learning_rate': 0.35545793662831393}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:14:48,621][0m Trial 23 finished with value: 0.006869198312236287 and parameters: {'learning_schedule': 'adadel

[32m[I 2020-11-14 22:21:39,063][0m Trial 49 finished with value: 0.007261603375527426 and parameters: {'learning_schedule': 'adadelta', 'no_components': 24, 'loss': 'warp-kos', 'num_epochs': 23, 'learning_rate': 0.22476530047205406}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:21:49,126][0m Trial 50 finished with value: 0.006867088607594937 and parameters: {'learning_schedule': 'adadelta', 'no_components': 5, 'loss': 'warp', 'num_epochs': 20, 'learning_rate': 0.3952403717656194}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:22:07,539][0m Trial 51 finished with value: 0.007061181434599156 and parameters: {'learning_schedule': 'adadelta', 'no_components': 24, 'loss': 'warp-kos', 'num_epochs': 22, 'learning_rate': 0.22067113771684008}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:22:25,844][0m Trial 52 finished with value: 0.00689240506329114 and parameters: {'learning_schedule': 'adadelt

[32m[I 2020-11-14 22:29:50,353][0m Trial 78 finished with value: 0.006715189873417721 and parameters: {'learning_schedule': 'adadelta', 'no_components': 25, 'loss': 'warp-kos', 'num_epochs': 10, 'learning_rate': 0.2071015532819187}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:30:04,569][0m Trial 79 finished with value: 0.0069113924050632925 and parameters: {'learning_schedule': 'adadelta', 'no_components': 26, 'loss': 'bpr', 'num_epochs': 17, 'learning_rate': 0.26124426034264914}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:30:21,443][0m Trial 80 finished with value: 0.006845991561181435 and parameters: {'learning_schedule': 'adagrad', 'no_components': 21, 'loss': 'warp-kos', 'num_epochs': 21, 'learning_rate': 0.4303745845123337}. Best is trial 18 with value: 0.007588607594936708.[0m
[32m[I 2020-11-14 22:30:30,095][0m Trial 81 finished with value: 0.0074620253164556965 and parameters: {'learning_schedule': 'adadel

[32m[I 2020-11-14 22:34:58,156][0m Trial 107 finished with value: 0.006972573839662447 and parameters: {'learning_schedule': 'adadelta', 'no_components': 12, 'loss': 'warp-kos', 'num_epochs': 7, 'learning_rate': 0.25483377068292384}. Best is trial 90 with value: 0.007824894514767932.[0m
[32m[I 2020-11-14 22:35:06,946][0m Trial 108 finished with value: 0.007236286919831224 and parameters: {'learning_schedule': 'adadelta', 'no_components': 13, 'loss': 'bpr', 'num_epochs': 8, 'learning_rate': 0.28378460442606446}. Best is trial 90 with value: 0.007824894514767932.[0m
[32m[I 2020-11-14 22:35:17,688][0m Trial 109 finished with value: 0.006780590717299578 and parameters: {'learning_schedule': 'adagrad', 'no_components': 15, 'loss': 'warp-kos', 'num_epochs': 10, 'learning_rate': 0.38844235154959333}. Best is trial 90 with value: 0.007824894514767932.[0m
[32m[I 2020-11-14 22:35:26,606][0m Trial 110 finished with value: 0.006999999999999999 and parameters: {'learning_schedule': 'adad

In [260]:
print("Number of finished trials: {}".format(len(study_1.trials)))
print("Best trial:")
trial = study_1.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 120
Best trial:
  Value: 0.007824894514767932
  Params: 
    learning_schedule: adadelta
    no_components: 16
    loss: warp-kos
    num_epochs: 8
    learning_rate: 0.32177834477114553


In [261]:
users_to_predict = [user_encode[x] for x in list(sbmt['user_id'])]
items_to_predict = [item_encode[x] for x in items_to_recom]

KeyError: 1816

In [91]:
model.fit(train_coo, epochs=20)

<lightfm.lightfm.LightFM at 0x11d89f1d0>

In [143]:
user_hist = train_df[train_df['movie_id'] != 0].groupby('user_id')['movie_id'].apply(set).to_dict()

In [93]:
items_to_predict = set(items_to_predict)

In [94]:
recoms = {}
num_to_recom = 5
for user in users_to_predict:
    #print(user)
    items_to_score = list(items_to_predict.difference(user_hist[user]))
    predict = model.predict(user, items_to_score, num_threads=-1)
    top_recoms_id = sorted(range(len(predict)),
                           key=lambda i: predict[i])[-num_to_recom:]
    top_recoms_id.reverse()
    recoms[user_decode[user]] = [item_decode[items_to_score[i]]
                                 for i in top_recoms_id]

In [96]:
import sys

PATH = '/Users/danil/Documents/github/sweet_RS/'
sys.path.append(str(PATH))

In [98]:
from src.utils import save_to_pickle

In [99]:
save_to_pickle(recoms, '../data/processed/light_fm.pickle')

# add movie features

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [92]:
movies_database = pd.read_csv('../data/raw/movies.csv')
movies_database.head()

Unnamed: 0,id,year,title,description,genres,director,actors,writers,music,art,producer,imdb_id,tmdb_id,imdb_rating,tmdb_rating,available_now
0,1,2008,The Girl from Monaco,A brilliant and neurotic attorney goes to Mona...,"Comedy,Drama",Anne Fontaine,"Fabrice Luchini,Helene de Saint-Pere,Jeanne Ba...","Anne Fontaine,Benoît Graffin",Philippe Rombi,,"Philippe Carcassonne,Bruno Pesary,Christine Ra...",tt1139800,15342.0,5.8,4.8,yes
1,2,2008,Every Jack has a Jill,Jack is encouraged to take the romantic Paris ...,"Comedy,Melodrama",Jennifer Devoldère,"Justin Lee Bartha,Billy Boyd,Maurice Bénichou,...",Jennifer Devoldère,,"Hervé Gallet,Alix Deschamps","Bruno Chiche,Nicolas Duval Adassovsky,Maxime R...",tt1094668,32338.0,6.1,5.4,yes
2,3,2009,Dorian Gray,A vain London playboy offers his soul in excha...,"Thriller,Sci-Fi",Oliver Parker,"Colin Firth,Nathan Rosen,Caroline Goodall,John...","Toby Finlay,Oscar Wilde",Charlie Mole,"John Beard,Ruth Myers,Niamh Coulter","Barnaby Thompson,Paul Brett,Simon Fawcett",tt1235124,23488.0,6.3,5.9,yes
3,4,2008,How to Lose Friends & Alienate People,A British writer struggles to fit in at a high...,"Comedy,Drama",Robert B. Willow,"Simon Pegg,Jeff Bridges,Danny Huston,Jillian A...","Peter Straughan,Toby Young",David Arnold,"John Beard,Ray Chan,Anthony Gasparro","Elizabeth Karlsen,Laurie Borg,Stephen Woolley",tt0455538,13092.0,6.4,6.1,yes
4,5,2009,Cell 211,The story of two men on different sides of a p...,"Action,Thriller",Daniel Monzon,"Luis Tosar,Vicente Romero,Fernando Soto,Luis Z...","Daniel Monzon,F.P. Gandull,Jorge Guerricaechev...",Roque Baños,"Antón Laguna,Montse Sancho","Álvaro Augustín,Juan Gordon,Emma Lustres",tt1242422,33273.0,7.6,7.4,yes


In [94]:
movies_database = movies_database[movies_database['id'].isin(train_df['movie_id'].unique())]
item_features = movies_database[['id', 'year', 'genres', 'imdb_rating', 'tmdb_rating']]
item_features.shape

(4298, 16)

In [96]:
def convert_year(val):
    if val < 2000:
        cat = 0
    elif 2000<=val < 2010:
        cat = 1
    else:
        cat = 2
    return cat

item_features['year'] = item_features['year'].apply(convert_year)

user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
movies = user_hist_df[user_hist_df['episode_id'] == 0]['movie_id'].unique()

item_features['movie'] = item_features['id'].apply(lambda x: 1 if x in movies else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [97]:
test = item_features['genres'].fillna('hz').str.split(',')

mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(test),
                   columns=mlb.classes_,
                   index=test.index)

to_drop = ['Art House', 'Documentary', 'Family','Musical', 'Quarantine Info', 'hz', 'War']
res = res.drop(to_drop, 1)

In [98]:
item_features = pd.concat([item_features, res], 1).drop('genres', 1)

In [100]:
item_features = item_features.fillna(item_features.mean())

In [101]:
item_features['id'] = item_features['id'].apply(lambda x: item_encode[x])

In [109]:
missed_movies = set(item_decode.keys()).difference(set(item_features['id'].values))

item_features = item_features.sort_values('id').reset_index().drop('index', 1)

In [123]:
ind = 4298
for _id in missed_movies:
    item_features.loc[ind] = [_id] + [-1]*21
    ind+=1
    
item_features = item_features.sort_values('id')

In [139]:
movie_features = item_features.drop('id', 1).values

In [140]:
movie_features.shape

(4732, 21)