https://github.com/massquantity/LibRecommender

In [1]:
import implicit
from scipy.sparse import coo_matrix

import pandas as pd
import ml_metrics as metrics

import lightgbm as lgb
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import random
from collections import Counter


In [4]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')

user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

test_1 = user_hist_df[user_hist_df['month'] == 6]
test_2 = user_hist_df[user_hist_df['month'] == 7]
user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]
train = user_hist_df[~user_hist_df['month'].isin([6, 7])]

In [5]:
correct_1 = test_1.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()
correct_2 = test_2.drop_duplicates(['user_id', 'movie_id']).groupby('user_id')['movie_id'].apply(list).to_dict()


In [6]:
train['event'] = 1
train = train[['user_id', 'movie_id', 'event']].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
user_encode = {u: i for i, u in enumerate(train['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train['movie_id'].unique())}

user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

n_users, n_items = len(user_encode), len(item_encode)

train['user_id'] = train['user_id'].apply(lambda x: user_encode[x])
train['movie_id'] = train['movie_id'].apply(lambda x: item_encode[x])

In [8]:
item_user = coo_matrix(
    (train['event'], (train['user_id'], train['movie_id'])), shape=(n_users, n_items))

user_item = coo_matrix(
    (train['event'], (train['movie_id'], train['user_id'])), shape=(n_items, n_users))

In [9]:
user_item.shape

(4063, 2055)

In [10]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=5, iterations= 15)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(user_item, show_progress=False)



In [11]:
movies_database = pd.read_csv('../data/raw/movies.csv')
useless_movies = movies_database[(movies_database['year'] < 2010) & 
                                 (~movies_database['id'].isin(list(item_encode.keys()))) &
                                 (movies_database['imdb_rating'] < 6)]['id'].values

useless_movies = [m for m in useless_movies if m in item_decode]

In [12]:
# als
res = model.recommend_all(item_user, N = 100, filter_items = useless_movies, show_progress=False)

In [13]:
recom = {}

for ind, rec in enumerate(res):
    recom[user_decode[ind]] = [item_decode[x] for x in rec]

In [14]:
# als
reco = []
corr = []

num_to_recom = 5
for user in correct_1:
    try:
        reco.append(recom[user])
        corr.append(correct_1[user])
    except:
        print(user)
        pass

metrics.mapk(reco, corr, 5)

0.14181012658227848

# LGBM

In [16]:
user_features = model.user_factors

get item features

In [17]:



movies_database = pd.read_csv('../data/raw/movies.csv')

# movies_database = movies_database[movies_database['id'].isin(list(item_encode.keys()))]
item_features = movies_database[['id', 'year', 'genres', 'imdb_rating', 'tmdb_rating']]

def convert_year(val):
    if val < 2000:
        cat = 0
    elif 2000<=val < 2010:
        cat = 1
    else:
        cat = 2
    return cat

item_features['year'] = item_features['year'].apply(convert_year)

user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
movies = user_hist_df[user_hist_df['episode_id'] == 0]['movie_id'].unique()

item_features['movie'] = item_features['id'].apply(lambda x: 1 if x in movies else 0 )

test = item_features['genres'].fillna('hz').str.split(',')

mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(test),
                   columns=mlb.classes_,
                   index=test.index)

to_drop = ['Art House', 'Documentary', 'Family','Musical', 'Quarantine Info', 'hz', 'War']
res = res.drop(to_drop, 1)

item_features = pd.concat([item_features, res], 1).drop('genres', 1)

item_features = item_features.fillna(item_features.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
item_features = item_features.drop('movie', 1)

In [19]:
user_features_df = pd.DataFrame(user_features)
user_features_df['user_id'] = user_encode.values()

In [20]:
rank_df = pd.merge(train, user_features_df, how='left', on='user_id')

In [21]:
rank_df = pd.merge(rank_df, item_features, how = 'left', 
                   left_on ='movie_id', 
                   right_on='id')

In [22]:
rank_df.shape

(184263, 34)

In [23]:
rank_df = rank_df.dropna()

generate negative examples

2. from movies found 20 films from other genres 
3. and old ones, 
4. with small rating

In [24]:
user_hist = train.groupby('user_id')['movie_id'].apply(set)
users_to_predict = train['user_id'].unique()

In [25]:
movie_genres = movies_database[['id', 'genres']]
movie_genres['genres'] = movie_genres['genres'].str.split(',')
#movie_genres['id'] = movie_genres['id'].apply(lambda x: item_encode.get(x, x + 1000000000) )

movie_genres_dict = {}

for i in movie_genres.iterrows():
    movie_genres_dict[i[1][0]] = i[1][1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
movie_genres_dict = {k: v for k, v in movie_genres_dict.items() if isinstance(v, list)}

вот тут еще можно поиграться с кол вом жанров которые нужно учесть

In [28]:
user_pref = {}
for user in users_to_predict:
    #print(user)
    user_pref[user] = {}
    #print(user_hist[user])
    for film in user_hist[user]:
        film = item_decode[film]
        if movie_genres_dict.get(film):
            for genre in movie_genres_dict[film]:
                if user_pref[user].get(genre):
                    user_pref[user][genre] += 1
                else:
                    user_pref[user][genre] = 1
                    
    user_pref[user] = [key for key, val in Counter(user_pref[user]).most_common(5)]                



In [30]:
neg_examples = 20

In [31]:
user_negative = {}

for user in user_pref:
    negative = []
    current_user_pref = set(user_pref[user])
    
    num = 0
    while num < neg_examples:
        random_film = random.sample(list(movie_genres_dict), 1)[0]
        curr_film_genres = set(movie_genres_dict[random_film])
        if len(current_user_pref.intersection(curr_film_genres)) == 0:
            negative.append(random_film)
            num += 1
            
    user_negative[user] = negative
      

In [32]:
users = []
films = []


for user in user_negative:
    users += [user] * neg_examples
    films += user_negative[user]
    
    
negative_df = pd.DataFrame()
negative_df['user_id'] = users
negative_df['movie_id'] = films

In [33]:
negative_df = pd.merge(negative_df, user_features_df, how='left', on='user_id')

In [34]:
negative_df = pd.merge(negative_df, item_features, how = 'left', 
                       left_on ='movie_id', 
                       right_on='id')
negative_df['event'] = 0

In [35]:
negative_df.shape

(41100, 34)

In [36]:
rank_df.shape

(182074, 34)

In [37]:
#X = pd.concat([negative_df[:10000], rank_df[:10000]], 0)
X = pd.concat([negative_df[:], rank_df[:]], 0)

In [53]:
X.iloc[:, 11:] = X.iloc[:, 11:].astype('int8')
X.iloc[:, 2:7] = X.iloc[:, 2:7].astype('float16')

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.drop(['user_id', 'movie_id', 'id', 'event'], 1), 
                                                    X['event'], test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [44]:
X_train = X_train[:50000]
y_train = y_train[:50000]

In [56]:
query_train = [X_train.shape[0]]
query_val = [X_val.shape[0]]
query_test = [X_test.shape[0]]

In [57]:
X_train.shape

(142831, 30)

In [62]:
del gbm

In [58]:
gbm = lgb.LGBMRanker(max_depth = 5, n_estimators = 20, random_state = 42, 
                    n_jobs=4)

In [59]:
gbm.fit(X_train, y_train, group=query_train,
        eval_set=[(X_val, y_val)], eval_group=[query_val],
        eval_at=[5, 10], early_stopping_rounds=50)

[1]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[3]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[4]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[5]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[6]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[7]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[8]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[9]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[10]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[11]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[12]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[13]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[14]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[15]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[16]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[17]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[18]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[19]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
[20]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1
Did not meet early stopping.

LGBMRanker(max_depth=5, n_estimators=20, n_jobs=4, random_state=42)

validate

In [60]:
users = []
films = []


for user in recom:
    users += [user_encode[user]] * 100
    films += recom[user]
    
validate_df = pd.DataFrame()
validate_df['user_id'] = users
validate_df['movie_id'] = films

In [61]:
validate_df = pd.merge(validate_df, user_features_df, how='left', on='user_id')

validate_df = pd.merge(validate_df, item_features, how = 'left', 
                       left_on ='movie_id', 
                       right_on='id')


validate_df = validate_df.dropna()


In [62]:
test_pred = gbm.predict(validate_df.drop(['user_id', 'movie_id', 'id'], 1))
validate_df['pred'] = test_pred

In [63]:
g = validate_df.groupby(['user_id']).apply(lambda x: x.nlargest(5,['pred'])).reset_index(drop=True)


In [64]:
recoms_wl = g.groupby('user_id')['movie_id'].apply(list)

In [65]:
# als
reco = []
corr = []

num_to_recom = 5
for user in correct_1:
    reco.append(recoms_wl[user_encode[user]])
    corr.append(correct_1[user])

metrics.mapk(reco, corr, 5)

0.020493670886075947

# inference

In [4]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')

user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]
train = train = user_hist_df[~user_hist_df['month'].isin([10, 11])]

In [5]:
user_hist_df['ts'] = pd.to_datetime(user_hist_df['ts'])
user_hist_df['month'] = user_hist_df['ts'].dt.month

test_1 = user_hist_df[user_hist_df['month'] == 6]
test_2 = user_hist_df[user_hist_df['month'] == 7]
user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]
train = train = user_hist_df[~user_hist_df['month'].isin([6, 7])]

In [6]:
train['event'] = 1
train = train[['user_id', 'movie_id', 'event']].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
user_encode = {u: i for i, u in enumerate(train['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train['movie_id'].unique())}

user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

n_users, n_items = len(user_encode), len(item_encode)

train['user_id'] = train['user_id'].apply(lambda x: user_encode[x])
train['movie_id'] = train['movie_id'].apply(lambda x: item_encode[x])

In [8]:
item_user = coo_matrix(
    (train['event'], (train['user_id'], train['movie_id'])), shape=(n_users, n_items))

user_item = coo_matrix(
    (train['event'], (train['movie_id'], train['user_id'])), shape=(n_items, n_users))

In [9]:
user_item.shape

(4063, 2055)

In [10]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=5, iterations= 15)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(user_item, show_progress=False)



In [11]:
movies_database = pd.read_csv('../data/raw/movies.csv')
useless_movies = movies_database[(movies_database['year'] < 2010) & 
                                 (~movies_database['id'].isin(list(item_encode.keys()))) &
                                 (movies_database['imdb_rating'] < 6)]['id'].values

useless_movies = [m for m in useless_movies if m in item_decode]

In [12]:
# als
res = model.recommend_all(item_user, N = 100, filter_items = useless_movies, show_progress=False)

In [13]:
recom = {}

for ind, rec in enumerate(res):
    recom[user_decode[ind]] = [item_decode[x] for x in rec]

# LGBM

In [14]:
user_features = model.user_factors

get item features

In [15]:
movies_database = pd.read_csv('../data/raw/movies.csv')

# movies_database = movies_database[movies_database['id'].isin(list(item_encode.keys()))]
item_features = movies_database[['id', 'year', 'genres', 'imdb_rating', 'tmdb_rating']]

def convert_year(val):
    if val < 2000:
        cat = 0
    elif 2000<=val < 2010:
        cat = 1
    else:
        cat = 2
    return cat

item_features['year'] = item_features['year'].apply(convert_year)

user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
movies = user_hist_df[user_hist_df['episode_id'] == 0]['movie_id'].unique()

item_features['movie'] = item_features['id'].apply(lambda x: 1 if x in movies else 0 )

test = item_features['genres'].fillna('hz').str.split(',')

mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(test),
                   columns=mlb.classes_,
                   index=test.index)

to_drop = ['Art House', 'Documentary', 'Family','Musical', 'Quarantine Info', 'hz', 'War']
res = res.drop(to_drop, 1)

item_features = pd.concat([item_features, res], 1).drop('genres', 1)

item_features = item_features.fillna(item_features.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
item_features = item_features.drop('movie', 1)

In [17]:
user_features_df = pd.DataFrame(user_features)
user_features_df['user_id'] = user_encode.values()

In [18]:
rank_df = pd.merge(train, user_features_df, how='left', on='user_id')

In [19]:
rank_df = pd.merge(rank_df, item_features, how = 'left', 
                   left_on ='movie_id', 
                   right_on='id')

In [20]:
rank_df.shape

(184263, 34)

In [21]:
rank_df = rank_df.dropna()

generate negative examples

In [22]:
user_hist = train.groupby('user_id')['movie_id'].apply(set)
users_to_predict = train['user_id'].unique()

In [23]:
movie_genres = movies_database[['id', 'genres']]
movie_genres['genres'] = movie_genres['genres'].str.split(',')
#movie_genres['id'] = movie_genres['id'].apply(lambda x: item_encode.get(x, x + 1000000000) )

movie_genres_dict = {}

for i in movie_genres.iterrows():
    movie_genres_dict[i[1][0]] = i[1][1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
movie_genres_dict = {k: v for k, v in movie_genres_dict.items() if isinstance(v, list)}

вот тут еще можно поиграться с кол вом жанров которые нужно учесть

In [25]:
user_pref = {}
for user in users_to_predict:
    #print(user)
    user_pref[user] = {}
    #print(user_hist[user])
    for film in user_hist[user]:
        film = item_decode[film]
        if movie_genres_dict.get(film):
            for genre in movie_genres_dict[film]:
                if user_pref[user].get(genre):
                    user_pref[user][genre] += 1
                else:
                    user_pref[user][genre] = 1
                    
    user_pref[user] = [key for key, val in Counter(user_pref[user]).most_common(5)]                



In [26]:
neg_examples = 50

In [27]:
user_negative = {}

for user in user_pref:
    negative = []
    current_user_pref = set(user_pref[user])
    
    num = 0
    while num < neg_examples:
        random_film = random.sample(list(movie_genres_dict), 1)[0]
        curr_film_genres = set(movie_genres_dict[random_film])
        if len(current_user_pref.intersection(curr_film_genres)) == 0:
            negative.append(random_film)
            num += 1
            
    user_negative[user] = negative
      

In [28]:
users = []
films = []


for user in user_negative:
    users += [user] * neg_examples
    films += user_negative[user]
    
    
negative_df = pd.DataFrame()
negative_df['user_id'] = users
negative_df['movie_id'] = films

In [29]:
negative_df = pd.merge(negative_df, user_features_df, how='left', on='user_id')

In [30]:
negative_df = pd.merge(negative_df, item_features, how = 'left', 
                       left_on ='movie_id', 
                       right_on='id')
negative_df['event'] = 0

In [31]:
negative_df.shape

(102750, 34)

In [32]:
rank_df.shape

(182074, 34)

In [33]:
#X = pd.concat([negative_df[:10000], rank_df[:10000]], 0)
X = pd.concat([negative_df[:], rank_df[:]], 0)

In [34]:
X.iloc[:, 11:] = X.iloc[:, 11:].astype('int8')
X.iloc[:, 2:7] = X.iloc[:, 2:7].astype('float16')

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.drop(['user_id', 'movie_id', 'id', 'event'], 1), 
                                                    X['event'], test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [37]:
y_train = y_train[:100000]
X_train = X_train[:100000]

In [38]:
query_train = [X_train.shape[0]]
query_val = [X_val.shape[0]]
query_test = [X_test.shape[0]]

In [62]:
del gbm

In [39]:
gbm = lgb.LGBMRanker(max_depth = 5, n_estimators = 20, random_state = 42, 
                    n_jobs=4)

In [None]:
gbm.fit(X_train, y_train, group=query_train,
        eval_set=[(X_val, y_val)], eval_group=[query_val],
        eval_at=[5, 10], early_stopping_rounds=50)



validate

In [60]:
users = []
films = []


for user in recom:
    users += [user_encode[user]] * 100
    films += recom[user]
    
validate_df = pd.DataFrame()
validate_df['user_id'] = users
validate_df['movie_id'] = films

In [61]:
validate_df = pd.merge(validate_df, user_features_df, how='left', on='user_id')

validate_df = pd.merge(validate_df, item_features, how = 'left', 
                       left_on ='movie_id', 
                       right_on='id')


validate_df = validate_df.dropna()


In [62]:
test_pred = gbm.predict(validate_df.drop(['user_id', 'movie_id', 'id'], 1))
validate_df['pred'] = test_pred

In [63]:
g = validate_df.groupby(['user_id']).apply(lambda x: x.nlargest(5,['pred'])).reset_index(drop=True)


In [64]:
recoms_wl = g.groupby('user_id')['movie_id'].apply(list)

In [65]:
# als
reco = []
corr = []

num_to_recom = 5
for user in correct_1:
    reco.append(recoms_wl[user_encode[user]])
    corr.append(correct_1[user])

metrics.mapk(reco, corr, 5)

0.020493670886075947

In [263]:
user_hist_df = pd.read_csv('../data/raw/movies_dataset_10 months.csv')
user_hist_df = user_hist_df[user_hist_df['episode_id'] == 0]

user_hist_df['event'] = 1
train = user_hist_df[['user_id', 'movie_id', 'event']].drop_duplicates()

In [264]:
user_encode = {u: i for i, u in enumerate(train['user_id'].unique())}
item_encode = {u: i for i, u in enumerate(train['movie_id'].unique())}

user_decode = {v: k for k, v in user_encode.items()}
item_decode = {v: k for k, v in item_encode.items()}

n_users, n_items = len(user_encode), len(item_encode)

train['user_id'] = train['user_id'].apply(lambda x: user_encode[x])
train['movie_id'] = train['movie_id'].apply(lambda x: item_encode[x])

item_user = coo_matrix(
    (train['event'], (train['user_id'], train['movie_id'])), shape=(n_users, n_items))

user_item = coo_matrix(
    (train['event'], (train['movie_id'], train['user_id'])), shape=(n_items, n_users))

In [265]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=5)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(user_item, show_progress=False)

In [116]:
model = implicit.lmf.LogisticMatrixFactorization(factors = 5, random_state = 42)

model.fit(user_item, show_progress=False)

In [267]:
res = model.recommend_all(item_user, N = 10, filter_items = useless_movies, show_progress=False)

In [129]:
res = model.recommend_all(item_user, N = 5, show_progress=False)

In [268]:
recom = {}

for ind, rec in enumerate(res):
    recom[user_decode[ind]] = [item_decode[x] for x in rec]

In [269]:
import sys

PATH = '/Users/danil/Documents/github/sweet_RS/'
sys.path.append(str(PATH))

from src.utils import save_to_pickle

In [270]:
save_to_pickle(recom, '../data/processed/als_2_10.pickle')