In [19]:
import tqdm
import implicit
import lightfm
import pandas as pd 
import numpy as np
from implicit.bpr import BayesianPersonalizedRanking as BPR
from scipy import sparse
import orion_recommend
from orion_recommend.evaluate import metrics
import lightfm
import evall
from lightfm import cross_validation
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

In [2]:
# Functions
def metrics_user_rec(item, sim_users, k):
    count = 0
    for i, user_list in zip(item, sim_users):       
        for u in user_list:
            if ui_matrix[u,i] == 1:
                count = count + 1            
    p_at_k = round(count/(item.shape[0] * k), 4)

    RS = []
    ans = 0.0
    for i, user_list in zip(item, sim_users):           
        r=[]
        for u in user_list:
             r.append(ui_matrix[u][i])
        ans = ans + evall.ndcg_at_k(r, k, method=1)
        RS.append(r)
    G_at_k = ans/item.shape[0]
    M_at_k = evall.mean_average_precision(RS)
    return p_at_k, G_at_k, M_at_k

def metrics_item_rec(user, sim_items, k):
    count = 0
    for u, item_list in zip(user, sim_items):       
        for i in item_list:
            if ui_matrix[u,i] == 1:
                count = count + 1            
    p_at_k = round(count/(user.shape[0] * k), 4)

    RS = []
    ans = 0.0
    for u, item_list in zip(user, sim_items):           
        r=[]
        for i in item_list:
             r.append(ui_matrix[u][i])
        ans = ans + evall.ndcg_at_k(r, k, method=1)
        RS.append(r)
    G_at_k = ans/user.shape[0]
    M_at_k = evall.mean_average_precision(RS)
    return p_at_k, G_at_k, M_at_k


def test_model_item_user(model, test, k, hybrid, item_features, user_features):
    rec_dic = {}
    if hybrid ==False:
        with tqdm.tqdm(total=len(test_items)) as progress:
            for u in test_items:
                user_preds = model.predict(np.repeat(u,ui_matrixT.shape[1]),np.array([i for i in range(ui_matrixT.shape[1])]))
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)
    else:
        with tqdm.tqdm(total=len(test_items)) as progress:
            for u in test_items:
                user_preds = model.predict(np.repeat(u,ui_matrixT.shape[1]),np.array([i for i in range(ui_matrixT.shape[1])]),
                                           item_features = user_features, user_features = item_features)
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)

    targets =  np.array([i for i in rec_dic.keys()])
    recommended = np.array([i for i in rec_dic.values()])
    metrics = metrics_user_rec(targets, recommended, k)
    return metrics

def test_model(model, test, k, hybrid, item_features, user_features):
    rec_dic = {}
    if hybrid ==False:
        with tqdm.tqdm(total=len(test_users)) as progress:
            for u in test_users:
                user_preds = model.predict(np.repeat(u,ui_matrix.shape[1]),np.array([i for i in range(ui_matrix.shape[1])]))
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)
    else:
        with tqdm.tqdm(total=len(test_users)) as progress:
            for u in test_users:
                user_preds = model.predict(np.repeat(u,ui_matrix.shape[1]),np.array([i for i in range(ui_matrix.shape[1])]), user_features = user_features,
                                           item_features = item_features)
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)

    targets =  np.array([i for i in rec_dic.keys()])
    recommended = np.array([i for i in rec_dic.values()])
    metrics = metrics_item_rec(targets, recommended, k)
    return metrics

def test_model_item_user_nousers(model, test, k, hybrid, item_features):
    rec_dic = {}
    if hybrid ==False:
        with tqdm.tqdm(total=len(test_items)) as progress:
            for u in test_items:
                user_preds = model.predict(np.repeat(u,ui_matrixT.shape[1]),np.array([i for i in range(ui_matrixT.shape[1])]))
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)
    else:
        with tqdm.tqdm(total=len(test_items)) as progress:
            for u in test_items:
                user_preds = model.predict(np.repeat(u,ui_matrixT.shape[1]),np.array([i for i in range(ui_matrixT.shape[1])]),
                                            user_features = item_features)
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)

    targets =  np.array([i for i in rec_dic.keys()])
    recommended = np.array([i for i in rec_dic.values()])
    metrics = metrics_user_rec(targets, recommended, k)
    return metrics

def test_model_nousers(model, test, k, hybrid, item_features):
    rec_dic = {}
    if hybrid ==False:
        with tqdm.tqdm(total=len(test_users)) as progress:
            for u in test_users:
                user_preds = model.predict(np.repeat(u,ui_matrix.shape[1]),np.array([i for i in range(ui_matrix.shape[1])]))
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)
    else:
        with tqdm.tqdm(total=len(test_users)) as progress:
            for u in test_users:
                user_preds = model.predict(np.repeat(u,ui_matrix.shape[1]),np.array([i for i in range(ui_matrix.shape[1])]),
                                           item_features = item_features)
                top_k = np.argsort(user_preds)[-k:]
                rec_dic[u] = top_k
                progress.update(1)

    targets =  np.array([i for i in rec_dic.keys()])
    recommended = np.array([i for i in rec_dic.values()])
    metrics = metrics_item_rec(targets, recommended, k)
    return metrics


In [3]:
ui_matrix = np.load("fa_ui_matrix.npy")
ui_matrixT = ui_matrix.T

In [39]:
ua_matrix = np.load("new_fa_ua_matrix.npy")

In [4]:
train, test = cross_validation.random_train_test_split(sparse.coo_matrix(ui_matrix), test_percentage=0.2, random_state=None)

In [5]:
train_users = train.nonzero()[0]
test_users = test.nonzero()[0]
train_items = train.nonzero()[1]
test_items = test.nonzero()[1]

In [36]:
user_features = np.load("user_attributes_npc.npy")
user_features = sparse.coo_matrix(user_features)

item_features = np.load("new_fa_ia_matrix.npy")
item_features = sparse.coo_matrix(item_features)

# User-Item

In [55]:
# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(train, user_features = user_features,
          item_features=item_features, epochs=100)

#mf1 = LightFM(loss='bpr')
#mf1.fit(train, epochs=100)

<lightfm.lightfm.LightFM at 0x7f32feb1c400>

In [39]:
metrics_bpr_20 = test_model(mf1,test,20, False, item_features, user_features)
pd.Series(metrics_bpr_20).to_csv("baselines_user-item/metrics_bpr_20.csv")

100%|██████████| 18884/18884 [00:45<00:00, 412.11it/s]


In [40]:
metrics_bpr_20

(0.005, 0.024074260751620957, 0.0073390213171257036)

In [41]:
metrics_bpr_10 = test_model(mf1,test,10, False, item_features, user_features)
pd.Series(metrics_bpr_10).to_csv("baselines_user-item/metrics_bpr_10.csv")

100%|██████████| 18884/18884 [00:45<00:00, 413.09it/s]


In [42]:
metrics_bpr_10

(0.0087, 0.027582650438547154, 0.01212330700865543)

In [56]:
metrics = test_model(model,test,10, True, item_features, user_features)
pd.Series(metrics).to_csv("baselines_user-item/metrics_hybrid_10_pretrain.csv")

100%|██████████| 18884/18884 [14:42<00:00, 21.41it/s]


In [57]:
metrics

(0.0053, 0.019302018222776864, 0.009440314830427994)

In [58]:
metrics = test_model(model,test,20, True, item_features, user_features)
pd.Series(metrics).to_csv("baselines_item-user/metrics_hybrid_20_pretrain.csv")

100%|██████████| 18884/18884 [14:45<00:00, 21.32it/s]


In [59]:
metrics

(0.0032, 0.016551289055875128, 0.005065788566381225)

# Item-User

In [54]:
# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(train.T, user_features = item_features,
          item_features=user_features, epochs=100)

mf1 = LightFM(loss='bpr')
mf1.fit(train.T, epochs=100)

<lightfm.lightfm.LightFM at 0x7f6aba0489d0>

In [57]:
metrics_hybrid_20 = test_model_item_user(model,test,20, hybrid=True)
pd.Series(metrics_hybrid_20).to_csv("baselines_item-user/metrics_hybrid_20.csv")

100%|██████████| 18884/18884 [13:32<00:00, 23.24it/s]


In [58]:
metrics_hybrid_20

(0.0008, 0.0042951750676738995, 0.0018982227388538776)

In [59]:
metrics_hybrid_10 = test_model_item_user(model,test,10, hybrid=True)
pd.Series(metrics).to_csv("baselines_item-user/metrics_hybrid_10.csv")

100%|██████████| 18884/18884 [13:28<00:00, 23.35it/s]


In [60]:
metrics_hybrid_10

(0.0009, 0.004076232355578079, 0.002780871691677867)

In [61]:
metrics_bpr_20 = test_model_item_user(mf1,test,20, hybrid=False)
pd.Series(metrics_bpr_20).to_csv("baselines_item-user/metrics_bpr_20.csv")

100%|██████████| 18884/18884 [12:20<00:00, 25.49it/s]


In [62]:
metrics_bpr_20

(0.227, 0.42922481738052554, 0.2765190773955193)

In [63]:
metrics_bpr_10 = test_model_item_user(mf1,test,10, hybrid=False)
pd.Series(metrics_bpr_10).to_csv("baselines_item-user/metrics_bpr_10.csv")

100%|██████████| 18884/18884 [12:19<00:00, 25.52it/s]


In [64]:
metrics_bpr_10

(0.2397, 0.4574309635938121, 0.3546278444167544)

# Pre trained 

In [6]:
G_prod_embs = np.load("embeddings/Product_embs.npy")
G_brand_embs = np.load("embeddings/brand_embs.npy")
G_category_embs = np.load("embeddings/category_embs.npy")
G_colour_embs = np.load("embeddings/colour_embs.npy")
G_div_embs = np.load("embeddings/div_embs.npy")
G_itemcat_embs = np.load("embeddings/itemcat_embs.npy")
G_itemfam_embs = np.load("embeddings/itemfam_embs.npy")
G_season_embs = np.load("embeddings/season_embs.npy")


In [8]:
train_codes = np.load("new_fa_train_data.npy")
test_codes = np.load("new_fa_test_data.npy")

In [9]:
codes = np.concatenate([train_codes, test_codes],axis=0)

In [10]:
item_attributes = codes[:,6:]

In [11]:
brand_id = item_attributes[:,0]
category = item_attributes[:,1]
colour = item_attributes[:,2]
divisioncode = item_attributes[:,3]
itemcategorycode = item_attributes[:,4]
itemfamilycode  = item_attributes[:,5]
itemseason = item_attributes[:,6]
productgroup = item_attributes[:,7]

In [12]:
item_codes = np.concatenate([brand_id.reshape(-1,1), category.reshape(-1,1), colour.reshape(-1,1), divisioncode.reshape(-1,1),\
                itemcategorycode.reshape(-1,1), itemfamilycode.reshape(-1,1), itemseason.reshape(-1,1), productgroup.reshape(-1,1) ], axis=1)

In [13]:
item_codes = np.unique(item_codes,axis=0)

In [14]:
embs = [G_prod_embs[item_codes[:,7]],
G_brand_embs[item_codes[:,0]],
G_category_embs[item_codes[:,1]],
G_colour_embs[item_codes[:,2]],
G_div_embs[item_codes[:,3]],
G_itemcat_embs[item_codes[:,4]],
G_itemfam_embs[item_codes[:,5]],
G_season_embs[item_codes[:,6]]]

In [15]:
gan_item_features = np.concatenate(embs, axis=1)


In [16]:
sparse_feats = sparse.csr_matrix(gan_item_features)

In [78]:
np.savetxt("embeddings/item_embeddings_50", gan_item_features)

In [140]:
from lightfm.pretrain_lightfm import LightFM as ptmod

ptmodel= ptmod(loss='warp',no_components=400, item_pretrain=True, item_pretrain_file="embeddings/item_embeddings_50")
ptmodel.fit(train, epochs=100)


Pre-Train Item Embedding Lunch.
Pre-Train Item Embedding Finished.


<lightfm.pretrain_lightfm.LightFM at 0x7f32fd77bee0>

In [163]:
# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(train.T, user_features = item_features, epochs=100)

<lightfm.lightfm.LightFM at 0x7f32fd77b370>

In [164]:
model.user_embeddings.shape

(472, 10)

In [165]:
model.item_embeddings.shape

(86297, 10)

In [168]:
model.user_embeddings = gan_item_features

In [169]:
metrics_hybrid_20 = test_model_item_user_nousers(model,test,20, True, sparse_feats)
#pd.Series(metrics_hybrid_20).to_csv("baselines_item-user/metrics_hybrid_20.csv")

100%|██████████| 18884/18884 [2:06:38<00:00,  2.49it/s]  


In [170]:
metrics_hybrid_20

(0.0005, 0.0031449086716142743, 0.0014489530873678807)

In [23]:
metrics_hybrid_10 = test_model_item_user_nousers(model,test,10, hybrid=True, item_features= sparse_feats)
#pd.Series(metrics).to_csv("baselines_item-user/metrics_hybrid_10.csv")

100%|██████████| 18884/18884 [1:49:25<00:00,  2.88it/s]


In [24]:
metrics_hybrid_10

(0.0002, 0.0007657937093469053, 0.00043463292882347875)