### **Data Preprocessing**

In [None]:
import pandas as pd
import os
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [None]:
# Move into the data directory so the CSV/model paths used below can stay relative.
# The starting directory is remembered on first execution, so re-running this
# cell does not try to descend into ./data/data.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, 'data'))

In [None]:
def _read_user_itemset_pairs(csv_name, **read_csv_options):
    """Read one two-column CSV and label its columns as ('user', 'itemset')."""
    pairs = pd.read_csv(csv_name, **read_csv_options)
    pairs.columns = ['user', 'itemset']
    return pairs


# Training and validation files are read with header=None (their first line is data).
user_itemset_train = _read_user_itemset_pairs('user_itemset_training.csv', header=None)
user_itemset_valid = _read_user_itemset_pairs('user_itemset_valid_query.csv', header=None)

# NOTE(review): unlike the two reads above, the test file is read with pandas'
# default header handling, so its first line is consumed as a header before the
# columns are renamed. Confirm the file really has a header row; otherwise the
# first test query is silently dropped.
user_itemset_test = _read_user_itemset_pairs('user_itemset_test_query.csv')

In [None]:
# Answer labels for the validation queries. header=None makes pandas treat the first
# line of the file as data, so the column is named 0 (it is read back as
# user_itemset_valid_answer[0] when the accuracy is computed).
user_itemset_valid_answer = pd.read_csv('user_itemset_valid_answer.csv', header=None)
# Bare last expression: display the frame as the cell output.
user_itemset_valid_answer

Unnamed: 0,0
0,1
1,0
2,1
3,0
4,1
...,...
335935,0
335936,1
335937,0
335938,1


In [None]:
# Keep an independent copy of the training pairs with the raw ids: the 'itemset'
# column of user_itemset_train is overwritten with 'Item'-prefixed strings below,
# while this copy is what later indexes col_matrix.
user_itemset_train2 = user_itemset_train.copy()

In [None]:
# Itemset labels get an 'Item' prefix so that itemset nodes stay distinct from user
# nodes when both are added to the same graph.
# The labels are derived from the untouched copy (user_itemset_train2) rather than
# from the column being overwritten, so re-running this cell cannot produce
# 'ItemItem...' labels.
user_itemset_train['itemset'] = ['Item' + str(raw_id) for raw_id in user_itemset_train2['itemset']]

# Distinct users and (prefixed) itemsets seen in training.
train_user = set(user_itemset_train['user'])
train_itemset = set(user_itemset_train['itemset'])

In [None]:
# Display the copy of the training pairs that still holds the raw (unprefixed) ids.
user_itemset_train2

Unnamed: 0,user,itemset
0,41813,9149
1,48831,20181
2,43284,23209
3,24833,3285
4,7304,22173
...,...,...
1343763,15391,20491
1343764,43856,21623
1343765,52852,5935
1343766,36640,7656


In [None]:
# Dense user x itemset interaction matrix: entry [u, s] is 1 when the pair (u, s)
# appears in the training data. Raw integer ids are used directly as row/column
# positions.
pair_users = user_itemset_train2['user'].to_numpy()
pair_itemsets = user_itemset_train2['itemset'].to_numpy()

# Size each axis so the largest id fits even if the ids are not perfectly
# contiguous; the number of distinct ids alone would be too small in that case
# and the assignment below would raise IndexError.
n_user_rows = len(train_user)
n_itemset_cols = len(train_itemset)
if len(pair_users):
    n_user_rows = max(n_user_rows, int(pair_users.max()) + 1)
    n_itemset_cols = max(n_itemset_cols, int(pair_itemsets.max()) + 1)

# Default float64 dtype is kept on purpose: predict() accumulates these entries,
# and a small integer dtype could overflow in that sum.
col_matrix = np.zeros((n_user_rows, n_itemset_cols))

# One vectorised integer-array assignment replaces a Python loop over every row.
col_matrix[pair_users, pair_itemsets] = 1

In [None]:
# Sanity check: the number of non-zero cells equals the number of training rows,
# i.e. every row set a distinct cell (this would be False if the training data
# contained duplicate (user, itemset) pairs).
np.count_nonzero(col_matrix)==len(user_itemset_train2)

True

In [None]:
# Bipartite user-itemset graph: users are tagged bipartite=0 and the
# 'Item'-prefixed itemsets bipartite=1.
B2 = nx.Graph()
B2.add_nodes_from(train_user, bipartite=0)
B2.add_nodes_from(train_itemset, bipartite=1)

# Add every (user, itemset) edge in a single call, pairing the two columns
# positionally instead of doing a per-row label lookup on each Series.
B2.add_edges_from(zip(user_itemset_train['user'], user_itemset_train['itemset']))

#pickle.dump(B2, open('B2.pickle', 'wb'))

### **Apply Node2Vec**

In [None]:
from node2vec import Node2Vec

# Output locations for the trained embeddings
EMBEDDING_FILENAME = './embeddings-b2.emb'
EMBEDDING_MODEL_FILENAME = './embeddings-b2.model'

# Walk settings: constructing Node2Vec precomputes the transition probabilities
# and generates the random walks over B2
walk_settings = dict(dimensions=128, walk_length=10, num_walks=30, workers=20)
node2vec = Node2Vec(B2, **walk_settings)

# Embedding settings: any keyword accepted by gensim.Word2Vec can be passed to fit();
# `dimensions` and `workers` are forwarded automatically from the Node2Vec constructor
fit_settings = dict(window=10, min_count=1, batch_words=20)
model = node2vec.fit(**fit_settings)

# Keep both artefacts for later use: the word2vec-format vectors and the full model
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
model.save(EMBEDDING_MODEL_FILENAME)

Computing transition probabilities:   0%|          | 0/81591 [00:00<?, ?it/s]

Generating walks (CPU: 6): 100%|██████████| 2/2 [00:20<00:00, 10.44s/it]
Generating walks (CPU: 7): 100%|██████████| 2/2 [00:20<00:00, 10.28s/it]
Generating walks (CPU: 8): 100%|██████████| 2/2 [00:20<00:00, 10.37s/it]
Generating walks (CPU: 9): 100%|██████████| 2/2 [00:20<00:00, 10.29s/it]
Generating walks (CPU: 10): 100%|██████████| 2/2 [00:20<00:00, 10.37s/it]
Generating walks (CPU: 11): 100%|██████████| 1/1 [00:10<00:00, 10.31s/it]
Generating walks (CPU: 12): 100%|██████████| 1/1 [00:10<00:00, 10.45s/it]
Generating walks (CPU: 13): 100%|██████████| 1/1 [00:10<00:00, 10.12s/it]
Generating walks (CPU: 14): 100%|██████████| 1/1 [00:10<00:00, 10.31s/it]
Generating walks (CPU: 15): 100%|██████████| 1/1 [00:10<00:00, 10.22s/it]
Generating walks (CPU: 16): 100%|██████████| 1/1 [00:10<00:00, 10.38s/it]
Generating walks (CPU: 18): 100%|██████████| 1/1 [00:10<00:00, 10.18s/it]

Generating walks (CPU: 19): 100%|██████████| 1/1 [00:10<00:00, 10.35s/it]
Generating walks (CPU: 20): 100%|████████

## **Load the best Node2Vec model**

In [None]:
from gensim.models import Word2Vec
#from node2vec import Node2Vec

# Same two paths that the training cell writes to; presumably repeated here so the
# saved model can be loaded without re-running the training cell.
EMBEDDING_FILENAME = './embeddings-b2.emb'
EMBEDDING_MODEL_FILENAME = './embeddings-b2.model'

# Load the full model that was written with model.save(EMBEDDING_MODEL_FILENAME)
model = Word2Vec.load(EMBEDDING_MODEL_FILENAME)

# Alternative (disabled): load only the vectors written by save_word2vec_format.
# NOTE(review): the call below is unlikely to work as written; gensim normally loads
# word2vec-format files through KeyedVectors.load_word2vec_format - confirm before re-enabling.
#model.wv = Word2Vec.wv.load_word2vec_format(EMBEDDING_FILENAME)

In [None]:
# Vocabulary keys are strings (user ids look like '20630', itemsets like 'Item16759'),
# so the integer user id is converted before the lookup. A bare integer cannot match
# any key by name (gensim 4 would interpret it as a vocabulary position instead).
dict(model.wv.most_similar(str(user_itemset_valid['user'][0]))).keys()

dict_keys(['Item16759', '20630', 'Item1652', '40275', 'Item24971', 'Item19603', 'Item25153', 'Item14362', '13922', 'Item16387'])

In [None]:
# Itemset keys carry the 'Item' prefix in the vocabulary, so the key is built from
# the raw id (the same way predict() does) instead of passing the bare integer.
dict(model.wv.most_similar('Item' + str(user_itemset_valid['itemset'][0]))).keys()

dict_keys(['Item5754', 'Item16411', 'Item10182', 'Item18481', 'Item14771', 'Item11457', 'Item4957', '45068', 'Item3986', 'Item21289'])

### Score user–itemset pairs with embedding neighbours

In [None]:
def predict(x, y, corr, user_n, item_n, wv=None):
    """Predict whether user ``x`` is linked to itemset ``y`` (1) or not (0).

    The ``user_n`` embedding neighbours of the user and the ``item_n``
    neighbours of the itemset are collected. Each neighbour casts a vote read
    from ``corr``: an itemset neighbour ``s`` contributes ``corr[x, s]`` and a
    user neighbour ``u`` contributes ``corr[u, y]``. The pair is predicted
    positive when the summed votes exceed 20% of ``user_n + item_n``.

    Parameters
    ----------
    x : int
        Raw user id.
    y : int
        Raw itemset id (without the 'Item' prefix).
    corr : 2-D array indexed as ``corr[user_id, itemset_id]``
        Known user/itemset interactions.
    user_n, item_n : int
        Number of neighbours requested for the user and for the itemset.
    wv : optional
        Object offering ``most_similar(key, topn=...)`` and ``key in wv``.
        Defaults to the notebook-level ``model.wv``.

    Returns
    -------
    int
        1 if the vote total is above the threshold, otherwise 0.
    """
    if wv is None:
        wv = model.wv

    # Vocabulary keys are strings: users as '123', itemsets as 'Item123'.
    # Looking a user up with the raw integer would not address that user's vector.
    user_key = str(x)
    item_key = 'Item' + str(y)

    # Ids that never appeared in training have no embedding; they simply
    # contribute no neighbours instead of raising KeyError.
    neighbours = []
    if user_key in wv:
        neighbours += [key for key, _ in wv.most_similar(user_key, topn=user_n)]
    if item_key in wv:
        neighbours += [key for key, _ in wv.most_similar(item_key, topn=item_n)]

    n_users, n_items = corr.shape
    y_hat = 0
    for key in neighbours:
        if key.startswith('Item'):
            # Similar itemset: did this user interact with it?
            row, col = x, int(key[len('Item'):])
        else:
            # Similar user: did they interact with this itemset?
            row, col = int(key), y
        # Ids outside the matrix carry no known interaction, so they add nothing.
        if 0 <= row < n_users and 0 <= col < n_items:
            y_hat += corr[row, col]

    # The threshold is relative to the number of neighbours requested.
    return 1 if y_hat > (user_n + item_n) * 0.2 else 0

### For validation set

In [None]:
from tqdm import tqdm
# Predictions for every validation query, using 50 user neighbours and 100 itemset neighbours.
y_hat_list1 = []

valid_pairs = zip(user_itemset_valid['user'].to_numpy(), user_itemset_valid['itemset'].to_numpy())
for valid_user, valid_itemset in tqdm(valid_pairs, total=len(user_itemset_valid)):
    y_hat_list1.append(predict(valid_user, valid_itemset, col_matrix, 50, 100))

In [None]:
# Validation accuracy: share of predictions that equal the answer labels (column 0).
valid_truth = list(user_itemset_valid_answer[0])
valid_hits = np.equal(valid_truth, y_hat_list1)
result1 = sum(valid_hits) / len(user_itemset_valid_answer)

### For test set

In [None]:
from tqdm import tqdm
# Predictions for every test query, using 50 user neighbours and 100 itemset neighbours.
y_hat_list = []

test_pairs = zip(user_itemset_test['user'].to_numpy(), user_itemset_test['itemset'].to_numpy())
for test_user, test_itemset in tqdm(test_pairs, total=len(user_itemset_test)):
    y_hat_list.append(predict(test_user, test_itemset, col_matrix, 50, 100))

  7%|██▏                               | 21905/335940 [08:41<2:06:29, 41.38it/s]

In [None]:
# Put the test predictions into a one-column frame, in the order they were produced.
df = pd.DataFrame(y_hat_list)
# Write them without header or index, one prediction per line.
df.to_csv('user_item_test_prediction.csv',header=False, index=False)