In [None]:
import csv
import math
import os
import tempfile
import time

import h5py
import numpy as np
import pandas as pd
import requests
import scann

In [None]:
# Dataset selector read by the loader, serialisation and output cells:
# 0 = Netflix, 1 = Amazon Movies and TV, 2 = Amazon Kindle Store, 3 = MovieLens.
input_id =3 #0,1,2,3

In [None]:
def Read_MFdata(dataset_id=None):
    """Load the matrix-factorisation (MF) vectors of one dataset.

    Every non-blank line of the input file is split on whitespace. Only lines
    whose second field is ``"T"`` are used. The first field decides the row
    type: a leading ``"p"`` marks a user vector, a leading ``"q"`` marks an
    item vector whose id is the integer following the ``"q"``. All remaining
    fields are parsed as floats.

    Parameters
    ----------
    dataset_id : int, optional
        0 = Netflix, 1 = Amazon Movies and TV, 2 = Amazon Kindle Store,
        3 = MovieLens. When omitted, the notebook-level ``input_id`` is used.

    Returns
    -------
    data_user : numpy.ndarray
        One row per ``p`` line, in file order.
    data_item : numpy.ndarray
        One row per ``q`` line, in file order.
    item_id_list : list of int
        Item id of each row of ``data_item`` (same order).

    Raises
    ------
    ValueError
        If ``dataset_id`` is not one of the known datasets.
    """
    if dataset_id is None:
        dataset_id = input_id

    file_names = {
        0: "netflix_mf-200.txt",
        1: "amazon_Movies_and_TV_mf-200.txt",
        2: "amazon_Kindle_Store_mf-200.txt",
        3: "MovieLens_mf-200.txt",
    }
    # Fail early with a readable message instead of trying to open the directory.
    if dataset_id not in file_names:
        raise ValueError("unknown dataset id: {!r} (expected 0, 1, 2 or 3)".format(dataset_id))
    file_name = '../dataset/dataset_MF200/' + file_names[dataset_id]

    list_p = []
    list_q = []
    item_id_list = []

    with open(file_name) as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines have no flag field; skip them rather than crash.
            if len(fields) < 2 or fields[1] != "T":
                continue

            tag = fields[0]
            if tag[0] == "p":
                list_p.append([float(value) for value in fields[2:]])
            elif tag[0] == "q":
                item_id_list.append(int(tag[1:]))
                list_q.append([float(value) for value in fields[2:]])

    data_user = np.array(list_p)
    data_item = np.array(list_q)
    return data_user, data_item, item_id_list

In [None]:
def Read_i2v_data(dataset_id=None):
    """Load the item2vec item vectors of one dataset.

    Every non-blank line is split on whitespace; the first field is dropped
    and the remaining fields are parsed as floats to form one row.

    Parameters
    ----------
    dataset_id : int, optional
        0 = Netflix, 1 = Amazon Movies and TV, 2 = Amazon Kindle Store,
        3 = MovieLens. When omitted, the notebook-level ``input_id`` is used.

    Returns
    -------
    numpy.ndarray
        One row per non-blank line, in file order.

    Raises
    ------
    ValueError
        If ``dataset_id`` is not one of the known datasets.
    """
    if dataset_id is None:
        dataset_id = input_id

    file_names = {
        0: "netflix_item2vec_d-200.txt",
        1: "amazon_Movie_item2vec_d-200.txt",
        2: "amazon_Kindle_item2vec_d-200.txt",
        3: "MovieLens_item2vec_d-200.txt",
    }
    # Fail early with a readable message instead of trying to open the directory.
    if dataset_id not in file_names:
        raise ValueError("unknown dataset id: {!r} (expected 0, 1, 2 or 3)".format(dataset_id))
    file_name = '../dataset/dataset_item2vec/' + file_names[dataset_id]

    item_data = []

    with open(file_name) as f:
        for line in f:
            fields = line.split()
            # A blank line (e.g. a trailing newline) carries no vector; skip it.
            if not fields:
                continue
            # fields[0] is the leading token of the line; only the numbers after it are kept.
            item_data.append([float(value) for value in fields[1:]])

    return np.array(item_data)

In [None]:
def Read_para():
    """Read the experiment parameters ``k`` and ``lamda`` from their text files.

    ``k`` is read as an int from ``../parameter/k.txt`` and ``lamda`` as a
    float from ``../parameter/lamda.txt``. In each file the first
    whitespace-separated field of the last non-blank line is used.

    Returns
    -------
    tuple
        ``(k, lamda)``.

    Raises
    ------
    ValueError
        If a parameter file contains no value, or the value cannot be parsed.
    """
    k = _read_last_value('../parameter/k.txt', int)
    lamda = _read_last_value('../parameter/lamda.txt', float)
    return k, lamda


def _read_last_value(file_name, cast):
    """Return ``cast`` applied to the first field of the last non-blank line of ``file_name``."""
    value = None
    with open(file_name) as f:
        for line in f:
            fields = line.split()
            # Skip blank lines so a trailing newline does not raise IndexError.
            if fields:
                value = cast(fields[0])
    if value is None:
        raise ValueError("no parameter value found in " + file_name)
    return value

In [None]:
def Read_forcus_users(dataset_id=None):
    """Load the ids of the focus users of one dataset.

    Reads ``./forcus_users/<dataset>.csv`` and takes the first comma-separated
    field of every non-blank line. The field is parsed as a float and then
    truncated to int, so values such as ``"12.0"`` are accepted.

    Parameters
    ----------
    dataset_id : int, optional
        0 = Netflix, 1 = Amazon Movies and TV, 2 = Amazon Kindle Store,
        3 = MovieLens. When omitted, the notebook-level ``input_id`` is used.

    Returns
    -------
    list of int
        User ids in file order.

    Raises
    ------
    ValueError
        If ``dataset_id`` is unknown, or a first field is not numeric.
    """
    if dataset_id is None:
        dataset_id = input_id

    file_names = {
        0: "netflix.csv",
        1: "amazon_M.csv",
        2: "amazon_K.csv",
        3: "MovieLens.csv",
    }
    # Fail early with a readable message instead of trying to open the directory.
    if dataset_id not in file_names:
        raise ValueError("unknown dataset id: {!r} (expected 0, 1, 2 or 3)".format(dataset_id))
    file_name = './forcus_users/' + file_names[dataset_id]

    forcus_user = []

    with open(file_name) as f:
        for line in f:
            line = line.rstrip()
            # float('') would raise on an empty line (e.g. a trailing newline); skip it.
            if not line:
                continue
            forcus_user.append(int(float(line.split(",")[0])))

    return forcus_user

In [None]:
def get_scale(data):
    """Return the L2 norm of the per-column value ranges of ``data``.

    For every column of the 2-D array ``data`` the range (max - min) is
    computed; the result is the Euclidean norm of that vector of ranges.
    """
    col_max = np.max(data, axis=0)
    col_min = np.min(data, axis=0)

    # One range per column, in column order.
    ranges = np.array([col_max[col] - col_min[col] for col in range(data.shape[1])])
    return np.linalg.norm(ranges, ord=2)


In [None]:
def compute_min_dist(data, index_vec):
    """Return the smallest Euclidean distance between any two selected rows.

    ``index_vec`` holds row indices into ``data``; every unordered pair of
    positions in ``index_vec`` is compared once. With fewer than two indices
    there are no pairs and ``min`` raises ``ValueError``.
    """
    count = len(index_vec)
    pair_dists = [
        np.linalg.norm(data[index_vec[a]] - data[index_vec[b]])
        for a in range(count)
        for b in range(a + 1, count)
    ]
    return min(pair_dists)

In [None]:
# Load the MF data: user vectors, item vectors, and the item id of each item
# row (item_id_list[i] is the id of row i of item_mf_data).
user_data, item_mf_data, item_id_list = Read_MFdata()

In [None]:
# Load the item2vec item vectors (one row per line of the input file).
item_i2v_data= Read_i2v_data()

In [None]:
# Load the parameters: k (number of neighbours requested per query) and
# lamda (weight between the inner-product term and the distance term of the score).
k, lamda = Read_para()

In [None]:
# Load the focus users; each entry is used as a row index into user_data.
forcus_user_list = Read_forcus_users()

In [None]:
# Compute the multiplier applied to the distance term of the score.
# raw_scale is the L2 norm of the per-dimension value ranges of the item2vec vectors.
raw_scale = get_scale(item_i2v_data)
print(raw_scale)
# NOTE(review): the constant 5 presumably matches the range of the inner-product
# term (e.g. a maximum rating) — confirm.
scale = float(5 / raw_scale)
print(scale)

In [None]:
# Partitioning parameters for the ScaNN tree built in the next cell.
# Number of leaves: square root of the number of items.
tmp_leaves = int(math.sqrt(item_mf_data.shape[0]))
# Search a tenth of the leaves, but never fewer than one: with fewer than 100
# items the plain division would yield 0 leaves to search.
tmp_leaves_search = max(1, tmp_leaves // 10)
# Number of points sampled to train the partitioning.
train_number = 2500
print(tmp_leaves)
print(tmp_leaves_search)

In [None]:
# Build the ScaNN searcher over the MF item vectors using dot-product similarity:
# a partitioning tree followed by asymmetric-hashing scoring. No reordering step
# is configured here.
searcher = (
    scann.scann_ops_pybind.builder(item_mf_data, 10, "dot_product")
    .tree(
        num_leaves=tmp_leaves,
        num_leaves_to_search=tmp_leaves_search,
        training_sample_size=train_number,
    )
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .build()
)


In [None]:
# Serialize the searcher into the artefact directory of the selected dataset.
# NOTE(review): datasets 0 and 1 use 'wo1' while 2 and 3 use 'wo' — confirm this is intended.
artefact_dirs = {
    0: '.scann_artefacts/netflix/wo1/',
    1: '.scann_artefacts/amazon_M/wo1/',
    2: '.scann_artefacts/amazon_K/wo/',
    3: '.scann_artefacts/MovieLens/wo/',
}
artefact_dir = artefact_dirs.get(input_id)
# An unknown input_id is silently ignored.
if artefact_dir is not None:
    os.makedirs(artefact_dir, exist_ok=True)
    searcher.serialize(artefact_dir)


In [None]:
# Run the top-k search for every focus user and assemble one output row per user.
# All accumulators are (re)created here so re-running the cell starts from scratch.
result_id = []        # item ids of the neighbours, per user
result_index = []     # searcher row indices of the neighbours, per user
result_time = []      # search time in milliseconds, per user
result_dist_min = []  # minimum pairwise item2vec distance among the neighbours, per user
result_ip_sum = []    # sum of inner products with the neighbours, per user
result_score = []     # final score, per user
ip_list = []          # every individual inner product, across all users
result = []           # output rows: user, score, min distance, time (ms), item ids...

for query_id in forcus_user_list:
    query_vec = user_data[query_id]

    # Time only the search call. perf_counter() is a monotonic, high-resolution
    # clock; time.time() can be too coarse for millisecond-scale measurements.
    start = time.perf_counter()
    neighbors, distances = searcher.search(query_vec, final_num_neighbors=k)
    end = time.perf_counter()
    elapsed_ms = 1000 * (end - start)

    # Exact inner products with the returned items, each computed once.
    # Iterating over `neighbors` instead of range(k) avoids an IndexError when
    # the searcher returns fewer than k results.
    inner_products = [np.dot(query_vec, item_mf_data[idx]) for idx in neighbors]
    ip_list.extend(inner_products)
    ip_sum = sum(inner_products)

    # Map searcher row indices back to the item ids read from the MF file.
    item_ids = [item_id_list[idx] for idx in neighbors]

    # NOTE(review): assumes row i of item_i2v_data describes the same item as
    # row i of item_mf_data — confirm against the data files.
    dist_min = compute_min_dist(item_i2v_data, neighbors)

    # Weighted sum of the mean inner product and the rescaled minimum distance.
    score = float((lamda * ip_sum) / k) + scale * (1 - lamda) * dist_min

    result_index.append(neighbors)
    result_time.append(elapsed_ms)
    result_ip_sum.append(ip_sum)
    result_id.append(item_ids)
    result_dist_min.append(dist_min)
    result_score.append(score)

    # The inner-product sum is intentionally not part of the output row.
    result.append([query_id, score, dist_min, elapsed_ms] + item_ids)

In [None]:
def Output_Result(data):
    """Write ``data`` (an iterable of rows) as a CSV file in the current directory.

    The file name is built from the notebook-level ``input_id``, ``k`` and
    ``lamda``: ``<dataset prefix><k>_lam-<lamda>.csv``.
    """
    # Dataset-specific name prefixes. The Netflix prefix ends in "k-" while the
    # others end in "k"; existing file names are kept exactly as they were.
    prefix_by_dataset = {
        0: "./netflix200_k-",
        1: "./amazon_M200_k",
        2: "./amazon_K200_k",
        3: "./MovieLens200_k",
    }

    file_name = "./"
    if input_id in prefix_by_dataset:
        file_name += prefix_by_dataset[input_id] + str(k) + "_lam-" + str(lamda) + ".csv"

    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', newline= '') as out_file:
        csv.writer(out_file).writerows(data)

In [None]:
# Write one CSV row per focus user: user, score, minimum distance, search time (ms), then the item ids.
Output_Result(result)