In [1]:
import os
import pandas as pd
import numpy as np
from export_modules import *
from modules.WRMF import *
from tqdm.auto import tqdm

# Metrics

In [2]:
%load_ext Cython

In [3]:
%%cython

def average_precision(
        dict data_true,
        dict data_predicted,
        const unsigned long int k
) -> float:
    cdef:
        unsigned long int n_items_predicted
        unsigned long int n_items_true
        unsigned long int n_correct_items
        unsigned long int item_idx

        double average_precision_sum
        double precision

        set items_true
        list items_predicted

    if not data_true:
        raise ValueError('data_true is empty')

    average_precision_sum = 0.0

    for key, items_true in data_true.items():
        items_predicted = data_predicted.get(key, [])

        n_items_true = len(items_true)
        n_items_predicted = min(len(items_predicted), k)

        if n_items_true == 0 or n_items_predicted == 0:
            continue

        n_correct_items = 0
        precision = 0.0

        for item_idx in range(n_items_predicted):
            if items_predicted[item_idx] in items_true:
                n_correct_items += 1
                precision += <double>n_correct_items / <double>(item_idx + 1)

        average_precision_sum += <double>precision / <double>min(n_items_true, k)

    return average_precision_sum / <double>len(data_true)

def metric(true_data, predicted_data, k=20):
    true_data_set = {k: set(v) for k, v in true_data.items()}

    return average_precision(true_data_set, predicted_data, k=k)

# Pathes

In [4]:
data_path = os.getcwd() 
data_path += "/project_6/data/implicit/"

In [5]:
mymedialite_folder = "/home/administrator/libraries/MyMediaLite"

train_path = data_path + "all_datasets/80_10_10/train.csv"
validation_path = data_path + "all_datasets/80_10_10/validation.csv"
test_path = data_path + "all_datasets/80_10_10/test.csv"

path_v="/home/administrator/workplace/work/project_6/80_10_10_validation.csv"
path_t="/home/administrator/workplace/work/project_6/80_10_10_test.csv"

cache_folder="/home/administrator/workplace/work/project_6"+"/cache"

# WRMF for validation part

In [6]:
def validation(mymedialite_folder, train_path, cache_folder, 
               validation_path, path_v,
               prediction_path="/home/administrator/workplace/work/project_6/80_10_10_validation.csv", k=20):
    print("WRMF work starts")
    wrmf = ImplicitRecommendation(path_to_mymedialite=mymedialite_folder,
                                train_path=train_path, 
                                cache_folder=cache_folder, random_seed=42)
    wrmf.train("WRMF", k)
    wrmf.predict(test_path=validation_path, prediction_path=prediction_path, k=k)
    print("Read Data")
    prediction = pd.read_csv(path_v, names=["user", "item", "score"])
    validation = pd.read_csv(validation_path, names=["user", "item", "impl", "is_relevant", "time"])
    validation_for_metrics = validation.loc[validation.is_relevant.astype("bool"), :]
    del validation
    print("Convert test&prediction")
    validation_dict = {}
    for user in tqdm(list(validation_for_metrics.user.unique())):
        validation_dict[user] = set(validation_for_metrics.loc[validation_for_metrics.user == user, "item"])
        
    prediction_dict = {}
    for user in tqdm(list(validation_for_metrics.user.unique())):
        prediction_dict[user] = list(prediction.loc[prediction.user == user, "item"])
    
    del prediction, validation_for_metrics
    print("Compute metrics")
    return average_precision(validation_dict, prediction_dict, k=k)

# 80/10/10

## WRMF for validation part¶

In [7]:
val_val=validation(mymedialite_folder, train_path, cache_folder, 
                   validation_path, path_v,
                   prediction_path="/home/administrator/workplace/work/project_6/80_10_10_validation.csv", k=20)

WRMF work starts
Read Data
Convert test&prediction


HBox(children=(IntProgress(value=0, max=162009), HTML(value='')))




HBox(children=(IntProgress(value=0, max=162009), HTML(value='')))


Compute metrics


In [8]:
val_val

0.025957563819339995

## WRMF for test part

### Train + validation

In [17]:
train = pd.read_csv(train_path, names=["user", "item", "impl", "is_relevant", "time"])
new_train = pd.read_csv(validation_path, names=["user", "item", "impl", "is_relevant", "time"])
new_train = pd.concat([train, new_train])
new_train.to_csv(data_path + "all_datasets/80_10_10/train_val.csv", header=False, index=False)


In [None]:
new_train.head(1).append(new_train.tail(2))

In [25]:
del train, new_train

In [9]:
val_test1 = validation(mymedialite_folder, train_path=data_path + "all_datasets/80_10_10/train_val.csv", 
           cache_folder=cache_folder, 
           validation_path=test_path, path_v=path_t,
           prediction_path="/home/administrator/workplace/work/project_6/80_10_10_test.csv", k=20)

WRMF work starts
Read Data
Convert test&prediction


HBox(children=(IntProgress(value=0, max=154060), HTML(value='')))




HBox(children=(IntProgress(value=0, max=154060), HTML(value='')))


Compute metrics


In [12]:
(val_val + val_test1)/2

0.026293379065953654

### Only train

In [13]:
val_test2 = validation(mymedialite_folder, train_path=train_path, 
           cache_folder=cache_folder, 
           validation_path=test_path, path_v=path_t,
           prediction_path="/home/administrator/workplace/work/project_6/80_10_10_test.csv", k=20)

WRMF work starts
Read Data
Convert test&prediction


HBox(children=(IntProgress(value=0, max=154060), HTML(value='')))




HBox(children=(IntProgress(value=0, max=154060), HTML(value='')))


Compute metrics


In [14]:
val_test2 

0.01821060960529108

In [17]:
(val_val + val_test2)/2

0.022084086712315536

In [18]:
(val_val + val_test1)/2

0.026293379065953654

# 96/2/2

In [19]:
train_path = data_path + "all_datasets/96_2_2/train.csv"
validation_path = data_path + "all_datasets/96_2_2/validation.csv"
test_path = data_path + "all_datasets/96_2_2/test.csv"

path_v="/home/administrator/workplace/work/project_6/96_2_2_validation.csv"
path_t="/home/administrator/workplace/work/project_6/96_2_2_test.csv"

## WRMF for validation part

In [20]:
val_val96=validation(mymedialite_folder, train_path, cache_folder, 
                     validation_path, path_v,
                     prediction_path="/home/administrator/workplace/work/project_6/96_2_2_validation.csv", k=20)

WRMF work starts
Read Data
Convert test&prediction


HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))




HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))


Compute metrics


In [21]:
val_val96

0.02793314639531309

## WRMF for test part

### Train + validation

In [None]:
train = pd.read_csv(train_path, names=["user", "item", "impl", "is_relevant", "time"])
new_train = pd.read_csv(validation_path, names=["user", "item", "impl", "is_relevant", "time"])
new_train = pd.concat([train, new_train])
new_train.to_csv(data_path + "all_datasets/96_2_2/train_val.csv", header=False, index=False)

In [24]:
val_test196 = validation(mymedialite_folder, 
                       train_path=data_path + "all_datasets/96_2_2/train_val.csv", 
                       cache_folder=cache_folder, 
                       validation_path=test_path, path_v=path_t,
                       prediction_path="/home/administrator/workplace/work/project_6/96_2_2_test.csv", k=20)

WRMF work starts
Read Data
Convert test&prediction


HBox(children=(IntProgress(value=0, max=58391), HTML(value='')))




HBox(children=(IntProgress(value=0, max=58391), HTML(value='')))


Compute metrics


In [25]:
val_test196

0.027253747198353804

In [28]:
(val_val96 + val_test196)/2

0.027593446796833447

### Only train

In [29]:
val_test296 = validation(mymedialite_folder, 
                         train_path=train_path, 
                         cache_folder=cache_folder, 
                         validation_path=test_path, path_v=path_t,
                         prediction_path="/home/administrator/workplace/work/project_6/96_2_2_validation.csv", k=20)

WRMF work starts
Read Data
Convert test&prediction


HBox(children=(IntProgress(value=0, max=58391), HTML(value='')))




HBox(children=(IntProgress(value=0, max=58391), HTML(value='')))


Compute metrics


In [30]:
val_test296

0.027253747198353804

In [31]:
(val_val96 + val_test296)/2

0.027593446796833447

# Make submision

In [79]:
test_path="/home/administrator/workplace/work/project_6/data/okko/test.csv"

Concatenate all data

In [13]:
train = pd.read_csv(train_path, names=["user", "item", "impl", "is_relevant", "time"])

In [14]:
validation = pd.read_csv(validation_path, names=["user", "item", "impl", "is_relevant", "time"])

In [15]:
test = pd.read_csv(test_path, names=["user", "item", "impl", "is_relevant", "time"])

In [16]:
big_train = pd.concat([train, validation, test])

In [17]:
big_train.shape

(10403324, 5)

In [19]:
train.shape[0] + test.shape[0] + validation.shape[0]

10403324

In [21]:
del train, test, validation

In [20]:
big_train.to_csv("/home/administrator/workplace/work/project_6/data/implicit/all_datasets/96_2_2/all_for_train.csv",
                 header=False, index=False)

In [22]:
del big_train

In [23]:
big_train_path="/home/administrator/workplace/work/project_6/data/implicit/all_datasets/96_2_2/all_for_train.csv"

Train model and make prediction for earlier known users

In [24]:
wrmf = ImplicitRecommendation(path_to_mymedialite=mymedialite_folder,
                              train_path=big_train_path, 
                              cache_folder=cache_folder, random_seed=42)
wrmf.train("WRMF", 20)

In [28]:
wrmf.predict(test_path=test_path, 
         prediction_path="/home/administrator/workplace/work/project_6/WRMtest.csv", k=20)

In [29]:
prediction = pd.read_csv("/home/administrator/workplace/work/project_6/WRMtest.csv", names=["user", "item", "score"])
prediction.head()

Unnamed: 0,user,item,score
0,0,9491,0.841544
1,0,2327,0.622981
2,0,3137,0.621951
3,0,1650,0.611081
4,0,8040,0.589813


BMF

In [30]:
from modules.BMF import *

for new user (validation)

In [31]:
train = pd.read_csv(big_train_path, names=["user", "item", "impl", "is_relevant", "time"])
train.head(1)

Unnamed: 0,user,item,impl,is_relevant,time
0,283774,6189,1,1,41730630.0


In [32]:
test_path

'/home/administrator/workplace/work/project_6/data/okko/test.csv'

In [34]:
test = pd.read_csv(test_path, names=["user", "item", "impl"])
test.head(1)

Unnamed: 0,user,item,impl
0,0,1,1


In [42]:
new_users = set(test.user.astype("int")) - set(train.user.astype("int"))

In [43]:
new_users

set()

In [44]:
del train, test

In [36]:
new_users = pd.DataFrame(list(new_users), columns=["user"])

In [None]:
new_users["item"] = 1
new_users["rating"] = 1

In [24]:
new_users.to_csv("/home/administrator/workplace/work/project_6/new_users.csv", 
                 index=False, header=False, sep="\t")

In [26]:
del new_users, train, validation

In [29]:
bmf1 = BMF(train_path, mymedialite_folder, cache_folder)
bmf1.clean_before_rec()

In [30]:
bmf1.train(parameters=dict(random_seed=42, num_factors=16, bias_reg=0.01, reg_u_i=0.015,
                              frequency_regularization=False, learn_rate=0.01, bias_learn_rate=1,
                              num_iter=30, bold_driver=False, loss="RMSE",
                              naive_parallelization=False))

The data was successfully prepared for training
The model was successfully trained


In [None]:
rec_nc = bmf1.recommend(test_path, "/home/administrator/workplace/work/project_6/BMFnew.csv", 20)

Conver to submission format

In [45]:
prediction.head()

Unnamed: 0,user,item,score
0,0,9491,0.841544
1,0,2327,0.622981
2,0,3137,0.621951
3,0,1650,0.611081
4,0,8040,0.589813


In [46]:
prediction_to_submit = {}
for user in prediction.user.unique():
    prediction_to_submit[str(user)] = list(prediction.loc[prediction.user == user, "item"])

In [47]:
prediction_to_submit

{'0': [9491,
  2327,
  3137,
  1650,
  8040,
  4601,
  7449,
  3757,
  828,
  5595,
  3947,
  427,
  9817,
  4070,
  1326,
  6955,
  2694,
  9412,
  1636,
  3230],
 '131072': [6127,
  1016,
  905,
  3916,
  2714,
  10084,
  5035,
  9491,
  4290,
  9443,
  828,
  1227,
  1577,
  7079,
  4601,
  747,
  8436,
  4548,
  3137,
  8237],
 '2': [2714,
  3336,
  1521,
  6209,
  3916,
  6195,
  153,
  813,
  2245,
  6573,
  2468,
  5201,
  6127,
  2257,
  5845,
  1016,
  1539,
  9701,
  4141,
  549],
 '262145': [4441,
  6409,
  9179,
  3045,
  546,
  1521,
  1570,
  6236,
  229,
  813,
  5554,
  6195,
  9406,
  6213,
  10061,
  3336,
  1538,
  1653,
  6127,
  3856],
 '524291': [3567,
  2639,
  72,
  5405,
  427,
  8215,
  5644,
  9467,
  3256,
  7185,
  9089,
  8501,
  102,
  3947,
  6606,
  7233,
  2245,
  51,
  5616,
  3101],
 '131077': [72,
  9472,
  427,
  5644,
  4141,
  1636,
  3230,
  8501,
  8215,
  3567,
  1681,
  5405,
  9089,
  7233,
  102,
  4070,
  2030,
  3256,
  5541,
  4519],
 '6

In [49]:
data_path

'/home/administrator/workplace/work/project_6/data/implicit/'

In [50]:
with open("/home/administrator/workplace/work/project_6/subm1.json", 'w') as f:
    json.dump(prediction_to_submit, f)