In [1]:
from export_modules import *
from modules.wrmf.WRMF import *
from tqdm.auto import tqdm
%load_ext Cython

# Metrics

In [2]:
%%cython

def average_precision(
        dict data_true,
        dict data_predicted,
        const unsigned long int k
) -> float:
    cdef:
        unsigned long int n_items_predicted
        unsigned long int n_items_true
        unsigned long int n_correct_items
        unsigned long int item_idx

        double average_precision_sum
        double precision

        set items_true
        list items_predicted

    if not data_true:
        raise ValueError('data_true is empty')

    average_precision_sum = 0.0

    for key, items_true in data_true.items():
        items_predicted = data_predicted.get(key, [])

        n_items_true = len(items_true)
        n_items_predicted = min(len(items_predicted), k)

        if n_items_true == 0 or n_items_predicted == 0:
            continue

        n_correct_items = 0
        precision = 0.0

        for item_idx in range(n_items_predicted):
            if items_predicted[item_idx] in items_true:
                n_correct_items += 1
                precision += <double>n_correct_items / <double>(item_idx + 1)

        average_precision_sum += <double>precision / <double>min(n_items_true, k)

    return average_precision_sum / <double>len(data_true)

def metric(true_data, predicted_data, k=20):
    true_data_set = {k: set(v) for k, v in true_data.items()}

    return average_precision(true_data_set, predicted_data, k=k)

# WRMF validation functions

In [3]:
def WRMF(mymedialite_folder, train_path, 
         validation_path, prediction_path, cache_folder, 
         k=20, random_seed=42, 
         parameters=dict(num_factors=20, regularization=0.015, alpha=1, num_iter=15)):
    print("WRMF work starts")
    num_factors, regularization, alpha, num_iter=parameters["num_factors"], parameters["regularization"], parameters["alpha"], parameters["num_iter"]
    wrmf = ImplicitRecommendation(path_to_mymedialite=mymedialite_folder,
                                  train_path=train_path,
                                  cache_folder=cache_folder, 
                                  random_seed=random_seed)
    print("Train WRMF")
    wrmf.train(recommender="WRMF", k=k, 
               num_factors=num_factors, 
               regularization=regularization, alpha=alpha, num_iter=num_iter)
    print("Predict by WRMF")
    df = wrmf.predict(test_path=validation_path, 
                      save_results=prediction_path, k=k)
    del df

In [4]:
def to_set(df):
    dict_val_rel={}
    for user in tqdm(df.user.unique()):
        dict_val_rel[user] = set(df.loc[df.user == user, "item"])
    return dict_val_rel

In [5]:
def to_list(df):
    dict_pred={}
    for user in tqdm(df.user.unique()):
        dict_pred[user] = list(df.loc[df.user == user, "item"].astype("int"))
    return dict_pred

In [6]:
def validation(validation_path, prediction_path, k,
               names=["user", "item","is_relevant", "impl", "time"]):
    val = pd.read_csv(validation_path, names=names)
    val_rel = val[val.is_relevant.astype("bool")]
    unique_from_val = val_rel.user.unique()
    
    print("Test converting")
    dict_val_rel = to_set(val_rel)
    del val_rel
    
    print("Prediction converting")
    pred = pd.read_csv(prediction_path)
    pred = pred[pred.user.isin(unique_from_val)]
    dict_pred = to_list(pred)
    del pred
    
    return average_precision(dict_val_rel, dict_pred, k)

# Pathes

In [7]:
data_path = os.getcwd() 
data_path += "/project_6/data/implicit/"

In [8]:
mymedialite_folder = "/home/administrator/libraries/MyMediaLite"

train_path = data_path + "all_datasets/96_2_2mixcol/train.csv"
validation_path = data_path + "all_datasets/96_2_2mixcol/validation.csv"
test_path = data_path + "all_datasets/96_2_2mixcol/test.csv"

path_v="/home/administrator/workplace/work/project_6/data/prediction/96_2_2mixcol_validation.csv"
path_t="/home/administrator/workplace/work/project_6/data/prediction/96_2_2mixcol_test.csv"

cache_folder="/home/administrator/workplace/work/project_6"+"/cache"

# 96/2/2

## Validation part

In [9]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path=train_path, 
     validation_path=validation_path, 
     prediction_path=path_v, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=10, regularization=0.015, alpha=1, num_iter=15))

WRMF work starts
Train WRMF
Predict by WRMF


In [10]:
val_val96=validation(validation_path, path_v, 20, 
                     names=["user", "item","is_relevant", "impl", "time"])

Test converting


HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))


Prediction converting


HBox(children=(IntProgress(value=0, max=58919), HTML(value='')))




In [11]:
# default
val_val96

0.02793314639531309

In [12]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path=train_path, 
     validation_path=validation_path, 
     prediction_path=path_v, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=15, regularization=0.015, alpha=1, num_iter=30))

WRMF work starts
Train WRMF
Predict by WRMF


In [13]:
val_val96=validation(validation_path, path_v, 20, 
                     names=["user", "item","is_relevant", "impl", "time"])

Test converting


HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))


Prediction converting


HBox(children=(IntProgress(value=0, max=58919), HTML(value='')))




In [14]:
# num_factors=15, regularization=0.015, alpha=1, num_iter=30
val_val96

0.028713996742539598

In [16]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path=train_path, 
     validation_path=validation_path, 
     prediction_path=path_v, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=15, regularization=0.015, alpha=5, num_iter=35))

WRMF work starts
Train WRMF
Predict by WRMF


In [17]:
val_val96=validation(validation_path, path_v, 20, 
                     names=["user", "item","is_relevant", "impl", "time"])

Test converting


HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))


Prediction converting


HBox(children=(IntProgress(value=0, max=58919), HTML(value='')))




In [18]:
# num_factors=15, regularization=0.015, alpha=5, num_iter=35
val_val96

0.03091348402082651

In [19]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path=train_path, 
     validation_path=validation_path, 
     prediction_path=path_v, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=20, regularization=0.015, alpha=10, num_iter=35))

WRMF work starts
Train WRMF
Predict by WRMF


In [20]:
val_val96=validation(validation_path, path_v, 20, 
                     names=["user", "item","is_relevant", "impl", "time"])

Test converting


HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))


Prediction converting


HBox(children=(IntProgress(value=0, max=58919), HTML(value='')))




In [21]:
# num_factors=20, regularization=0.015, alpha=10, num_iter=35
val_val96

0.031164385398249835

## Test part

In [22]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path="/home/administrator/workplace/work/rekko/data/implicit/all_datasets/96_2_2/train_val.csv", 
     validation_path=test_path, 
     prediction_path=path_t, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=20, regularization=0.015, alpha=10, num_iter=35))

WRMF work starts
Train WRMF
Predict by WRMF


In [25]:
val_val96_t=validation(test_path, path_t, 20, 
                     names=["user", "item","is_relevant", "impl", "time"])

Test converting


HBox(children=(IntProgress(value=0, max=58391), HTML(value='')))


Prediction converting


HBox(children=(IntProgress(value=0, max=55624), HTML(value='')))




In [26]:
val_val96_t

0.03092766235540828

In [27]:
(0.031164385398249835 + 0.03092766235540828)/2

0.03104602387682906

# Submition

In [29]:
test_path="/home/administrator/workplace/work/project_6/data/okko/test.csv"
big_train_path="/home/administrator/workplace/work/project_6/data/implicit/all_datasets/96_2_2mixcol/all_for_train.csv"
prediction_path="/home/administrator/workplace/work/project_6/WRMtest_cf.csv"

In [None]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path=big_train_path, 
     validation_path=test_path, 
     prediction_path=prediction_path, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=20, regularization=0.015, alpha=10, num_iter=35))

WRMF work starts
Train WRMF


In [None]:
prediction = pd.read_csv("/home/administrator/workplace/work/project_6/WRMtest.csv", names=["user", "item", "score"])
prediction.head()

### Impl

In [26]:
mymedialite_folder = "/home/administrator/libraries/MyMediaLite"

train_path = data_path + "all_datasets/96_2_2/train.csv"
validation_path = data_path + "all_datasets/96_2_2/validation.csv"
test_path = data_path + "all_datasets/96_2_2/test.csv"

path_v="/home/administrator/workplace/work/project_6/data/prediction/96_2_2_validation.csv"
path_t="/home/administrator/workplace/work/project_6/data/prediction/96_2_2_test.csv"

cache_folder="/home/administrator/workplace/work/project_6"+"/cache"

In [27]:
WRMF(mymedialite_folder=mymedialite_folder, 
     train_path=train_path, 
     validation_path=validation_path, 
     prediction_path=path_v, 
     cache_folder=cache_folder,
     k=20, random_seed=42, 
     parameters=dict(num_factors=10, regularization=0.015, alpha=1, num_iter=15))

WRMF work starts
Train WRMF
Predict by WRMF


In [28]:
val_val96=validation(validation_path=validation_path, 
                     prediction_path=path_v, k=20,
                      max_threads=3,
                     input_names=["user", "item", "impl", "is_relevant", "time"])

Read Data
Convert test
Convert prediction
Compute metrics


In [29]:
val_val96

0.010444427882973707

In [93]:
mymedialite_folder = "/home/administrator/libraries/MyMediaLite"

train_path = data_path + "all_datasets/96_2_2mixcol/train.csv"
validation_path = data_path + "all_datasets/96_2_2mixcol/validation.csv"
test_path = data_path + "all_datasets/96_2_2mixcol/test.csv"

path_v="/home/administrator/workplace/work/project_6/data/prediction/96_2_2mixcol_validation.csv"
path_t="/home/administrator/workplace/work/project_6/data/prediction/96_2_2mixcol_test.csv"

cache_folder="/home/administrator/workplace/work/project_6"+"/cache"

In [94]:
validation(validation_path, path_v, 20,
           names=["user", "item","is_relevant","impl", "time"])

Test converting


HBox(children=(IntProgress(value=0, max=62666), HTML(value='')))


Prediction converting


HBox(children=(IntProgress(value=0, max=58919), HTML(value='')))




0.02793314639531309

In [77]:
average_precision(dict_val_rel, dict_pred, 20)

0.02517825453976722

In [67]:
average_precision(dict_val_rel, dict_pred, 20)

0.02793314639531309

In [None]:
validation_dict = {}
for user in list(df_part.user.unique()):
    validation_dict[user] = set(df_part.loc[df_part.user == user, "item"])