In [1]:
import logging
import multiprocessing
import os
from datetime import datetime

import numpy as np
import pandas as pd

from lib.funcs import (
    compute_indicators,
    compute_final_stats,
    normalize_with_d,
    init_random_array,
    normalize_with_self,
    update_coefs,
    compute_likelihood,
    compute_prod_dist,
)
from lib.utils import get_data, check_data

In [14]:
# Constants
# File names of the training and test sets; they are joined with the data
# directory when the data is loaded.
train_set = "train.csv"
test_set = "test.csv"
# Number of runs; one seed per run is drawn from the random generator.
sampling = 1
# Number of groups of users
k = 3
# Number of groups of items
l = 6
# Iterations
# To plateau the coefficients the minimum is 600
# NOTE(review): the value below is under that stated minimum -- confirm that a
# shorter run is intentional here.
iterations = 200
# Seed for the NumPy random generator.
seed = 1

In [15]:
# Timestamp taken at the start of the run.
start_time = datetime.now()

# Initiate the random state
rng = np.random.default_rng(seed)
# Create seeds for each process (one integer in [1, 10000) per run)
seeds = list(rng.integers(low=1, high=10000, size=sampling))

logger = logging.getLogger("MMSBM")
logging.basicConfig(level=logging.INFO)
logger.info(f"Running {sampling} runs of {iterations} iterations.")

# Get data
data_dir = os.path.join(os.getcwd(), "data")
train = get_data(os.path.join(data_dir, train_set))
check_data(train)
test = get_data(os.path.join(data_dir, test_set))
check_data(test)

# Create a few dicts with the relationships:
#   d0 maps each column-0 value of `train` to the list of column-1 values it
#      is linked with;
#   d1 maps each column-1 value to the list of column-0 values it is linked with.
# Presumably column 0 holds user ids and column 1 item ids -- TODO confirm
# against lib.utils.get_data.
# TODO: think whether initialization with 0 is needed (the fill step further
# down already adds every id in range(p + 1) / range(m + 1), which includes 0).
d0 = {0: []}
d1 = {0: []}
# dict.update with a dict comprehension replaces the former side-effect list
# comprehensions: same resulting mapping, but no throwaway list of None, no
# stray cell output and no loop variable leaked into the notebook namespace.
d0.update({a: list(train[train[:, 0] == a, 1]) for a in set(train[:, 0])})
d1.update({a: list(train[train[:, 1] == a, 0]) for a in set(train[:, 1])})
# Distinct values found in column 2, in ascending order, and how many there are.
ratings = sorted(set(train[:, 2]))
r = len(ratings)
# Largest ids present in column 0 and column 1 respectively.
p = int(train[:, 0].max())
m = int(train[:, 1].max())

# If, for some reason, there are missing links, we need to fill them:
# every id in 0..p (resp. 0..m) without an entry gets its own empty list.
d0.update({a: [] for a in set(range(p + 1)).difference(d0)})
d1.update({a: [] for a in set(range(m + 1)).difference(d1)})

INFO:MMSBM:Running 1 runs of 200 iterations.


[]

100%|██████████| 10/10 [00:03<00:00,  3.04it/s]


In [17]:
# Display the sum of the "likelihood" entry of `aa`.
# NOTE(review): `aa` is not assigned in any cell visible here -- presumably it
# is the result of the fitting run; confirm it is defined above this cell so
# the notebook survives Restart & Run All.
aa["likelihood"].sum()

-48338.77380245385

In [18]:
# Compute `rat` from aa["rat"], the test set and the sorted rating values.
# NOTE(review): the exact meaning of the result is defined by
# lib.funcs.compute_indicators, which is not visible here.
rat = compute_indicators(aa["rat"], test, ratings)

In [19]:
# Unpack the four summary statistics that compute_final_stats returns for `rat`.
accuracy, mae, s2, s2pond = compute_final_stats(rat)

In [20]:
# Display the `mae` statistic (presumably the mean absolute error -- confirm
# in lib.funcs.compute_final_stats).
mae

0.7463486992240986

In [10]:
# Load the train and test CSVs as header-less frames, so pandas labels the
# columns with the integers 0, 1, 2, ...
# NOTE(review): the paths are hard-coded relative to the working directory and
# duplicate the train_set / test_set constants.
a = pd.read_csv("data/train.csv", header=None)
b = pd.read_csv("data/test.csv", header=None)

In [11]:
# Offset column 0 of the test frame by (largest column-0 value of the train
# frame + 1) so that, for non-negative ids, the two frames no longer share any
# column-0 value.
# NOTE: this mutates `b` in place, so the cell is not idempotent -- re-running
# it shifts the ids again. Reload `b` before re-running.
b.loc[:, 0] += a.loc[:, 0].max() + 1
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the drop-in equivalent (rows of `a` followed by rows of `b`, original
# index labels kept).
c = pd.concat([a, b])

In [12]:
# Write the combined frame to data/all.csv.
# NOTE(review): to_csv writes the index column and a header row by default,
# whereas train.csv / test.csv are read with header=None. If all.csv is meant
# to be consumed the same way, index=False and header=False are probably
# wanted -- confirm against the reader.
c.to_csv("data/all.csv")
