In [1]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import json
import cvxpy as cp
from collections import defaultdict, Counter
from tqdm import tqdm
import matplotlib.pyplot as plt

import seaborn as sns

# Use seaborn's "whitegrid" theme for every figure drawn in this notebook.
sns.set_style("whitegrid")

  """


In [17]:
# Positions used to index `problem.variables()` on the problems returned by
# `constrained_maxmin_user_given_item`: first the allocation matrix, then rho.
WEIGHTS_IDX, RHO_IDX = 0, 1

def sample_utility(m, n, sample_m, sample_n, W, rng=None):
    """Draw a random sub-matrix of ``W`` without replacement.

    Args:
        m: Number of rows to sample from (row indices ``0 .. m - 1``).
        n: Number of columns to sample from (column indices ``0 .. n - 1``).
        sample_m: Number of distinct rows to draw.
        sample_n: Number of distinct columns to draw.
        W: 2-D array indexed as ``W[row, column]``.
        rng: Optional ``np.random.Generator``. Pass a seeded generator to make
            the draw reproducible; when omitted a fresh, unseeded generator is
            created (the previous behaviour).

    Returns:
        Array of shape ``(sample_m, sample_n)`` holding the selected rows and
        columns of ``W``, in the (random) order in which they were drawn.

    Raises:
        ValueError: Raised by ``rng.choice`` when ``sample_m > m`` or
            ``sample_n > n``.
    """
    if rng is None:
        rng = np.random.default_rng()

    users = rng.choice(m, size=sample_m, replace=False)
    items = rng.choice(n, size=sample_n, replace=False)
    return W[users][:, items]


def sample_candidate_items(rel_matrix_users: np.ndarray, n_candidates: int) -> np.ndarray:
    """Restrict the matrix to columns that are a top candidate for some row.

    For every row the ``n_candidates`` columns with the largest values are
    determined; the union of those columns (in ascending column order) is kept.

    Args:
        rel_matrix_users: 2-D array with one row per user.
        n_candidates: Number of highest-valued columns considered per row.

    Returns:
        Array with the same number of rows as the input and one column per
        distinct candidate column.
    """
    ranked_columns = np.argsort(rel_matrix_users, axis=1)
    per_row_top = ranked_columns[:, -n_candidates:]
    candidate_columns = np.unique(per_row_top)
    row_index = np.arange(len(rel_matrix_users)).reshape(-1, 1)
    return rel_matrix_users[row_index, candidate_columns]


def sample_users_from_groups(
    users_count: int,
    items_count: int,
    groups_map: list[dict],
    group_name: str,
    data: np.ndarray,
    naive_sampling: bool = True,
) -> tuple[np.ndarray, list[int]]:
    """Sample users proportionally to group size, then pick the item columns.

    Each group receives ``round(users_count * group_size / total_users)`` slots,
    but never fewer than one and never more than the group actually contains.

    Args:
        users_count: Target number of users across all groups.
        items_count: Number of items to draw (naive sampling) or number of top
            candidates per user (candidate sampling).
        groups_map: Records that each contain a ``"user_id"`` entry and a
            ``group_name`` entry. ``user_id`` values are used as row indices
            into ``data``.
        group_name: Key of the group attribute inside each record.
        data: 2-D array indexed as ``data[user_id, item]``.
        naive_sampling: If True, items are drawn uniformly without replacement;
            otherwise ``sample_candidate_items`` selects the union of each
            sampled user's top ``items_count`` items.

    Returns:
        ``(sub_matrix, sampled_users)`` where row ``r`` of ``sub_matrix``
        belongs to ``sampled_users[r]``. Users are listed group by group, in
        the order in which groups first appear in ``groups_map``.
    """
    # A single generator drives both the user and the item draw.
    rng = np.random.default_rng()

    group_users = defaultdict(list)
    for row in groups_map:
        group_users[row[group_name]].append(row["user_id"])

    total_users = len(groups_map)
    sampled_users: list[int] = []
    for users in group_users.values():
        quota = max(round(users_count * len(users) / total_users), 1)
        # Sampling without replacement cannot return more users than the group
        # holds, so cap the quota instead of letting `choice` raise.
        quota = min(quota, len(users))
        sampled_users.extend(rng.choice(users, size=quota, replace=False).tolist())

    if naive_sampling:
        items = rng.choice(data.shape[1], size=items_count, replace=False)
        return data[sampled_users][:, items], sampled_users

    rel_matrix_sampled = data[sampled_users]
    top_items = sample_candidate_items(rel_matrix_sampled, items_count)
    return top_items, sampled_users


def best_unconstrained(rel_matrix: np.ndarray, k_rec: int) -> cp.Problem:
    """Solve for the allocation that maximizes the worst-off user's utility.

    Every user (row) is assigned exactly ``k_rec`` items; a user's utility is
    the sum of ``rel_matrix`` entries over the assigned items. No item-side
    constraint is imposed.

    Args:
        rel_matrix: 2-D array with one row per user and one column per item.
        k_rec: Number of items assigned to each user.

    Returns:
        The cvxpy problem after ``solve`` has been called with the SCIP solver.
    """
    n_users, n_items = rel_matrix.shape
    allocation = cp.Variable((n_users, n_items), boolean=True)

    # each user receives exactly k_rec items
    exactly_k_items = cp.sum(allocation, axis=1) == k_rec

    per_user_utility = cp.sum(cp.multiply(allocation, rel_matrix), axis=1)
    objective = cp.Maximize(cp.min(per_user_utility))

    problem = cp.Problem(objective, [exactly_k_items])
    problem.solve(solver=cp.SCIP)
    return problem





def constrained_maxmin_item_given_user(rel_matrix: np.ndarray, k_rec: int) -> cp.Problem:
    """Maximize the exposure of the least-recommended item.

    Only the shape of ``rel_matrix`` is used: each row (user) gets exactly
    ``k_rec`` items, and the objective is the smallest column sum of the
    binary allocation, i.e. the minimum number of users any item reaches.

    Args:
        rel_matrix: 2-D array whose shape fixes the allocation's shape.
        k_rec: Number of items assigned to each user.

    Returns:
        The cvxpy problem after ``solve`` has been called with the SCIP solver.
    """
    allocation = cp.Variable(rel_matrix.shape, boolean=True)

    # each user receives exactly k_rec items
    exactly_k_items = cp.sum(allocation, axis=1) == k_rec

    # number of users each item is recommended to
    per_item_exposure = cp.sum(allocation, axis=0)
    objective = cp.Maximize(cp.min(per_item_exposure))

    problem = cp.Problem(objective, [exactly_k_items])
    problem.solve(solver=cp.SCIP)
    return problem


def get_user_curve(
    rel_matrix: np.ndarray, k_rec: int, gamma_points: list[float], groups=None
) -> tuple[list[tuple[float, float]], list[np.ndarray], list[np.ndarray]]:
    """Trace the user-side objective as the item-side guarantee is tightened.

    The item-side optimum is computed once, and the user-side problem is solved
    once without any item guarantee (``v = 0``) as a baseline. The user-side
    problem is then re-solved for every ``gamma`` with the guarantee
    ``gamma * item optimum``.

    Args:
        rel_matrix: 2-D array with one row per user and one column per item.
        k_rec: Number of items recommended to each user.
        gamma_points: Fractions of the item-side optimum to guarantee.
        groups: Integer group label for every row of ``rel_matrix``. When
            omitted, all users are placed in a single group.

    Returns:
        ``(pairs, u_utils, i_utils)`` with one entry per gamma:
          * ``pairs``: ``(gamma, objective / baseline objective)``.
          * ``u_utils``: per-user utility divided by the per-user baseline utility.
          * ``i_utils``: per-item column sums of the allocation.

    NOTE(review): both normalisations divide by baseline quantities that are
    not checked for being zero -- confirm this cannot happen for the data used.
    """
    if groups is None:
        # Callers without group information get one all-encompassing group.
        groups = np.zeros(rel_matrix.shape[0], dtype=int)

    item_min_max = constrained_maxmin_item_given_user(rel_matrix, k_rec).value

    # Baseline: user-side problem with no item guarantee.
    user_min_max = constrained_maxmin_user_given_item(rel_matrix, groups, k_rec, 0)
    user_min_max_val = user_min_max.value
    print(user_min_max.variables()[RHO_IDX].value)
    user_min_max_val_user_level = (user_min_max.variables()[WEIGHTS_IDX].value * rel_matrix).sum(axis=1)

    pairs = []
    u_utils = []
    i_utils = []
    for gamma_item in gamma_points:
        print(gamma_item)
        v_user_result = constrained_maxmin_user_given_item(
            rel_matrix,
            groups,
            k_rec,
            gamma_item * item_min_max,
        )
        # Look the solved allocation up once and reuse it for both utilities.
        allocation = v_user_result.variables()[WEIGHTS_IDX].value
        u_utils.append((rel_matrix * allocation).sum(axis=1) / user_min_max_val_user_level)
        i_utils.append(allocation.sum(axis=0))
        pairs.append((gamma_item, v_user_result.value / user_min_max_val))

    return pairs, u_utils, i_utils


def get_curves_user_groups(
    rel_matrix: np.ndarray,
    u_sample: int,
    i_sample: int,
    n_runs: int,
    gamma_points: list[float],
    k_rec: int,
    groups_map: list[dict[str, int]],
    group_name: str,
    naive_sampling: bool = True,
) -> tuple[np.ndarray, list[list[int]], list[list[np.ndarray]]]:
    """Repeat the group-aware trade-off experiment on ``n_runs`` random samples.

    Args:
        rel_matrix: Full 2-D array with one row per user and one column per item.
        u_sample: Target number of users per run.
        i_sample: Number of items per run (see ``sample_users_from_groups``).
        n_runs: Number of independent random draws.
        gamma_points: Item-side guarantee fractions passed to ``get_user_curve``.
        k_rec: Number of items recommended to each user.
        groups_map: Records containing ``"user_id"`` and ``group_name`` entries.
        group_name: Key of the group attribute used to stratify users.
        naive_sampling: Forwarded to ``sample_users_from_groups``.

    Returns:
        ``(pairs, users_ids, u_utils)`` where ``pairs`` stacks the
        ``(gamma, normalised objective)`` pairs of every run, ``users_ids[r]``
        lists the users sampled in run ``r`` (in matrix row order), and
        ``u_utils[r]`` holds that run's per-gamma, per-user normalised utilities.
    """
    # One lookup table replaces repeated linear scans over `groups_map`.
    group_of_user = {row["user_id"]: row[group_name] for row in groups_map}

    all_empirical_pairs = []
    users_ids = []
    u_utils = []
    for _ in tqdm(range(n_runs)):
        rel_matrix_sampled, _users_ids = sample_users_from_groups(
            u_sample, i_sample, groups_map, group_name, rel_matrix, naive_sampling
        )
        # Labels must follow the row order of the sampled matrix (the order of
        # `_users_ids`), not the order of `groups_map`; otherwise rows are
        # attributed to the wrong group inside the optimisation.
        user_groups = [group_of_user[user_id] for user_id in _users_ids]
        # convert user groups to integers, numbered by first appearance
        label_ids = {}
        groups = np.array([label_ids.setdefault(label, len(label_ids)) for label in user_groups])

        pairs, _u_utils, _ = get_user_curve(rel_matrix_sampled, k_rec, gamma_points, groups)
        users_ids.append(_users_ids)
        u_utils.append(_u_utils)
        all_empirical_pairs.append(pairs)

    return np.array(all_empirical_pairs), users_ids, u_utils


def get_curves_user_default(
    rel_matrix: np.ndarray,
    u_sample: int,
    i_sample: int,
    n_runs: int,
    gamma_points: list[float],
    k_rec: int,
    groups_map: list[dict[str, int]],
    group_name: str,
    naive_sampling: bool = True,
) -> tuple[np.ndarray, list[list[int]], list[list[np.ndarray]], list[list[np.ndarray]]]:
    """Repeat the trade-off experiment on uniformly sampled users and items.

    Users are not stratified here, so every sampled user is placed in one
    common group before the curve is computed.

    Args:
        rel_matrix: Full 2-D array with one row per user and one column per item.
        u_sample: Number of users drawn per run.
        i_sample: Number of items drawn per run.
        n_runs: Number of independent random draws.
        gamma_points: Item-side guarantee fractions passed to ``get_user_curve``.
        k_rec: Number of items recommended to each user.
        groups_map: Unused; kept so the signature mirrors ``get_curves_user_groups``.
        group_name: Unused; see ``groups_map``.
        naive_sampling: Unused; see ``groups_map``.

    Returns:
        ``(pairs, users_ids, u_utils, i_utils)``. ``users_ids`` is always an
        empty list because ``sample_utility`` does not report which rows it
        drew (NOTE(review): confirm no caller relies on it).
    """
    all_empirical_pairs = []
    users_ids = []
    u_utils = []
    i_utils = []
    for _ in tqdm(range(n_runs)):
        rel_matrix_sampled = sample_utility(
            rel_matrix.shape[0], rel_matrix.shape[1], u_sample, i_sample, rel_matrix
        )
        # `get_user_curve` needs one group label per row; without grouping
        # information all users share label 0.
        single_group = np.zeros(rel_matrix_sampled.shape[0], dtype=int)
        pairs, _u_utils, _i_utils = get_user_curve(rel_matrix_sampled, k_rec, gamma_points, single_group)
        u_utils.append(_u_utils)
        i_utils.append(_i_utils)
        all_empirical_pairs.append(pairs)

    return np.array(all_empirical_pairs), users_ids, u_utils, i_utils


In [4]:
# Matrix that is later passed to the curve functions as the relevance matrix
# (rows are users, columns are items, as unpacked below).
u_rel_matrx = np.load("amazon_predictions.npy")

# List of records; each is read later through its "user_id" key and a group key.
with open("amazon_user_groups.json") as groups_file:
    user_groups = json.load(groups_file)

N_USERS, N_ITEMS = u_rel_matrx.shape

In [20]:

def relu(x):
    """Zero out the non-positive part of ``x`` by multiplying with ``x > 0``."""
    is_positive = x > 0
    return x * is_positive

def constrained_maxmin_user_given_item(
    rel_matrix: np.ndarray, groups: list[int], k_rec: int, v: float, alpha: float = 0.95
) -> cp.Problem:
    """Minimize mean user loss plus the spread of per-group risk terms.

    A user's loss is ``1 - utility / best achievable utility``, where the best
    achievable utility is the sum of that user's ``k_rec`` largest relevance
    values. The objective adds the mean loss over all users to the standard
    deviation, across groups, of a CVaR-style term built from each group's losses.

    Args:
        rel_matrix: 2-D array with one row per user and one column per item.
        groups: Group label for every row of ``rel_matrix`` (list or array).
        k_rec: Number of items recommended to each user.
        v: Lower bound on every item's column sum of the allocation.
        alpha: Confidence level of the CVaR-style term; must lie in ``[0, 1)``.

    Returns:
        The cvxpy problem after ``solve`` has been called with the SCIP solver.
        Its variables are the binary allocation matrix and the scalar ``rho``.

    Raises:
        ValueError: If ``groups`` does not hold exactly one label per user or
            ``alpha`` is outside ``[0, 1)``.
    """
    # A plain list would make `groups == group` a scalar instead of a mask.
    groups = np.asarray(groups)
    if groups.shape != (rel_matrix.shape[0],):
        raise ValueError(
            f"groups must contain one label per user: expected shape ({rel_matrix.shape[0]},), got {groups.shape}"
        )
    if not 0 <= alpha < 1:
        raise ValueError(f"alpha must be in [0, 1), got {alpha}")

    x_alloc = cp.Variable(rel_matrix.shape, boolean=True)
    # utility each user would get from their own k_rec most relevant items
    best_alloc = np.sort(rel_matrix, axis=1)[:, -k_rec:].sum(axis=1)
    rho = cp.Variable()

    # constraints
    constraints = [
        # recommend k items
        cp.sum(x_alloc, axis=1) == k_rec,
        # minimal item utility must be at least v
        cp.sum(x_alloc, axis=0) >= v,
        rho >= 0,  # ensure CVaR risk threshold is non-negative
    ]

    alpha_norm = 1 / (1 - alpha)

    overall_loss = 1 - (cp.sum(cp.multiply(x_alloc, rel_matrix), axis=1) / best_alloc)

    group_losses = []
    for group in np.unique(groups):
        group_mask = groups == group
        group_alloc_per_row = cp.sum(cp.multiply(x_alloc[group_mask], rel_matrix[group_mask]), axis=1)
        group_loss_per_row = 1 - (group_alloc_per_row / best_alloc[group_mask])
        # NOTE(review): the excess `loss - rho` is not clipped at zero, so this
        # term is affine in rho and the same rho enters every group -- confirm
        # this is the intended risk measure.
        group_cvar = rho + alpha_norm * cp.sum(group_loss_per_row - rho) / group_mask.sum()
        group_losses.append(group_cvar)

    group_losses = cp.vstack(group_losses)

    # minimize the mean user loss plus the dispersion of the group risk terms
    problem = cp.Problem(
        cp.Minimize(cp.mean(overall_loss) + cp.std(group_losses)),
        constraints,
    )
    problem.solve(solver=cp.SCIP)

    return problem



# Experiment settings shared by every grouping attribute.
gamma_points = np.linspace(0, 1, 4)
n_runs = 16
user_size = 500
items_size = 200
top_k = 10

for group_name in ["usage_group", "top_category"]:
    group_names = sorted({i[group_name] for i in user_groups})
    # predefined colors, one per group
    colors = sns.color_palette("husl", len(group_names))

    all_empirical_pairs, users_ids, u_utils = get_curves_user_groups(
        u_rel_matrx, user_size, items_size, n_runs, gamma_points, top_k, user_groups, group_name
    )
    gammas = all_empirical_pairs[0, :, 0]

    # For every draw, map each group to the row positions of its sampled users.
    # This does not depend on gamma, so it is built once per draw.
    group_of_user = {row["user_id"]: row[group_name] for row in user_groups}
    rows_by_group_per_draw = []
    for draw in range(n_runs):
        rows_by_group = defaultdict(list)
        for position, user_id in enumerate(users_ids[draw]):
            rows_by_group[group_of_user[user_id]].append(position)
        rows_by_group_per_draw.append(rows_by_group)

    # results[group][gamma index] -> mean / std over draws of the group's mean utility
    results = defaultdict(dict)
    for gamma_idx in range(len(gamma_points)):
        group_utils_mean = defaultdict(list)
        for draw, rows_by_group in enumerate(rows_by_group_per_draw):
            for group, rows in rows_by_group.items():
                group_utils_mean[group].append(np.take(u_utils[draw][gamma_idx], rows).mean())

        for group, utils in group_utils_mean.items():
            results[group][gamma_idx] = {"mean": np.mean(utils), "std": np.std(utils)}

    # The palette has to be set before the axes are created, because the axes
    # copy the colour cycle at creation time.
    sns.set_palette(colors)
    fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

    # sort results by group name
    results = dict(sorted(results.items(), key=lambda item: item[0]))
    for group, data in results.items():
        means = np.array([stats["mean"] for stats in data.values()])
        standard_err = np.array([stats["std"] for stats in data.values()]) / np.sqrt(n_runs)
        ax.plot(gammas, means, label=group.replace("_", " ").capitalize())
        ax.fill_between(gammas, means - standard_err, means + standard_err, alpha=0.1)

    ax.set_ylabel("Normalized consumer utility")
    ax.set_title(
        f"Consumer and producer utility tradeoff for retrieving k={top_k} items ({group_name.replace('group', '').replace('_', ' ')} group)"
    )
    ax.set_xlabel(r"Fraction of best min normalized item utility guaranteed, $\gamma^I$")

    # Average over users first (axis 2), then over draws (axis 0).
    mean_utils = np.array(u_utils).mean(axis=2)
    overall_mean = mean_utils.mean(axis=0)
    standard_err = np.std(mean_utils, axis=0) / np.sqrt(n_runs)
    ax.plot(gammas, overall_mean, color="black", label="Mean user utility", linestyle="--")
    ax.fill_between(
        gammas,
        overall_mean - standard_err,
        overall_mean + standard_err,
        alpha=0.1,
        color="black",
    )
    ax.legend()
    # NOTE(review): the ./media13/ directory must already exist.
    fig.savefig(
        f"./media13/tradeoff_curve_group_{group_name}_user_size_{user_size}_items_{items_size}_n_draws_{n_runs}_top_k_{top_k}.png"
    )

  0%|          | 0/16 [00:00<?, ?it/s]

-0.0
0.0
0.3333333333333333
pressed CTRL-C 1 times (5 times for forcing termination)


  0%|          | 0/16 [1:50:55<?, ?it/s]


SolverError: Solver 'SCIP' failed. Try another solver, or solve with verbose=True for more information.

In [12]:
# Scratch check: an element-wise maximum with 0 replaces negative entries by 0.
np.maximum([-2, 3, 4], 0)

array([0, 3, 4])

In [72]:
# Scratch experiment: relative loss between two independent uniform random vectors.
y_true = np.random.rand(100)
y_pred = np.random.rand(100)
# simulating lower values for y_pred
#y_pred = y_true - (y_true * y_pred_perc)

# Negative whenever y_pred > y_true, and very large in magnitude when y_true is close to 0.
loss = 1 - (y_pred / y_true)

In [77]:
# NOTE(review): `scores` is only assigned in the cell that follows, so this cell
# fails when the notebook is run top to bottom on a fresh kernel.
scores[0]

array([0.89730974, 0.70357554, 0.26809903, 0.72730359, 0.13076508,
       0.70792413, 0.94750268, 0.5143962 , 0.88289328, 0.02761776,
       0.41945139, 0.18216466, 0.77582289, 0.66236755, 0.89164294,
       0.83380971, 0.90720584, 0.0547312 , 0.47996534, 0.90683753])

In [None]:
# Scratch check of the "best k" computation: the 10 largest values of every row,
# in ascending order.
scores = np.random.rand(100, 20)
np.sort(scores, axis=1)[:, -10:]

array([[0.70792413, 0.72730359, 0.77582289, 0.83380971, 0.88289328,
        0.89164294, 0.89730974, 0.90683753, 0.90720584, 0.94750268],
       [0.57905123, 0.65348549, 0.6868211 , 0.7353118 , 0.77363888,
        0.81228213, 0.87349064, 0.92910574, 0.94386503, 0.9997092 ],
       [0.46680332, 0.51418012, 0.5196878 , 0.60762381, 0.67967453,
        0.84135418, 0.88176218, 0.88831414, 0.90976316, 0.96752888],
       [0.66428738, 0.70173547, 0.71452467, 0.71890579, 0.81261151,
        0.85674178, 0.86126242, 0.86693867, 0.90131506, 0.97455762],
       [0.58319024, 0.58840101, 0.73282446, 0.80413999, 0.81871806,
        0.86393164, 0.88307971, 0.93537992, 0.93966048, 0.99724182],
       [0.57248641, 0.64668639, 0.67626719, 0.76345527, 0.77198801,
        0.80196723, 0.86043194, 0.87795004, 0.92944529, 0.996604  ],
       [0.55182146, 0.59756124, 0.62684686, 0.66997404, 0.69363648,
        0.72092416, 0.77039904, 0.86444595, 0.924068  , 0.94908275],
       [0.64772937, 0.65189133, 0.7213048

In [73]:
# Display the values produced by the `loss = 1 - (y_pred / y_true)` scratch cell.
loss

array([ 4.18920095e-01,  7.16451658e-01,  1.88761250e-01, -5.25297012e-01,
       -2.72824139e-01,  3.60480233e-01,  8.92674031e-02,  5.66378489e-01,
        1.56261698e-01, -2.35147830e-01,  5.93343979e-01,  5.26936512e-01,
        4.51004344e-01,  9.94193768e-01,  9.70397674e-01, -1.53400126e-01,
        8.44482684e-01, -3.02487823e+00, -1.50524253e+02, -1.11937274e-01,
       -8.54622327e+00,  6.42202076e-02, -8.34062337e-01,  9.98163496e-01,
        2.58426320e-01,  2.85599192e-01,  1.28300252e-01, -2.70000710e-01,
        9.69528636e-01, -4.13062193e-01,  2.55103072e-01,  1.25425605e-01,
       -7.65948529e+00,  6.26429108e-01,  3.63310680e-01, -2.85780784e+00,
        6.38045686e-02,  2.39629473e-01, -2.12020010e+00, -6.60941038e-01,
        5.09010261e-01,  9.96803144e-01,  3.15865914e-01,  9.92622657e-01,
       -6.30872022e-01,  9.75550604e-01,  3.76501099e-01,  1.79765050e-02,
       -1.29826993e+00,  6.39363742e-01,  8.94106414e-01,  4.39396618e-01,
       -2.00562467e+00, -

In [None]:
rho =

rho + np.mean(loss - rho) * 0.05

np.float64(0.9738943879626398)