In [None]:
!pip install numpy cvxpy tqdm matplotlib seaborn mosek

In [1]:
import json
import os
from collections import defaultdict, Counter

import cvxpy as cp
import matplotlib.pyplot as plt
import numpy as np
import requests
import seaborn as sns
from tqdm import tqdm

  """


(CVXPY) Apr 14 09:18:34 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.12.4544). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Apr 14 09:18:34 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.12.4544). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')


In [2]:
# Seed NumPy's legacy global generator so np.random.* draws are repeatable.
np.random.seed(2)
# Apply seaborn's "whitegrid" style to the figures produced below.
sns.set_style("whitegrid")

In [2]:
# Download the two input files (predictions array and user-groups JSON) from Dropbox.
for url, filename in (
    (
        "https://www.dropbox.com/scl/fi/05woj2mttjwzl64q29x5u/amazon_predictions.npy?rlkey=mnxzgbos28wr9t47i7ezgayow&st=kghbwpn3&dl=1",
        "amazon_predictions.npy",
    ),
    (
        "https://www.dropbox.com/scl/fi/fyiu6dwf4b5959tjsp8jw/amazon_user_groups.json?rlkey=8irakhur6nf4kieex0se9vsgh&st=0mlarlkb&dl=1",
        "amazon_user_groups.json",
    ),
):
    # The timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors; otherwise an error page would be written to disk
    # and only surface later as a confusing np.load / json.load failure.
    resp.raise_for_status()
    with open(filename, "wb") as f:
        f.write(resp.content)

In [4]:
# Array stored in NumPy's .npy format by the download cell.
REL_MATRIX = np.load("amazon_predictions.npy")

# JSON document with the user-group records.
with open("amazon_user_groups.json", "r") as groups_file:
    GROUPS_MAP = json.loads(groups_file.read())

In [None]:
# --- problem size -----------------------------------------------------------
# how many consumers are sampled per run
N_CONSUMERS = 300
# how many producers are sampled per run
N_PRODUCERS = 100
# how many items each consumer is recommended
K_REC = 10
# how many independent runs are averaged
N_RUNS = 3

# --- experiment parameters ----------------------------------------------------
# alpha level used inside the CVaR objective
ALPHA = 0.7
# gamma values to sweep (consumer-producer trade-off)
GAMMA_POINTS = [0, 0.35, 0.7, 1]
# group attributes the experiment is run for,
# e.g. ["top_category", "usage_group"] to run both
GROUPS = ["usage_group"]

# --- solution layout (do not modify) -------------------------------------------
# positions used to pull the right attributes out of a solved problem
RHO_IDX = 0
WEIGHTS_IDX = 1

Choose the solver: SCIP or MOSEK. MOSEK is more powerful, but it requires a license.

In [6]:
# Solver handed to every `problem.solve(solver=SOLVER)` call in this notebook.
# To use MOSEK instead, uncomment the next line and comment out the SCIP one.
# SOLVER = cp.MOSEK
SOLVER = cp.SCIP

In [None]:
def sample_utility(m, n, sample_m, sample_n, W):
    """Return a random sub-matrix of ``W``.

    Draws ``sample_m`` distinct row indices from ``range(m)`` and ``sample_n``
    distinct column indices from ``range(n)`` (both without replacement, from a
    freshly created, unseeded generator) and returns the matching rows and
    columns of ``W``.
    """
    generator = np.random.default_rng()

    row_idx = generator.choice(m, size=sample_m, replace=False)
    col_idx = generator.choice(n, size=sample_n, replace=False)

    selected_rows = W[row_idx]
    return selected_rows[:, col_idx]


def sample_candidate_items(rel_matrix_users: np.ndarray, n_candidates: int) -> np.ndarray:
    """Restrict the matrix to columns that are a top item for at least one row.

    For every row the ``n_candidates`` highest-valued column indices are taken;
    the sorted union of those indices over all rows selects the returned
    columns. The result can therefore have more than ``n_candidates`` columns.
    """
    ranked_columns = np.argsort(rel_matrix_users, axis=1)
    candidate_columns = np.unique(ranked_columns[:, -n_candidates:])
    return rel_matrix_users[:, candidate_columns]


def _parse_groups_ids(consumer_ids, groups_map: list[dict[str, int]], group_key: str):
    """Parses given group key value to int for each consumer"""

    consumer_groups = [i[group_key] for i in groups_map if i["user_id"] in consumer_ids]
    all_groups = list(set(consumer_groups))

    return np.array([all_groups.index(i) for i in consumer_groups])

def sample_users_from_groups(
    users_count: int,
    items_count: int,
    groups_map: list[dict],  # assumed list of dicts, each with group_key and "user_id"
    group_key: str,
    data: np.ndarray,
    naive_sampling: bool = True,
) -> tuple[np.ndarray, list[int], np.ndarray]:
    """Sample users proportionally to group size and slice ``data`` for them.

    Args:
        users_count: target number of users to sample overall.
        items_count: number of item columns to sample (naive mode), or the
            per-user top-item count passed to ``sample_candidate_items``.
        groups_map: records that each contain ``"user_id"`` and ``group_key``.
        group_key: record field that defines the group of a user.
        data: matrix indexed as ``data[user_ids][:, item_ids]``.
        naive_sampling: if True, item columns are drawn uniformly at random;
            otherwise columns come from ``sample_candidate_items``.

    Returns:
        ``(sub_matrix, sampled_users, group_assignments)`` where the rows of
        ``sub_matrix`` follow ``sampled_users``.

    Raises:
        ValueError: if ``groups_map`` is empty.

    Notes:
        Every group keeps at least one user, so more than ``users_count`` users
        are returned when ``users_count`` is smaller than the number of groups.
        All draws use NumPy's legacy global generator, so ``np.random.seed``
        makes the whole sampling step reproducible.
    """
    if not groups_map:
        raise ValueError("groups_map is empty; there are no users to sample from")

    # Count how many times each group appears.
    group_freq = Counter(row[group_key] for row in groups_map)

    # Proportional share of users_count per group, never below one user.
    allocation = {
        group: max(round(users_count * count / len(groups_map)), 1)
        for group, count in group_freq.items()
    }

    # Rounding and the one-user floor can leave the total off target; correct it.
    surplus = sum(allocation.values()) - users_count
    if surplus > 0:
        # Trim the largest groups first, spilling over to the next one when a
        # group cannot absorb the whole surplus without dropping below one user.
        for group, _count in group_freq.most_common():
            removable = min(allocation[group] - 1, surplus)
            allocation[group] -= removable
            surplus -= removable
            if surplus == 0:
                break
    elif surplus < 0:
        # Too few allocated; give the shortfall to the most frequent group.
        biggest_group = max(group_freq.items(), key=lambda x: x[1])[0]
        allocation[biggest_group] -= surplus

    # Build a mapping from group key to all available user IDs.
    group_users = defaultdict(list)
    for row in groups_map:
        group_users[row[group_key]].append(row["user_id"])

    # Sample users from each group according to the final allocation.
    sampled_users = []
    for group, users in group_users.items():
        sampled_users.extend(
            np.random.choice(users, allocation[group], replace=False)
        )

    # Integer group label for each sampled user.
    group_assignments = _parse_groups_ids(sampled_users, groups_map, group_key)

    if naive_sampling:
        # Drawn from the same seeded global generator as the users above; a
        # fresh default_rng() here would make runs irreproducible.
        items = np.random.choice(data.shape[1], size=items_count, replace=False)
        return data[sampled_users][:, items], sampled_users, group_assignments

    # Candidate mode: keep the columns that are a top item for some sampled user.
    rel_matrix_sampled = data[sampled_users]
    top_items = sample_candidate_items(rel_matrix_sampled, items_count)
    return top_items, sampled_users, group_assignments


def compute_producer_optimal_solution(rel_matrix: np.ndarray, k_rec: int) -> cp.Problem:
    """Solve the max-min exposure problem and return the solved problem.

    A boolean matrix with the shape of ``rel_matrix`` is chosen so that every
    row has exactly ``k_rec`` ones while the smallest column total is as large
    as possible. Only the shape of ``rel_matrix`` is used.
    """
    assignment = cp.Variable(rel_matrix.shape, boolean=True)

    picks_per_row = cp.sum(assignment, axis=1)
    picks_per_column = cp.sum(assignment, axis=0)

    # the worst-off column should receive as many picks as possible
    objective = cp.Maximize(cp.min(picks_per_column))
    # every row recommends exactly k_rec items
    row_budget = [picks_per_row == k_rec]

    problem = cp.Problem(objective, row_budget)
    problem.solve(solver=SOLVER)
    return problem

def compute_consumer_optimal_solution(
    rel_matrix: np.ndarray,
    group_assignments: np.ndarray,
    k_rec: int,
    gamma: float,
    producer_max_min_utility,
    alpha: float = 0.95
) -> cp.Problem:
    """Build and solve the group-CVaR allocation problem for one gamma value.

    A boolean allocation matrix with the shape of ``rel_matrix`` is chosen such
    that every row has exactly ``k_rec`` ones and every column total is at least
    ``gamma * producer_max_min_utility``. The objective minimizes

        rho + sum(pos(loss_g - rho)) / ((1 - alpha) * num_groups)

    where ``loss_g`` is, for group ``g``, the mean over its rows of
    ``1 - allocated_value / greedy_value`` and ``greedy_value`` is the sum of
    the row's ``k_rec`` largest entries.

    Args:
        rel_matrix: value matrix; rows are grouped by ``group_assignments``.
        group_assignments: one group label per row of ``rel_matrix``.
        k_rec: number of ones required in every row.
        gamma: fraction of ``producer_max_min_utility`` guaranteed per column.
        producer_max_min_utility: reference value scaled by ``gamma``.
        alpha: CVaR level; ``alpha == 1`` makes the scaling factor divide by zero.

    Returns:
        The ``cp.Problem`` after ``solve`` has been called; the solver status is
        not checked here.

    NOTE(review): callers read the allocation through ``problem.variables()``
    by position (``WEIGHTS_IDX``), so the order in which ``rho`` and
    ``x_alloc`` enter the expressions below presumably must stay as it is.
    """
    # producer allocations
    x_alloc = cp.Variable(rel_matrix.shape, boolean=True)

    constraints = [
        # there should be k_rec producers allocated to consumer
        cp.sum(x_alloc, axis=1) == k_rec,
        # each producer should get at least gamma * optimal producer utility
        cp.sum(x_alloc, axis=0) >= gamma * producer_max_min_utility,
    ]

    # greedy producer allocations for consumer: sum of each row's k_rec largest values
    # (a row whose greedy value is 0 would make the normalization below divide by zero)
    greedy_allocations = np.sort(rel_matrix, axis=1)[:, -k_rec:].sum(axis=1)

    # precomputing values for later processing: one boolean row mask and size per group
    unique_groups, group_indices = np.unique(group_assignments, return_inverse=True)
    num_groups = len(unique_groups)
    group_masks = [group_indices == i for i in range(num_groups)]
    group_sizes = np.array([mask.sum() for mask in group_masks])


    # value each row receives under the allocation
    allocations = cp.sum(cp.multiply(rel_matrix, x_alloc), axis=1)
    # Build one normalized-loss expression per group (a Python loop over groups)
    normalized_losses = []
    for mask, size in zip(group_masks, group_sizes):
        group_alloc = allocations[mask]
        greedy_group_alloc = greedy_allocations[mask]

        # mean relative shortfall versus the greedy value within this group
        normalized_loss = cp.sum(1 - (group_alloc / greedy_group_alloc)) / size
        normalized_losses.append(normalized_loss)

    # CVaR over the per-group losses; rho is the auxiliary threshold variable
    rho = cp.Variable(nonneg=True)
    cvar_objective = rho + (1 / ((1 - alpha) * num_groups)) * cp.sum(cp.pos(cp.hstack(normalized_losses) - rho))

    # Define and solve the optimization problem
    problem = cp.Problem(cp.Minimize(cvar_objective), constraints)
    problem.solve(solver=SOLVER)

    return problem


def compute_consumer_producer_utils_per_gamma(
    rel_matrix: np.ndarray, k_rec: int, gamma_points: list[float], group_assignments: list[int], alpha: float
) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Solve the consumer problem for every gamma and collect the utilities.

    Args:
        rel_matrix: value matrix, one row per consumer and one column per producer.
        k_rec: number of producers recommended to each consumer.
        gamma_points: gamma values to evaluate, in output order.
        group_assignments: one group label per row of ``rel_matrix``.
        alpha: CVaR level forwarded to the consumer problem.

    Returns:
        ``(consumers_utils, producers_utils)``; each list has one array per
        gamma. Consumer utilities are normalized by the row's greedy value (sum
        of its ``k_rec`` largest entries); producer utilities are column totals
        of the allocation.

    Raises:
        RuntimeError: if either optimization yields no usable solution.
    """
    producer_problem = compute_producer_optimal_solution(rel_matrix, k_rec)
    producer_value = producer_problem.value
    if producer_value is None or not np.isfinite(producer_value):
        raise RuntimeError(
            f"Producer-optimal problem has no solution (status: {producer_problem.status})"
        )
    # The optimum is a minimum over sums of 0/1 variables, hence an integer.
    # Rounding removes solver float noise (e.g. 30.0000001) that could otherwise
    # tighten the `>= gamma * value` constraint by a whole unit.
    producer_max_min_utility = int(np.rint(producer_value))

    greedy_allocations_per_consumer = np.sort(rel_matrix, axis=1)[:, -k_rec:].sum(axis=1)

    consumers_utils = []
    producers_utils = []

    for gamma in gamma_points:
        print("Computing allocations for gamma:", gamma)
        consumer_problem = compute_consumer_optimal_solution(
            rel_matrix,
            group_assignments,
            k_rec,
            gamma,
            producer_max_min_utility,
            alpha=alpha,
        )
        producer_assignments = consumer_problem.variables()[WEIGHTS_IDX].value
        if producer_assignments is None:
            # Without this check the multiplication below fails with an
            # unrelated-looking TypeError.
            raise RuntimeError(
                f"Consumer problem has no solution for gamma={gamma} "
                f"(status: {consumer_problem.status})"
            )

        consumers_utils.append(
            (rel_matrix * producer_assignments).sum(axis=1) / greedy_allocations_per_consumer
        )
        producers_utils.append(producer_assignments.sum(axis=0))

    return consumers_utils, producers_utils


def compute_consumer_producer_utils_per_gamma_for_groups(
    rel_matrix: np.ndarray,
    n_consumers: int,
    n_producers: int,
    k_rec: int,
    n_runs: int,
    gamma_points: list[float],
    groups_map: list[dict[str, int]],
    group_key: str,
    alpha: float,
    naive_sampling: bool = True,
) -> tuple[list[list[int]], list[list[np.ndarray]]]:
    """Repeat the sampling + gamma sweep ``n_runs`` times.

    Each run samples consumers (stratified by ``group_key``) and producers from
    ``rel_matrix`` and evaluates every value in ``gamma_points``.

    Returns:
        ``(consumers_ids, consumers_utils)``: per run, the sampled user ids and
        the list of normalized consumer-utility arrays (one per gamma).
        Producer utilities are computed but not returned.
    """
    consumers_ids = []
    consumers_utils = []
    for _ in tqdm(range(n_runs)):
        rel_matrix_sampled, consumer_ids_sampled, group_assignments = sample_users_from_groups(
            n_consumers, n_producers, groups_map, group_key, rel_matrix, naive_sampling
        )
        consumer_utils_run, _producer_utils_run = compute_consumer_producer_utils_per_gamma(
            rel_matrix_sampled, k_rec, gamma_points, group_assignments, alpha
        )

        consumers_ids.append(consumer_ids_sampled)
        consumers_utils.append(consumer_utils_run)

    return consumers_ids, consumers_utils

In [None]:
def parse_results(
    consumers_ids: list[list[int]],
    consumers_utils: list[list[np.ndarray]],
    n_runs: int,
    gamma_points: list[float],
    group_key: str,
    groups_map=None,
):
    """Aggregate per-consumer utilities into per-group and overall means.

    Args:
        consumers_ids: per run, the user ids in the same order as the utilities.
        consumers_utils: per run and per gamma, an array with one utility per
            consumer.
        n_runs: number of runs to aggregate.
        gamma_points: gamma values, aligned with the second level of
            ``consumers_utils``.
        group_key: record field that defines the group of a user.
        groups_map: records with ``"user_id"`` and ``group_key``; defaults to
            the module-level ``GROUPS_MAP``.

    Returns:
        ``(results, consumer_utils_mean)`` where ``results[group][gamma]`` is a
        list with the group's mean utility for each run, and
        ``consumer_utils_mean`` holds one value per gamma (mean over consumers,
        then over runs).

    Raises:
        KeyError: if a consumer id has no record in ``groups_map``.
    """
    if groups_map is None:
        groups_map = GROUPS_MAP

    group_by_user = {row["user_id"]: row[group_key] for row in groups_map}

    # Groups are looked up in consumer order so that position i refers to the
    # same consumer as position i of the utility arrays. Computed once per run
    # instead of once per (group, gamma, run).
    groups_per_run = [
        np.array([group_by_user[user_id] for user_id in consumers_ids[run_id]], dtype=object)
        for run_id in range(n_runs)
    ]

    # Groups present in the first run, in first-seen order.
    distinct_groups = list(dict.fromkeys(groups_per_run[0].tolist()))
    results = defaultdict(lambda: defaultdict(list))

    for group in distinct_groups:
        for gamma_id, gamma in enumerate(gamma_points):
            for run_id in range(n_runs):
                member_idx = np.where(groups_per_run[run_id] == group)[0]
                run_utils = consumers_utils[run_id][gamma_id]
                results[group][gamma].append(run_utils[member_idx].mean())

    consumer_utils_mean = np.mean(consumers_utils, axis=2).mean(axis=0)

    return results, consumer_utils_mean

In [None]:
def plot_groups_results(results: dict, consumer_utils_mean: np.ndarray, groups_key, group_names, save_path: str) -> None:
    """Plot per-group consumer utility against gamma and save the figure.

    Args:
        results: ``results[group][gamma]`` is a list of per-run mean utilities.
        consumer_utils_mean: overall mean utility, one value per entry of the
            module-level ``GAMMA_POINTS``.
        groups_key: name of the grouping attribute (used in title and file name).
        group_names: all group names; only its length is used, to size the palette.
        save_path: output directory; created if it does not exist.
    """
    plt.figure(figsize=(10,6), dpi=300)
    sns.set_palette(sns.color_palette("husl", len(group_names)))

    # Sort by group name so line order and colours are stable between runs.
    for group_name, runs_per_gamma in sorted(results.items(), key=lambda item: item[0]):
        gammas = list(runs_per_gamma.keys())
        means = np.array([np.mean(runs) for runs in runs_per_gamma.values()])
        # Standard error over the runs actually present for this gamma.
        std_errs = np.array(
            [np.std(runs) / np.sqrt(len(runs)) for runs in runs_per_gamma.values()]
        )

        plt.plot(gammas, means, label=f"{group_name.capitalize()}")
        plt.fill_between(gammas, means - std_errs, means + std_errs, alpha=0.1)

    # Built outside the f-string: reusing the same quote type inside an f-string
    # expression is a SyntaxError before Python 3.12.
    group_label = groups_key.replace("group", "").replace("_", " ").strip()

    plt.ylabel('Normalized consumer utility')
    plt.title(f"Consumer and producer utility tradeoff for retrieving k={K_REC} items ({group_label} group)")
    plt.xlabel(r'Fraction of best min normalized item utility guaranteed, $\gamma^I$')
    plt.plot(GAMMA_POINTS, consumer_utils_mean, color="black", label="Mean user utility", linestyle="--")
    plt.legend()

    # savefig does not create missing directories.
    os.makedirs(save_path, exist_ok=True)
    plt.savefig(f"{save_path}/tradeoff_curve_group_{groups_key}_n_consumers_{N_CONSUMERS}_n_producers_{N_PRODUCERS}_n_runs_{N_RUNS}_k_rec_{K_REC}_alpha_{ALPHA}.png")

In [None]:
# Run the whole pipeline (sample -> optimize -> aggregate -> plot) once per
# configured grouping attribute.
for group in GROUPS:
    # All distinct values of this attribute, sorted.
    group_names = sorted({row[group] for row in GROUPS_MAP})

    # Sampling and optimization for every run and gamma value.
    consumers_ids, consumers_utils = compute_consumer_producer_utils_per_gamma_for_groups(
        rel_matrix=REL_MATRIX,
        groups_map=GROUPS_MAP,
        group_key=group,
        n_consumers=N_CONSUMERS,
        n_producers=N_PRODUCERS,
        k_rec=K_REC,
        n_runs=N_RUNS,
        gamma_points=GAMMA_POINTS,
        alpha=ALPHA,
    )

    # Per-group and overall means, then the trade-off figure.
    results, consumer_utils_mean = parse_results(
        consumers_ids=consumers_ids,
        consumers_utils=consumers_utils,
        n_runs=N_RUNS,
        gamma_points=GAMMA_POINTS,
        group_key=group,
    )
    plot_groups_results(
        results=results,
        consumer_utils_mean=consumer_utils_mean,
        groups_key=group,
        group_names=group_names,
        save_path="./media23",
    )