<hr>

# Trying out user-user collaborative filtering

In [None]:
"""Psudeocode
* Create customer profiles for all customers in (sampled) dataset
    * i.e. each customer ID has a vector r_ID whose elements represent items purchased
* Compute Jaccard similarity between all r_IDs, independent of position
* For a given customer x, choose the k customers closest to x
* For an article i, whether or not to recommend it is based on the recommendation score
    r(x, i) = mean( [rel(y, i) for y in top k] ), where rel(y, i) is 1 if customer y bought i, else 0
"""


def position_indep_jaccard(x: list | set, y: list | set) -> float:
    # Position-independent jaccard-similarity
    x, y = set(x), set(y)
    return len(x.intersection(y)) / len(x.union(y))



def find_customer_similarity(
    df_customer: pd.DataFrame, df_transactions: pd.DataFrame
) -> Tuple[pd.DataFrame, dict]:
    """Build per-customer purchase lists and their pairwise Jaccard similarity.

    Customers listed in ``df_customer`` that have no rows in
    ``df_transactions`` are left out of both returned objects.

    Args:
        df_customer (pd.DataFrame): Table with a "customer_id" column.
        df_transactions (pd.DataFrame): Table with "customer_id" and
            "article_id" columns; each row contributes one article to the
            purchase list of its customer.

    Returns:
        Tuple[pd.DataFrame, dict]: The square similarity matrix, indexed and
        labelled by customer ID on both axes, and the dictionary
        {customer_id: [article_id, ...]} it was computed from.
    """
    # Group purchases per customer in a single pass over the transactions
    # instead of re-filtering the whole table once per customer.
    purchases = {}
    for cust_id, art_id in zip(
        df_transactions["customer_id"].to_list(),
        df_transactions["article_id"].to_list(),
    ):
        purchases.setdefault(cust_id, []).append(art_id)

    # Keep the customer order of df_customer; customers without purchase
    # history are skipped.
    articles_dict = {
        cust_id: purchases[cust_id]
        for cust_id in df_customer["customer_id"]
        if cust_id in purchases
    }

    customer_ids = list(articles_dict)
    # Size the matrix by the customers actually kept. Using len(df_customer)
    # makes the DataFrame construction fail as soon as one customer is dropped
    # (or duplicated), because the matrix no longer matches the index.
    num_customers = len(customer_ids)
    print(f"{num_customers = }")
    similarity_matrix = np.zeros((num_customers, num_customers))
    for r, cust in enumerate(customer_ids):
        # Jaccard similarity is symmetric: fill the upper triangle and mirror it.
        for c, second in enumerate(customer_ids[r:], start=r):
            sim = position_indep_jaccard(articles_dict[cust], articles_dict[second])
            similarity_matrix[r, c] = sim
            similarity_matrix[c, r] = sim

    return (
        pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids),
        articles_dict,
    )


def get_recommendation(
    similarity_matrix: pd.DataFrame,
    articles_dict: dict,
    customer_ID: str,
    article_ID: int,
    k: int,
) -> float:
    """Produce recommendation score of an item based on its k closest customer behaviors

    The score is the fraction of the (up to) k customers most similar to
    ``customer_ID`` whose purchase list contains ``article_ID``.

    Args:
        similarity_matrix (pd.DataFrame): nxn matrix of similarities between customers
        articles_dict (dict): Dictionary of customer purchases on form {customer_id: [item1, item2, ...]}
        customer_ID (str): The customer the recommendation score is based on;
            must be a column label of similarity_matrix
        article_ID (int): The article the score is based on
        k (int): How many (closest) customer-neighbors to include in computation.
            If fewer than k customers exist, all of them are used.

    Returns:
        float: Measure of how well the item would fit the customer in question, between [0,1]

    Raises:
        ValueError: If k is smaller than 1.
        KeyError: If customer_ID is not a column of similarity_matrix.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    # The k most similar customers IDs (explicitly positional slice):
    closest_customers = (
        similarity_matrix[customer_ID].sort_values(ascending=False).iloc[:k].index
    )
    num_buyers = sum(article_ID in articles_dict[cust] for cust in closest_customers)
    # Divide by the neighbours actually found so the score stays a true mean
    # when the matrix holds fewer than k customers.
    return num_buyers / len(closest_customers)

In [None]:
# Read the sample customer, transaction and article tables (in that order),
# then derive the customer similarity matrix and the purchase dictionary.
df_cust, df_tr, df_art = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset_sample/customer_min.csv",
        "dataset_sample/transactions_min.csv",
        "dataset_sample/articles_min.csv",
    )
)
sim_matr, art_dict = find_customer_similarity(
    df_customer=df_cust, df_transactions=df_tr
)

In [None]:
def get_n_recommendations(
    n: int,
    similarity_matrix: pd.DataFrame,
    articles_dict: dict,
    customer_ID: str,
    k: int,
    ignore_purchased: bool = True,
    *,
    with_scores: bool = True,
) -> list:
    """Get the n 'best' recommended items for a specific customer ID

    Every article appearing in any purchase list of articles_dict is scored
    with get_recommendation, and the n highest-scoring ones are returned in
    descending score order.

    Args:
        n (int): How many items to recommend
        similarity_matrix (pd.DataFrame): Customer similarity matrix
        articles_dict (dict): Dictionary of customer purchases on form {customer_id: [item1, item2, ...]}
        customer_ID (str): The customer to produce recommendations for; must be a key of articles_dict
        k (int): How many (closest) customer-neighbors each score is based on
        ignore_purchased (bool, optional): Leave out articles the customer has already bought. Defaults to True.
        with_scores (bool, optional): Return (article_id, score) pairs, which is
            handy for debugging; if False, return only the article IDs. Defaults to True.

    Returns:
        list: At most n entries, best first: (article_id, score) tuples when
        with_scores is True, otherwise bare article IDs.
    """
    # Articles the customer already owns are excluded from the candidates
    # when ignore_purchased is set.
    blacklisted_articles = (
        set(articles_dict[customer_ID]) if ignore_purchased else set()
    )
    art_IDs = (
        set(itertools.chain.from_iterable(articles_dict.values()))
        - blacklisted_articles
    )
    score_dict = {
        art_ID: get_recommendation(
            similarity_matrix, articles_dict, customer_ID, art_ID, k
        )
        for art_ID in art_IDs
    }
    ranked = sorted(score_dict.items(), key=lambda el: el[1], reverse=True)
    top_n = list(itertools.islice(ranked, n))
    if with_scores:
        return top_n
    return [art_ID for art_ID, _ in top_n]

In [None]:
# Demo: the 5 highest-scoring (article_id, score) pairs for one sample
# customer, each score based on the 5 most similar customers.
get_n_recommendations(
    n=5,
    similarity_matrix=sim_matr,
    articles_dict=art_dict,
    customer_ID="008068b49b6bdd622ed406e30c8603270770174ebf300dbac0f5beac522921e0",
    k=5,
)

In [None]:

# Demo: recommendation score of article 556255001 for the same sample
# customer, based on the 5 most similar customers.
get_recommendation(
    similarity_matrix=sim_matr,
    articles_dict=art_dict,
    customer_ID='008068b49b6bdd622ed406e30c8603270770174ebf300dbac0f5beac522921e0',
    article_ID=556255001,
    k=5
)


In this case, two of the $k$ closest customers (including the customer itself) have bought the article in question. Thus we get a score of $\frac{2}{5} = 0.4$.