<hr>

# Trying out user-user collaborative filtering

In [None]:
"""Pseudocode
* Create customer profiles for all customers in the (sampled) dataset
    * i.e. each customer ID has a vector r_ID whose elements represent the items purchased
* Compute the Jaccard similarity between all pairs of r_IDs, independent of position
* For a given customer x, choose the k customers closest to x
* For an article i, whether or not to recommend it is based on the recommendation score
    r(x, i) = mean( [rel(y, i) for y in top k] )
"""
from typing import Tuple
import numpy as np, pandas as pd
import itertools

def position_indep_jaccard(x: list | set, y: list | set) -> float:
    # Position-independent jaccard-similarity
    x, y = set(x), set(y)
    return len(x.intersection(y)) / len(x.union(y))



def find_customer_similarity(
    df_customer: pd.DataFrame, df_transactions: pd.DataFrame
) -> Tuple[pd.DataFrame, dict]:
    """Build the customer-customer Jaccard similarity matrix.

    Customers listed in ``df_customer`` that have no rows in ``df_transactions``
    are left out of both return values.

    Args:
        df_customer (pd.DataFrame): Frame with a ``customer_id`` column.
        df_transactions (pd.DataFrame): Frame with ``customer_id`` and
            ``article_id`` columns, one row per purchase.

    Returns:
        Tuple[pd.DataFrame, dict]: The square similarity matrix (index and
            columns are customer IDs, in ``df_customer`` order) and a dictionary
            ``{customer_id: [article_id, ...]}`` of each retained customer's
            purchases.
    """
    # Group the transactions once instead of filtering the full frame per customer.
    purchases = (
        df_transactions.groupby("customer_id")["article_id"].apply(list).to_dict()
    )
    # Keep df_customer ordering and drop customers without purchase history.
    articles_dict = {}
    for cust_ID in df_customer["customer_id"]:
        if cust_ID in purchases:
            articles_dict[cust_ID] = purchases[cust_ID]

    customer_ids = list(articles_dict)
    # Size the matrix by the customers actually kept; sizing it by
    # len(df_customer) no longer matches the labels once a customer is dropped.
    num_customers = len(customer_ids)
    print(f"{num_customers = }")

    # Convert every purchase list to a set once, not once per pair.
    article_sets = [set(articles_dict[cust]) for cust in customer_ids]
    similarity_matrix = np.zeros((num_customers, num_customers))
    for r in range(num_customers):
        # Every retained customer has at least one purchase, so self-similarity is 1.
        similarity_matrix[r, r] = 1.0
        for c in range(r + 1, num_customers):
            first, second = article_sets[r], article_sets[c]
            # Position-independent Jaccard similarity; the union is never empty here.
            sim = len(first & second) / len(first | second)
            # Jaccard is symmetric, so fill both triangles from one computation.
            similarity_matrix[r, c] = sim
            similarity_matrix[c, r] = sim

    return (
        pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids),
        articles_dict,
    )


def get_recommendation(
    similarity_matrix: pd.DataFrame,
    articles_dict: dict,
    customer_ID: str,
    article_ID: int,
    k: int,
) -> float:
    """Produce recommendation score of an item based on its k closest customer behaviors

    The score is the fraction of the selected neighbours that have purchased the
    article. The neighbours are the top-k entries of the customer's similarity
    column, so the customer itself is normally among them.

    Args:
        similarity_matrix (pd.DataFrame): nxn matrix of similarities between customers
        articles_dict (dict): Dictionary of customer purchases on form {customer_id: [item1, item2, ...]}
        customer_ID (str): The customer the recommendation score is based on
        article_ID (int): The article the score is based on
        k (int): How many (closest) customer-neighbors to include in computation.
            If fewer than k customers exist, all of them are used.

    Returns:
        float: Measure of how well the item would fit the customer in question, between [0,1]

    Raises:
        ValueError: If k is not positive.
        KeyError: If customer_ID is not a column of similarity_matrix.
    """
    if k <= 0:
        # k == 0 used to divide by zero; a negative k sliced off the tail and
        # produced negative scores.
        raise ValueError(f"k must be a positive integer, got {k}")
    # The k most similar customers IDs:
    closest_customers = (
        similarity_matrix[customer_ID].sort_values(ascending=False).iloc[:k].index
    )
    if len(closest_customers) == 0:
        return 0.0
    buyers = sum(article_ID in articles_dict[cust] for cust in closest_customers)
    # Average over the neighbours actually found so the score is a true mean
    # even when fewer than k customers are available.
    return buyers / len(closest_customers)

In [None]:
# Read the three sample CSVs (customers, transactions, articles) in one pass,
# then build the customer similarity matrix and the purchase dictionary.
df_cust, df_tr, df_art = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset_sample/customer_min.csv",
        "dataset_sample/transactions_min.csv",
        "dataset_sample/articles_min.csv",
    )
)
sim_matr, art_dict = find_customer_similarity(df_cust, df_tr)

In [None]:
def get_n_recommendations(
    n: int,
    similarity_matrix: pd.DataFrame,
    articles_dict: dict,
    customer_ID: str,
    k: int,
    ignore_purchased: bool = True,
) -> list:
    """Get the n 'best' recommended items for a specific customer ID

    Every candidate article is scored with ``get_recommendation`` and the n
    highest-scoring ones are returned.

    Args:
        n (int): How many items to recommend
        similarity_matrix (pd.DataFrame): Customer similarity matrix
        articles_dict (dict): Dictionary of customer purchases on form {customer_id: [item1, item2, ...]}
        customer_ID (str): The customer to produce recommendations for
        k (int): How many (closest) customer-neighbors each score is based on
        ignore_purchased (bool, optional): If True, articles the customer has
            already bought are not considered. Defaults to True.

    Returns:
        list: Up to n ``(article_id, score)`` tuples, ordered by descending score.
            The scores are kept alongside the IDs for debugging purposes.
    """
    # ignore_purchased to ignore those articles customer has already bought
    blacklisted_articles = (
        set(articles_dict[customer_ID]) if ignore_purchased else set()
    )
    # Candidates: every article bought by anyone, minus the blacklist.
    art_IDs = (
        set(itertools.chain.from_iterable(articles_dict.values()))
        - blacklisted_articles
    )
    score_dict = {
        art_ID: get_recommendation(
            similarity_matrix, articles_dict, customer_ID, art_ID, k
        )
        for art_ID in art_IDs
    }
    # Highest score first; sorted() is stable, so ties keep their original order.
    ranked_items = sorted(score_dict.items(), key=lambda el: el[1], reverse=True)
    return list(itertools.islice(ranked_items, n))

In [None]:
# Top-5 recommendations for one sample customer, scored against its 5 most
# similar customers. ignore_purchased is left at its default (True), so
# articles this customer already bought are excluded.
get_n_recommendations(
    n=5,
    similarity_matrix=sim_matr,
    articles_dict=art_dict,
    customer_ID="008068b49b6bdd622ed406e30c8603270770174ebf300dbac0f5beac522921e0",
    k=5,
)

In [None]:

# Recommendation score of a single article for the same sample customer,
# based on its k=5 most similar customers.
get_recommendation(
    similarity_matrix=sim_matr,
    articles_dict=art_dict,
    customer_ID='008068b49b6bdd622ed406e30c8603270770174ebf300dbac0f5beac522921e0',
    article_ID=556255001,
    k=5
)


In this case, two of the $k=5$ closest customers (the customer itself counts as one of them) have bought the article in question. Thus we get a score of $\frac{2}{5}=0.4$.

In [2]:
## Playing around with better methods
import numpy as np, pandas as pd

# Load only the two columns needed; article_id is read as a string.
transactions = pd.read_csv("dataset/transactions_train.csv", dtype={"article_id": str}, usecols=["customer_id", "article_id"])
transactions['rating'] = 1 # Implicit feedback: every recorded transaction counts as a rating of 1


In [5]:
# Small preview slice (the first rows returned by head()) for quick experiments.
transactions_test = transactions.head()
transactions_test
# Earlier experiment, kept for reference (pairwise correlation distance via scipy):
# from scipy.spatial.distance import pdist, squareform
# squareform(pdist(transactions_test, metric='correlation'))

Unnamed: 0,customer_id,article_id,rating
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,1
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,1
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,1
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,1
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,1


In [47]:
# Customer-customer correlation computed from the first 100 transactions.
# The pivot has one row per customer and one column per article; pairs that
# never occur are filled with 0 so that every customer has a complete vector.
similarity_matrix = (
    transactions.head(100)
    .pivot_table(values="rating", index="customer_id", columns="article_id")
    .fillna(0)
    .transpose()  # corr() correlates columns, so customers must be the columns
    .corr()
)
similarity_matrix

customer_id,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,001127bffdda108579e6cb16080440e89bf1250a776c6e55f56e35e9ee029a8d,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8adc73323aebf571fd8,001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61,0021da829b898f82269fc51feded4eac2129058ee95bd75bb1591e2eb14ecc79,00228762ecff5b8d1ea6a2e52b96dafa198febddbc3bf350eb611f28e61ea6ce,...,0051a1d00ec9a4ff1a2f9aa32a4bb054cc9ba08e80be590aec39391ff3dc56e8,005c9fb2ba6c49b2098a662f64a9124ef95cbec5fcf4ebdb4dcbaaf83f979c51,00609a1cc562140fa87a6de432bef9c9f0b936b259ad3075eb2a65008df1dbab,00708c3da4d07706d4cad77c6aecc1b1ce33d21d73022ca12737c91d85bff070,0074c5948b6c96e7522f6f5c034b46cf08875b9a81c557a02bda0aedac355547,00761aefe07a2dd6ca110c99f3856ede55f20ff00ce754bf24c422c6746d05f3,0077ba345ef6aa8781802107df25bbb8a14fdcd04130c3ffa5cd1ac4087ad8ab,00796ce0bc561897e7047a7b059867aa6424f63ec597e1fbb3a195e48d5f452b,007e4c2e0994b4aac97c2713d5dc0b7497d74113f32446868ca22b6437e9594d,007ea762fbbebfad288a49f1d8f7c2c0c62b189a8816fa6968e60f2c197ce938
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,1.0,-0.040811,-0.040811,-0.025316,-0.075853,-0.017789,-0.025316,-0.025316,-0.036265,-0.017789,...,-0.017789,-0.036265,-0.040811,-0.031204,-0.017789,-0.031204,-0.017789,-0.017789,-0.017789,-0.017789
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,-0.040811,1.0,-0.065789,-0.040811,-0.122279,-0.028677,-0.040811,-0.040811,-0.058461,-0.028677,...,-0.028677,-0.058461,-0.065789,-0.050303,-0.028677,-0.050303,-0.028677,-0.028677,-0.028677,-0.028677
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,-0.040811,-0.065789,1.0,-0.040811,-0.122279,-0.028677,-0.040811,-0.040811,-0.058461,-0.028677,...,-0.028677,-0.058461,-0.065789,-0.050303,-0.028677,-0.050303,-0.028677,-0.028677,-0.028677,-0.028677
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,-0.025316,-0.040811,-0.040811,1.0,-0.075853,-0.017789,-0.025316,-0.025316,-0.036265,-0.017789,...,-0.017789,-0.036265,-0.040811,-0.031204,-0.017789,-0.031204,-0.017789,-0.017789,-0.017789,-0.017789
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,-0.075853,-0.122279,-0.122279,-0.075853,1.0,-0.0533,-0.075853,-0.075853,-0.108657,-0.0533,...,-0.0533,-0.108657,-0.122279,-0.093495,-0.0533,-0.093495,-0.0533,-0.0533,-0.0533,-0.0533
001127bffdda108579e6cb16080440e89bf1250a776c6e55f56e35e9ee029a8d,-0.017789,-0.028677,-0.028677,-0.017789,-0.0533,1.0,-0.017789,-0.017789,-0.025482,-0.0125,...,-0.0125,-0.025482,-0.028677,-0.021926,-0.0125,-0.021926,-0.0125,-0.0125,-0.0125,-0.0125
001ea4e9c54f7e9c88811260d954edc059d596147e1cf8adc73323aebf571fd8,-0.025316,-0.040811,-0.040811,-0.025316,-0.075853,-0.017789,1.0,-0.025316,-0.036265,-0.017789,...,-0.017789,-0.036265,-0.040811,-0.031204,-0.017789,-0.031204,-0.017789,-0.017789,-0.017789,-0.017789
001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61,-0.025316,-0.040811,-0.040811,-0.025316,-0.075853,-0.017789,-0.025316,1.0,-0.036265,-0.017789,...,-0.017789,-0.036265,-0.040811,-0.031204,-0.017789,-0.031204,-0.017789,-0.017789,-0.017789,-0.017789
0021da829b898f82269fc51feded4eac2129058ee95bd75bb1591e2eb14ecc79,-0.036265,-0.058461,-0.058461,-0.036265,-0.108657,-0.025482,-0.036265,-0.036265,1.0,-0.025482,...,-0.025482,-0.051948,-0.058461,-0.044699,-0.025482,-0.044699,-0.025482,-0.025482,-0.025482,-0.025482
00228762ecff5b8d1ea6a2e52b96dafa198febddbc3bf350eb611f28e61ea6ce,-0.017789,-0.028677,-0.028677,-0.017789,-0.0533,-0.0125,-0.017789,-0.017789,-0.025482,1.0,...,-0.0125,-0.025482,-0.028677,-0.021926,-0.0125,-0.021926,-0.0125,-0.0125,-0.0125,-0.0125


In [38]:
import numpy as np

# Synthetic toy interactions: random (customer, article) pairs, each with rating 1.
samples = 100
# Fixed seed so that "Restart & Run All" reproduces the same toy frame.
rng = np.random.default_rng(42)
ratings_test = pd.DataFrame(
    {
        # The upper bound is exclusive: code points 65..89, i.e. letters "A".."Y".
        "customer_id": [chr(code) for code in rng.integers(65, 90, size=samples)],
        # The upper bound is exclusive: article ids 1..9.
        "article_id": rng.integers(1, 10, size=samples),
    }
)
ratings_test["rating"] = 1
ratings_test

Unnamed: 0,customer_id,article_id,rating
0,V,3,1
1,U,7,1
2,L,7,1
3,G,6,1
4,P,7,1
...,...,...,...
95,A,3,1
96,T,6,1
97,L,3,1
98,M,1,1


In [62]:
# NOTE(review): pdist / squareform are imported here but not used in this cell.
from scipy.spatial.distance import pdist, squareform
# Group by (customer_id, article_id). Only the two key columns are selected, so
# count() has no value columns left and the result shows just the distinct
# pairs present in ratings_test.
ratings_test[["customer_id", "article_id"]].groupby(["customer_id", "article_id"]).count()

customer_id,article_id
A,3
A,8
B,6
B,9
C,7
...,...
Y,2
Y,3
Y,4
Y,5
