# Data walking through

In [1]:
import pandas as pd 
import numpy as np

# read csv and assign columns
columns = ["personId", "productId"]
df = pd.read_csv("dataset/ratebeer/wish.csv", header=None)
df.columns = columns

number_of_row = len(df)
# get number of products
number_of_product = len(np.unique(df["productId"]))
# get number of person
number_of_person = len(np.unique(df["personId"]))
df = df.groupby("personId").aggregate(lambda x: list(np.unique(x)))

In [2]:
print("Number of row:", number_of_row)
print("Number of person:", number_of_person)
print("Number of product:", number_of_product)
# first samples
df.head()

Number of row: 118453
Number of person: 1953
Number of product: 21654


Unnamed: 0_level_0,productId
personId,Unnamed: 1_level_1
39,"[464, 562, 736, 739, 740, 1147, 1360, 1618, 21..."
63,"[36407, 36624, 53647]"
69,"[2442, 2491, 3401, 6924, 9193, 13730, 15017, 1..."
154,"[565, 1432, 3213, 7686, 10691, 14286, 15017, 1..."
181,"[4108, 7183, 7979, 10524, 11008, 14082, 19445,..."


In [3]:
# 'transactions' is now temporary variable
transactions = [row["productId"] for index, row in df.iterrows()]

In [4]:
from mlxtend.preprocessing import TransactionEncoder

transaction_encoder = TransactionEncoder()
wish_matrix = transaction_encoder.fit_transform(transactions).astype("int")
wish_df = pd.DataFrame(wish_matrix, columns=transaction_encoder.columns_)

# index is actually personId
wish_df.index = df.index 

In [5]:
wish_df.head()

Unnamed: 0_level_0,1,2,3,9,10,13,14,15,16,17,...,390987,391005,391163,391340,391802,392092,392165,392402,392478,393042
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
np.unique(wish_df.sum(), return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  91,  92,
         93,  94,  95,  96,  98,  99, 100, 101, 102, 103, 105, 106, 107,
        109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
        122, 123, 124, 125, 126, 128, 131, 132, 134, 135, 136, 137, 140,
        141, 142, 143, 144, 145, 146, 148, 149, 151, 153, 154, 158, 160,
        161, 164, 165, 167, 169, 170, 173, 174, 175, 178, 180, 183, 184,
        185, 188, 189, 194, 197, 199, 201, 202, 203, 205, 207, 210, 221,
        228, 231, 233, 240, 242, 246, 248, 254, 300

# Erase useless samples, which are so sparse 

In [7]:
print("Number of person having wish > 1:", len(wish_df.sum(axis=1) > 1))
new_df = wish_df.loc[wish_df.sum(axis=1) > 1]
new_df = new_df.loc[:, wish_df.sum() > 0]

Number of person having wish > 1: 1953


In [8]:
print("Training dataset shape:", new_df.shape)
new_df.head()

Training dataset shape: (1840, 21654)


Unnamed: 0_level_0,1,2,3,9,10,13,14,15,16,17,...,390987,391005,391163,391340,391802,392092,392165,392402,392478,393042
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
person_to_products = df.loc[new_df.index]

# Transaction-based wish prediction using apriori

In [10]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

frequent_itemsets = apriori(new_df, min_support=0.05)
rules = association_rules(frequent_itemsets, metric="lift")
# Something FAILED
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(776),(6441),0.111413,0.230978,0.051630,0.463415,2.006313,0.025896,1.433177
1,(6441),(776),0.230978,0.111413,0.051630,0.223529,2.006313,0.025896,1.144392
2,(5202),(1886),0.195109,0.129891,0.067935,0.348189,2.680621,0.042592,1.334910
3,(1886),(5202),0.129891,0.195109,0.067935,0.523013,2.680621,0.042592,1.687448
4,(6441),(2446),0.230978,0.129891,0.064130,0.277647,2.137534,0.034128,1.204548
...,...,...,...,...,...,...,...,...,...
173,(8766),(8324),0.113587,0.125000,0.053261,0.468900,3.751196,0.039062,1.647523
174,(15290),(14492),0.063587,0.107065,0.051630,0.811966,7.583843,0.044822,4.748790
175,(14492),(15290),0.107065,0.063587,0.051630,0.482234,7.583843,0.044822,1.808562
176,(15498),(14492),0.109783,0.107065,0.082065,0.747525,6.981957,0.070311,3.536722


In [11]:
len(rules)

178

In [12]:
print("Number of antecedents inferred:", len(np.unique(rules["antecedents"])))
print("Ratio of antecedents / well-known products:", len(np.unique(rules["antecedents"])) / number_of_product)

Number of antecedents inferred: 164
Ratio of antecedents / well-known products: 0.0075736584464764015


# User-based prediction using cosine similarity

# Evaluation metrics: All but one
## With K-Folds Cross-Validation

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold

n_splits, mean_acc = 10, 0
kf = KFold(n_splits=n_splits, random_state=2021, shuffle=True)
idx = 0
for train_index, test_index in kf.split(new_df):
    idx += 1
    X_train, X_test = new_df.iloc[train_index], new_df.iloc[test_index]
    sims = cosine_similarity(X_train.T, X_train.T)
    # Make a new df cotaining similarity and headers
    sims_df = pd.DataFrame(sims)
    sims_df.columns = new_df.columns
    sims_df.index  = new_df.columns

    # precompute a set for quick query
    valid_product_set = set(sims_df.index)

    # print("Train size:", len(X_train))
    # print("Test size:", len(X_test))
    # evaluate all - but - one metrics:
    # steps:
    # 1: Assume that first valid one is unknown
    # 2: Calculate prediction value for it
    # 3: Verdict: +1 to score if pred > THRESHOLD
    score = 0

    for _, sample in X_test.iterrows():
        ids = person_to_products.loc[sample.name]["productId"]
        test_product_id = -1
        for id in ids:
            if id in valid_product_set:
                test_product_id = id
                break
        # we never get the case where all are invalid !!!
        # assume that we didn't know that product is chosen
        dot_prod = np.dot(sample, sims_df.loc[test_product_id]) - 1
        sum_sims = sims_df.loc[test_product_id].sum() - 1 
        if sum_sims < 1e-6: 
            continue 
        pred = dot_prod / sum_sims
        score = score + (1 if pred > 0 else 0)
        # if score > 0:
        #     print(sample)
        #     break
    acc = score / len(X_test) * 100
    print("%4d-Accuracy: %.2f%%" % (idx, acc))
    mean_acc += acc

print("Mean Accuracy: %.2f%%" % (mean_acc/n_splits))

   1-Accuracy: 93.48%
   2-Accuracy: 92.39%
   3-Accuracy: 94.02%
   4-Accuracy: 92.93%
   5-Accuracy: 93.48%
   6-Accuracy: 94.57%
   7-Accuracy: 92.39%
   8-Accuracy: 93.48%
   9-Accuracy: 91.30%
  10-Accuracy: 93.48%
Mean Accuracy: 93.15%


In [15]:

def give_recommendations(user_sample, return_num):
    result = []
    sample = [0] * len(wish_df.columns)
    user_product_set = set(user_sample)
    for product_id in user_sample:
        sample[product_id] = 1    
    for test_product_id in wish_df.columns:
        # skip known product
        if test_product_id in user_product_set:
            continue
        # sum(user_sample[i] * sim(test_product_id, i))
        dot_prod = np.dot(sample, sims_df.loc[test_product_id]) - 1
        # sum(sim(test_product_id, i))
        sum_sims = sims_df.loc[test_product_id].sum() - 1
        if sum_sims < 1e-6: 
            continue
        pred = dot_prod / sum_sims
        if pred > 0:
            result.append((pred, test_product_id))
    # return sorted(result)[::-1][:return_num]
    return result

import time
print("Start looking for recommendations ...")

marker = time.time()
recommendations = give_recommendations([0], 10)
print("Looking done in", time.time()-marker, " seconds")

print("Recommended products (id, prediction value) = ", recommendations)

Start looking for recommendations ...
Looking done in 68.26798629760742  seconds
Recommended products (id, prediction value) =  [(9.593342523405447e-19, 1)]
