# Data walking through

In [1]:
import pandas as pd 
import numpy as np

# read csv and assign columns
columns = ["personId", "productId"]
df = pd.read_csv("dataset/ratebeer/wish.csv", header=None)
df.columns = columns

number_of_row = len(df)
# get number of products
number_of_product = len(np.unique(df["productId"]))
# get number of person
number_of_person = len(np.unique(df["personId"]))
df = df.groupby("personId").aggregate(lambda x: list(x))

In [2]:
# df = df.drop_duplicates()
# print(df)

In [3]:
print("Number of row:", number_of_row)
print("Number of person:", number_of_person)
print("Number of product:", number_of_product)
# first samples
df.head()

Number of row: 118453
Number of person: 1953
Number of product: 21654


Unnamed: 0_level_0,productId
personId,Unnamed: 1_level_1
39,"[47436, 1147, 64658, 9448, 12227, 10609, 6890,..."
63,"[36407, 53647, 36624]"
69,"[15017, 2491, 6924, 9193, 19633, 13730, 64975,..."
154,"[49774, 28173, 15017, 24350, 565, 22339, 17385..."
181,"[10524, 30591, 7183, 38426, 22537, 48679, 2912..."


In [4]:
# for i in range(len(df)):
#     df.iloc[i][0] = list(eval(df.iloc[i][0]))
# print(df)

In [5]:
# 'transactions' is now temporary variable
transactions = [row["productId"] for index, row in df.iterrows()]

In [6]:
from mlxtend.preprocessing import TransactionEncoder

transaction_encoder = TransactionEncoder()
wish_matrix = transaction_encoder.fit_transform(transactions).astype("int")
wish_df = pd.DataFrame(wish_matrix, columns=transaction_encoder.columns_)

# index is actually personId
wish_df.index = df.index 

In [7]:
wish_df.head()

Unnamed: 0_level_0,1,2,3,9,10,13,14,15,16,17,...,390987,391005,391163,391340,391802,392092,392165,392402,392478,393042
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
np.unique(wish_df.sum(), return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  91,  92,
         93,  94,  95,  96,  98,  99, 100, 101, 102, 103, 105, 106, 107,
        109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
        122, 123, 124, 125, 126, 128, 131, 132, 134, 135, 136, 137, 140,
        141, 142, 143, 144, 145, 146, 148, 149, 151, 153, 154, 158, 160,
        161, 164, 165, 167, 169, 170, 173, 174, 175, 178, 180, 183, 184,
        185, 188, 189, 194, 197, 199, 201, 202, 203, 205, 207, 210, 221,
        228, 231, 233, 240, 242, 246, 248, 254, 300

# Erase useless samples, which are so sparse 

In [9]:
print("Number of person having wish > 1:", len(wish_df.sum(axis=1) > 1))
new_df = wish_df.loc[wish_df.sum(axis=1) > 1]
# new_df = new_df.loc[:, wish_df.sum() > 1]

Number of person having wish > 1: 1953


In [10]:
print("Training dataset shape:", new_df.shape)
new_df.head()

Training dataset shape: (1840, 21654)


Unnamed: 0_level_0,1,2,3,9,10,13,14,15,16,17,...,390987,391005,391163,391340,391802,392092,392165,392402,392478,393042
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
person_to_products = df.loc[new_df.index]

# Transaction-based wish prediction using apriori

In [11]:
# from mlxtend.frequent_patterns import apriori
# from mlxtend.frequent_patterns import association_rules

# frequent_itemsets = apriori(new_df, min_support=0.05)
# rules = association_rules(frequent_itemsets, metric="lift")
# # Something FAILED
# rules

In [12]:
# len(rules)

In [13]:
# print("Number of antecedents inferred:", len(np.unique(rules["antecedents"])))
# print("Ratio of antecedents / well-known products:", len(np.unique(rules["antecedents"])) / number_of_product)

# User-based prediction using cosine similarity

In [15]:
sims = cosine_similarity(X_train.T, X_train.T)
sims.shape

(21654, 21654)

### Make a new df cotaining similarity and headers

In [16]:
sims_df = pd.DataFrame(sims)
sims_df.columns = new_df.columns
sims_df.index  = new_df.columns
sims_df

Unnamed: 0,1,2,3,9,10,13,14,15,16,17,...,390987,391005,391163,391340,391802,392092,392165,392402,392478,393042
1,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.000000,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.707107,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392092,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
392165,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
392402,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
392478,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

X_train, X_test = train_test_split(new_df, test_size=1/3.0, random_state=1000)
print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 1226
Test size: 614


# Evaluation metrics: All but one

In [18]:
score = 0
for _, sample in X_test.iterrows():
    test_product_id = person_to_products.loc[sample.name]["productId"][0] # sample.index
    dot_prod = np.dot(sample, sims_df.loc[test_product_id]) 
    sum_sims = sims_df.loc[test_product_id].sum()
    if sum_sims < 1e-6: 
        continue 
    pred = dot_prod / sum_sims
    score = score + (1 if pred > 0 else 0)

print("Accuracy: %.2f%%" % (score / len(X_test) * 100))

Accuracy: 95.28%


In [23]:

def give_recommendations(user_sample, return_num):
    result = []
    sample = [0] * len(wish_df.columns)
    user_product_set = set(user_sample)
    for product_id in user_sample:
        sample[product_id] = 1
    
    for test_product_id in wish_df.columns:
        # skip known product
        if test_product_id in user_product_set:
            continue
            
        # sum(user_sample[i] * sim(test_product_id, i))
        dot_prod = np.dot(sample, sims_df.loc[test_product_id]) 
        # sum(sim(test_product_id, i))
        sum_sims = sims_df.loc[test_product_id].sum() - 1
        if sum_sims < 1e-6: 
            continue

        pred = dot_prod / sum_sims
        if pred > 0:
            result.append((pred, test_product_id))
    return sorted(result)[::-1][:return_num]
give_recommendations([15], 10)

[(0.009860054484450329, 65951),
 (0.010815130229429431, 11312),
 (0.0645959470549158, 50),
 (0.0645959470549158, 5948),
 (0.0645959470549158, 23177),
 (0.0645959470549158, 24448),
 (0.0645959470549158, 32737),
 (0.0645959470549158, 39201),
 (0.0645959470549158, 44668),
 (0.0645959470549158, 49728)]

## 5-Fold Evaluation

In [39]:
from sklearn.model_selection import KFold

n_splits, mean_acc = 10, 0
kf = KFold(n_splits=n_splits, random_state=2021, shuffle=True)
idx = 0
for train_index, test_index in kf.split(new_df):
    idx += 1
    X_train, X_test = new_df.iloc[train_index], new_df.iloc[test_index]
    # print("Train size:", len(X_train))
    # print("Test size:", len(X_test))
    score = 0
    for _, sample in X_test.iterrows():
        test_product_id = person_to_products.loc[sample.name]["productId"][0] # sample.index
        dot_prod = np.dot(sample, sims_df.loc[test_product_id]) 
        sum_sims = sims_df.loc[test_product_id].sum()
        if sum_sims < 1e-6: 
            continue 
        pred = dot_prod / sum_sims
        score = score + (1 if pred > 0 else 0)
    acc = score / len(X_test) * 100
    print("%2d-Accuracy: %.2f%%" % (idx, acc))
    mean_acc += acc
    # s = ""
    # for i in range(20):
    #     s += "#"
    # print(s)
print("Mean Accuracy: %.2f%%" % (mean_acc/n_splits))

 1-Accuracy: 98.91%
 2-Accuracy: 98.37%
 3-Accuracy: 98.37%
 4-Accuracy: 97.83%
 5-Accuracy: 98.91%
 6-Accuracy: 98.37%
 7-Accuracy: 98.37%
 8-Accuracy: 97.83%
 9-Accuracy: 98.37%
10-Accuracy: 98.91%
Mean Accuracy: 98.42%
