# Data walkthrough

In [1]:
import pandas as pd 
import numpy as np

# Load the raw wish table: the CSV has no header row, so the two column
# names are supplied directly to the reader.
columns = ["personId", "productId"]
df = pd.read_csv("dataset/ratebeer/wish.csv", header=None, names=columns)

# Size statistics are taken on the flat table, before grouping collapses rows.
number_of_row = len(df)
# get number of products
number_of_product = df["productId"].nunique()
# get number of person
number_of_person = df["personId"].nunique()

# Collapse to one row per person holding the list of that person's product ids.
# `df` is deliberately rebound: the following cells read the grouped frame under this name.
df = df.groupby("personId").agg(list)

In [2]:
# Report the dataset size statistics, one per line.
for label, value in (
    ("Number of row:", number_of_row),
    ("Number of person:", number_of_person),
    ("Number of product:", number_of_product),
):
    print(label, value)
# Preview the grouped frame (first five persons with their product lists).
df.head()

Number of row: 118453
Number of person: 1953
Number of product: 21654


Unnamed: 0_level_0,productId
personId,Unnamed: 1_level_1
39,"[47436, 1147, 64658, 9448, 12227, 10609, 6890,..."
63,"[36407, 53647, 36624]"
69,"[15017, 2491, 6924, 9193, 19633, 13730, 64975,..."
154,"[49774, 28173, 15017, 24350, 565, 22339, 17385..."
181,"[10524, 30591, 7183, 38426, 22537, 48679, 2912..."


In [14]:
# `transactions` is only an intermediate variable: one list of product ids per
# person, in the row order of the grouped frame. Reading the column directly
# avoids building a Series for every row, which `iterrows()` does.
transactions = df["productId"].tolist()

In [15]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: one row per transaction, one column per
# distinct product id, with 1 where the product is in that transaction.
transaction_encoder = TransactionEncoder()
wish_matrix = (
    transaction_encoder
    .fit(transactions)
    .transform(transactions)
    .astype("int")
)
# Label the columns with the product ids the encoder learned.
wish_df = pd.DataFrame(data=wish_matrix, columns=transaction_encoder.columns_)

In [16]:
# Preview the first five rows of the one-hot wish matrix.
wish_df.head(n=5)

Unnamed: 0,1,2,3,9,10,13,14,15,16,17,...,390987,391005,391163,391340,391802,392092,392165,392402,392478,393042
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Column sums give how many rows (persons) contain each product; np.unique then
# returns the distinct per-product totals and how many products share each total.
np.unique(wish_df.sum(axis=0), return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  91,  92,
         93,  94,  95,  96,  98,  99, 100, 101, 102, 103, 105, 106, 107,
        109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
        122, 123, 124, 125, 126, 128, 131, 132, 134, 135, 136, 137, 140,
        141, 142, 143, 144, 145, 146, 148, 149, 151, 153, 154, 158, 160,
        161, 164, 165, 167, 169, 170, 173, 174, 175, 178, 180, 183, 184,
        185, 188, 189, 194, 197, 199, 201, 202, 203, 205, 207, 210, 221,
        228, 231, 233, 240, 242, 246, 248, 254, 300

# Transaction-based wish prediction using apriori

In [20]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# use_colnames=True makes the itemsets (and therefore the rule antecedents /
# consequents) refer to product ids instead of positional column indices.
frequent_itemsets = apriori(wish_df, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift")
# NOTE(review): `rules` comes back empty (originally noted as "Something FAILED").
# Presumably no itemset with two or more products reaches min_support=0.1: the
# most-wished product appears in only 300 of the 1953 wish lists (~15%), so
# pairs are unlikely to reach 10%. Consider lowering min_support - TODO confirm.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


# User-based prediction using cosine similarity

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the person in row 1 and every row of the wish
# matrix; only the first 200 scores are displayed.
cosine_similarity(wish_df.iloc[1:2], wish_matrix)[0, :200]

array([0.06154575, 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.06900656, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.10910895, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.0745356 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0695048 , 0.        , 0.        , 0.06415003,
       0.        , 0.        , 0.        , 0.02412968, 0.        ,
       0.        , 0.        , 0.07231015, 0.        , 0.08084521,
       0.06851887, 0.        , 0.        , 0.        , 0.19245009,
       0.        , 0.08084521, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

# Build cosine similarity

In [19]:
# Toy 4x4 binary matrix used to try out the similarity-based prediction:
# each row is a user, each column a product (1 = product is wished).
a = [
    [1, 0, 1, 0],  # user 0
    [0, 1, 1, 0],  # user 1
    [0, 1, 0, 1],  # user 2
    [1, 1, 0, 1],  # user 3
]
df_a = pd.DataFrame(data=a)
df_a

Unnamed: 0,0,1,2,3
0,1,0,1,0
1,0,1,1,0
2,0,1,0,1
3,1,1,0,1


In [58]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Build the item-item similarity from the first three users only; the
# remaining row of df_a is used as the query user in the prediction call.
data = df_a.iloc[:3]
# Transposing puts products on the rows, so similarities are between products.
# With a single argument the rows are compared against themselves.
sims = cosine_similarity(data.T)
print("Similarity:", sims, sep="\n")
def predict_rest(prod_count, sims, user):
    """Predict a user's values for the products that are not yet known.

    The first ``len(user)`` products are treated as known. For every remaining
    product index ``i`` in ``range(len(user), prod_count)`` the prediction is
    the similarity-weighted average of the known values, with the weights taken
    from ``sims[i, :len(user)]``.

    Args:
        prod_count: Total number of products; predictions are produced for
            indices ``len(user)`` up to (not including) ``prod_count``.
        sims: 2-D item-item similarity matrix (array-like, indexed
            ``[product, product]``).
        user: Known values of the user for the first ``len(user)`` products
            (any 1-D array-like, e.g. a list or a pandas Series).

    Returns:
        A list of floats, one per predicted product. A product whose
        similarities to all known products sum to zero gets ``0.0`` because
        there is no evidence to base a prediction on (the unguarded division
        would yield NaN).
    """
    known_values = np.asarray(user, dtype=float)
    known_count = len(known_values)
    sim_matrix = np.asarray(sims, dtype=float)

    result = []
    for i in range(known_count, prod_count):
        # Similarities between target product i and each known product.
        weights = sim_matrix[i, :known_count]
        total = weights.sum()
        if total == 0:
            # No similarity to any known product: avoid 0/0 -> NaN.
            result.append(0.0)
        else:
            result.append(float(np.dot(known_values, weights) / total))
    return result
# Query user: row 3 of df_a, with only its first two products taken as known.
# Each prediction is np.dot(known values, similarities) / sum(similarities).
predict_rest(prod_count=4, sims=sims, user=df_a.iloc[3, :2])


Similarity:
[[1.         0.         0.70710678 0.        ]
 [0.         1.         0.5        0.70710678]
 [0.70710678 0.5        1.         0.        ]
 [0.         0.70710678 0.         1.        ]]


[1.0, 1.0]