# Data walkthrough

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# The file is read with header=None, so the column labels are assigned by hand.
columns = ["GiverID", "ReceiverID", "itemID", "timestamp"]
transac = pd.read_csv("dataset/ratebeer/transac.csv", header=None)
transac.columns = columns
transac

Unnamed: 0,GiverID,ReceiverID,itemID,timestamp
0,92478,68245,350701,1314860400
1,11299,163679,151172,1362124800
2,11259,9554,7688,1178002800
3,11259,9554,355163,1178002800
4,128373,10288,135888,1354348800
...,...,...,...,...
125660,15110,28921,32792,1296547200
125661,15110,28921,244384,1309503600
125662,15110,28921,329904,1314860400
125663,28921,15110,148479,1362124800


In [66]:
# Size of the raw log and number of distinct item ids in it.
number_of_row = transac.shape[0]
number_of_product = np.unique(transac["itemID"]).size

# Group rows sharing (giver, receiver, timestamp) and collapse each group's
# item ids into a sorted, de-duplicated list.
transac_grouped = (
    transac
    .groupby(["GiverID", "ReceiverID", "timestamp"])
    .agg(lambda item_ids: list(np.unique(item_ids)))
)
number_of_transac = transac_grouped.shape[0]

transac_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemID
GiverID,ReceiverID,timestamp,Unnamed: 3_level_1
39,5387,1064991600,[624]
39,6250,1072944000,[312514]
39,6250,1088665200,"[11221, 18543, 49666, 342736]"
39,7034,1196496000,"[12944, 13618, 191628, 385120]"
39,7084,1125558000,[37652]


In [68]:
# Report the three summary counts computed above, one per line.
for label, value in (
    ("Number of row:", number_of_row),
    ("Number of product:", number_of_product),
    ("Number of transac:", number_of_transac),
):
    print(label, value)

Number of row: 125665
Number of product: 18757
Number of transac: 39082


In [69]:
def transactionEncoder(df):
    """One-hot encode the item lists stored in ``df["itemID"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``itemID`` column; each cell holds the items of one
        transaction.

    Returns
    -------
    transac_df : pandas.DataFrame
        The encoded matrix wrapped in a frame whose columns are the encoder's
        ``columns_`` and whose row index is the default positional one.
    transac_matrix : numpy.ndarray
        The encoder output cast to ``int``.
    """
    # Imported here so the function carries its own dependency.
    from mlxtend.preprocessing import TransactionEncoder

    # Read the column in one step; ``iterrows`` would build a throw-away
    # Series for every row just to fetch a single cell.
    transactions = df["itemID"].tolist()

    transaction_encoder = TransactionEncoder()
    transac_matrix = transaction_encoder.fit_transform(transactions).astype("int")
    transac_df = pd.DataFrame(transac_matrix, columns=transaction_encoder.columns_)

    return transac_df, transac_matrix

# Transaction-based wish prediction using Apriori

In [None]:
# Item popularity distribution: count, for every item, the transactions that
# contain it, then tabulate how many items share each count.
# Derived directly from `transac_grouped` because the one-hot frame
# `transac_df` is only created in a later cell, so referencing it here would
# raise NameError on a fresh kernel (Restart & Run All).
item_transaction_counts = pd.Series(
    # set(...) counts an item once per transaction, like a one-hot column sum.
    [item for items in transac_grouped["itemID"] for item in set(items)]
).value_counts()
np.unique(item_transaction_counts.values, return_counts=True)

In [74]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

transac_df, transac_matrix = transactionEncoder(transac_grouped)

# use_colnames=True makes the itemsets refer to the frame's column labels
# (the item ids) rather than to positional column indices.
frequent_itemsets = apriori(transac_df, min_support=0.2, use_colnames=True)
print(frequent_itemsets)
# NOTE(review): the recorded run found no itemset at min_support=0.2, and the
# rule-mining step below was left disabled after it failed. Presumably it
# failed because it was handed that empty frame -- TODO confirm, then retry
# with a lower min_support.
# rules = association_rules(frequent_itemsets, metric="confidence")
# rules

Empty DataFrame
Columns: [support, itemsets]
Index: []


# User-based prediction using cosine similarity

In [75]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity of the transaction at position 1 against every encoded transaction.
t = cosine_similarity(transac_df.iloc[1:2], transac_matrix)[0]

# Running maximum of the similarities strictly below 1; each improvement is echoed.
mx = 0
for sim in t:
    if mx < sim < 1:
        mx = sim
        print(sim)
mx

0.4082482904638631
0.5773502691896258
0.7071067811865475


0.7071067811865475

In [None]:
# Pair each similarity with its position in `t`, order the pairs from highest
# to lowest, and show the top 20.
arr = sorted(zip(t, range(len(t))))[::-1]
arr[:20]

In [31]:
def get_items(transac_idx):
    """Return the column labels of ``transac_df`` that are set in one row.

    Parameters
    ----------
    transac_idx
        Row label handed to ``transac_df.loc``.

    Returns
    -------
    list
        Labels of the columns whose value in that row is non-zero, in column
        order.

    Notes
    -----
    Reads the notebook-level ``transac_df``, which must exist before the call.
    """
    # Fetch the row once. Re-running ``.loc`` for every column rebuilt the
    # whole row each time, making the scan quadratic in the number of columns.
    row = transac_df.loc[transac_idx]
    labels = row.index
    return [labels[pos] for pos in np.flatnonzero(row.values)]

In [108]:
def to_row_df(items, _columns):
    """Build a single-row 0/1 frame flagging which of ``_columns`` are in ``items``.

    Parameters
    ----------
    items : iterable
        Item labels to flag.
    _columns : sequence
        Column labels of the resulting frame.

    Returns
    -------
    pandas.DataFrame
        One row with exactly the columns ``_columns``: 1 where the label occurs
        in ``items``, 0 elsewhere.
    """
    # Labels missing from ``_columns`` are ignored. Assigning them would append
    # extra columns, and the row would no longer have the same width as a
    # matrix built over ``_columns``.
    flags = pd.Index(_columns).isin(list(items)).astype(int)
    return pd.DataFrame(data=[flags], columns=_columns)

def predict(utility_df, utility_matrix):
    """Return the closest rows of ``utility_matrix`` to a query, with their items.

    The first row of ``cosine_similarity(utility_df, utility_matrix)`` is
    scanned. Every position whose similarity lies strictly between 0.5 and 1 is
    kept together with ``get_items(position)``.

    Returns
    -------
    list of (similarity, items) tuples
        At most five entries, ordered from highest to lowest.
    """
    similarities = cosine_similarity(utility_df, utility_matrix[:])[0]
    candidates = [
        (sim, get_items(position))
        for position, sim in enumerate(similarities)
        if 0.5 < sim < 1.
    ]
    # Ascending sort followed by a reversal puts the best matches first.
    candidates.sort()
    candidates.reverse()
    return candidates[:5]

In [55]:
# Positional 70/30 split: rows before `pivot` form the training set, the rest
# the test set. The sizes of both parts are printed.
pivot = int(0.7 * len(transac_grouped))
train_set = transac_grouped.iloc[:pivot]
test_set = transac_grouped.iloc[pivot:]
for subset in (train_set, test_set):
    print(len(subset))


27357
11725


In [112]:
def givenN_evaluate(train, test, given_num):
    """Count held-out items recovered by similarity-based suggestions.

    For every test transaction with more than ``given_num`` items, the item
    list is split in two: the last ``given_num`` items are held out and the
    leading items form the query. The query is encoded over the training
    vocabulary and passed to ``predict``. Every held-out item that appears
    among the suggested items (query items excluded) adds one to the score.

    Parameters
    ----------
    train : pandas.DataFrame
        Training transactions, encoded with ``transactionEncoder``.
    test : pandas.DataFrame
        Test transactions; the first column holds each transaction's item list.
    given_num : int
        Number of trailing items held out per transaction.
        NOTE(review): the function name suggests the Given-N protocol (N items
        *given*); the split arithmetic of the original code is kept as it was
        -- confirm which protocol is intended.

    Returns
    -------
    int
        Total number of held-out items that were suggested.
    """
    train_df, train_matrix = transactionEncoder(train)
    known_items = set(train_df.columns)

    score = 0
    for i in range(len(test)):
        # Positional cell access; `test.iloc[i][0]` relied on the deprecated
        # integer-as-position lookup on a label-indexed Series.
        lst = test.iloc[i, 0]
        if len(lst) <= given_num:
            continue

        # Use a local for the split point: overwriting `given_num` here would
        # change the threshold and the split for all following transactions.
        split_at = len(lst) - given_num
        given_items = lst[:split_at]
        test_items = set(lst[split_at:])

        # Only items known to the training vocabulary can be encoded; with none
        # left there is nothing to query with.
        query_items = [item for item in given_items if item in known_items]
        if not query_items:
            continue

        # Query with the given items (not the held-out ones, which would leak
        # the answer into the prediction).
        # NOTE(review): `predict` is called with positions of `train_matrix`;
        # check that the frame it looks items up in is aligned with `train`.
        suggests = predict(to_row_df(query_items, train_df.columns), train_matrix)

        # Each suggestion is a (similarity, items) tuple: flatten to item ids
        # and drop what was already given before checking for hits.
        suggested_items = {item for _, items in suggests for item in items}
        suggested_items.difference_update(given_items)
        score += len(suggested_items & test_items)

    return score

score = givenN_evaluate(train=train_set, test=test_set, given_num=1)  # N = 1

[168683, 398498]
[168683] [398498]


In [59]:
# Display the test split produced by the positional split above.
test_set

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemID
GiverID,ReceiverID,timestamp,Unnamed: 3_level_1
71061,75285,1367391600,[352087]
71061,109139,1328083200,[127500]
71061,159355,1328083200,[400617]
71061,170314,1351753200,"[168683, 398498]"
71106,100287,1270105200,"[40189, 275816]"
...,...,...,...
377816,371705,1438412400,[151172]
389912,336572,1446361200,"[369168, 369168]"
391561,132930,1454313600,"[244595, 398498]"
395626,364565,1451635200,[42012]
