# Data walking through

In [1]:
import pandas as pd 
import numpy as np

In [2]:
transac = pd.read_csv("dataset/ratebeer/transac.csv", header=None)
columns = ["GiverID", "ReceiverID", "itemID", "timestamp"]
transac.columns = columns
print(transac)

Unnamed: 0,GiverID,ReceiverID,itemID,timestamp
0,92478,68245,350701,1314860400
1,11299,163679,151172,1362124800
2,11259,9554,7688,1178002800
3,11259,9554,355163,1178002800
4,128373,10288,135888,1354348800
...,...,...,...,...
125660,15110,28921,32792,1296547200
125661,15110,28921,244384,1309503600
125662,15110,28921,329904,1314860400
125663,28921,15110,148479,1362124800


In [3]:
number_of_row = len(transac)
# get number of products
number_of_product = len(np.unique(transac["itemID"]))

transac_grouped = transac.groupby(["GiverID", "ReceiverID", "timestamp"]).aggregate(lambda x: str(list(np.unique(x))))
print(transac_grouped)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemID
GiverID,ReceiverID,timestamp,Unnamed: 3_level_1
39,5387,1064991600,[624]
39,6250,1072944000,[312514]
39,6250,1088665200,"[11221, 18543, 49666, 342736]"
39,7034,1196496000,"[12944, 13618, 191628, 385120]"
39,7084,1125558000,[37652]
...,...,...,...
377816,371705,1438412400,[151172]
389912,336572,1446361200,[369168]
391561,132930,1454313600,"[244595, 398498]"
395626,364565,1451635200,[42012]


In [6]:
transac_grouped = transac_grouped.drop_duplicates()
number_of_transac = len(transac_grouped)
print(transac_grouped)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemID
GiverID,ReceiverID,timestamp,Unnamed: 3_level_1
39,5387,1064991600,[624]
39,6250,1072944000,[312514]
39,6250,1088665200,"[11221, 18543, 49666, 342736]"
39,7034,1196496000,"[12944, 13618, 191628, 385120]"
39,7084,1125558000,[37652]
...,...,...,...
367429,24249,1433142000,"[165693, 192606]"
367429,45094,1433142000,"[117631, 344237]"
367573,98012,1443682800,"[199531, 263601]"
377816,110596,1438412400,"[151172, 225178]"


In [7]:
_index = [i for i in range(len(transac_grouped))]
transac_grouped.index = _index
print(transac_grouped)

Unnamed: 0,itemID
0,[624]
1,[312514]
2,"[11221, 18543, 49666, 342736]"
3,"[12944, 13618, 191628, 385120]"
4,[37652]
...,...
31619,"[165693, 192606]"
31620,"[117631, 344237]"
31621,"[199531, 263601]"
31622,"[151172, 225178]"


In [8]:
for i in range(len(transac_grouped)):
    transac_grouped.iloc[i][0] = list(eval(transac_grouped.iloc[i][0]))
print(transac_grouped)

Unnamed: 0,itemID
0,[624]
1,[312514]
2,"[11221, 18543, 49666, 342736]"
3,"[12944, 13618, 191628, 385120]"
4,[37652]
...,...
31619,"[165693, 192606]"
31620,"[117631, 344237]"
31621,"[199531, 263601]"
31622,"[151172, 225178]"


In [18]:
item_values = np.array([])
for i in range(len(transac_grouped)):
    item_values = np.hstack((item_values, transac_grouped.iloc[i][0]))
item_values = np.unique(item_values).astype(int)

In [20]:
map_col, rmap_col = {}, {}
idx, _columns = 0, []
for val in item_values:
    map_col[val] = idx
    rmap_col[idx] = val
    idx += 1

In [21]:
for i in range(len(transac_grouped)):
    transac_grouped.iloc[i][0] = [map_col[val] for val in transac_grouped.iloc[i][0]]
print(transac_grouped)

Unnamed: 0,itemID
0,[199]
1,[16763]
2,"[1981, 2692, 4665, 17400]"
3,"[2178, 2266, 13155, 18280]"
4,[3923]
...,...
31619,"[12105, 13195]"
31620,"[9475, 17423]"
31621,"[13486, 15519]"
31622,"[11399, 14374]"


In [22]:
def transactionEncoder(df):
    # 'transactions' is now temporary variable
    transactions = [row["itemID"] for index, row in df.iterrows()]
    from mlxtend.preprocessing import TransactionEncoder

    transaction_encoder = TransactionEncoder()
    transac_matrix = transaction_encoder.fit_transform(transactions).astype("int")
    transac_df = pd.DataFrame(transac_matrix, columns=transaction_encoder.columns_)

    return transac_df, transac_matrix

In [23]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

transac_df, transac_matrix = transactionEncoder(transac_grouped)

print(transac_df.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18747,18748,18749,18750,18751,18752,18753,18754,18755,18756
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
print("Number of row:", number_of_row)
print("Number of product:", number_of_product)
print("Number of transac:", number_of_transac)

Number of row: 125665
Number of product: 18757
Number of transac: 31624


In [25]:
print(np.unique(transac_df.sum(), return_counts=True))

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 106, 107,
        108, 109, 112, 113, 114, 115, 116, 118, 119, 122, 123, 126, 130,
        132, 133, 134, 135, 136, 140, 142, 143, 145, 147, 152, 154, 155,
        156, 161, 162, 165, 169, 171, 174, 175, 176, 179, 180, 181, 183,
        184, 190, 193, 196, 201, 203, 208, 211, 212, 215, 220, 226, 227,
        229, 232, 235, 240, 251, 252, 282, 302, 303, 304, 305, 328, 349,
        371, 430, 449, 469, 505, 555]),
 array([844

In [86]:
from scipy.stats.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

def to_row_df(items, _columns):
    row_df = pd.DataFrame(data=[np.zeros(len(_columns)).astype(int)], columns=_columns)
    for i in items:
        row_df[i] = 1
    return row_df

def predict (utility_df, utility_matrix, test_items, return_num=5):
    sim_items = []
    for i in range(utility_matrix.shape[0]):
        corr = pearsonr(utility_df.values[0], utility_matrix[i])[0]
        sim_items.append(corr)

    
    for j in test_items:
        tu = mau = 0
        for i in range(utility_matrix.shape[0]):
            tu += sim_items[i] * (utility_matrix[i][j] - np.mean(utility_matrix[i]))
            mau += sim_items[i]
        pred = np.mean(utility_df.values[0]) + tu/mau
        if pred > 0.5:
            return 1
    return 0

In [28]:
pivot = int(0.7*len(transac_grouped))
# print(pivot)
train_set = transac_grouped[:pivot]
test_set  = transac_grouped[pivot:]
print(len(train_set))
print(len(test_set))


22136
9488


In [88]:
def in_train_lst(lst, _columns):
    for i in lst:
        if i not in _columns or i > len(_columns):
            return False
    return True

def givenN_evaluate(train, test, given_num):
    import time
    start_time = time.time()
    train_df, train_matrix = transactionEncoder(train)
    score = cnt = 0
    for i in range(len(test)):
        lst = test.iloc[i][0]
        if len(lst) <= given_num or not in_train_lst(lst, train_df.columns):
            continue
        given_items = lst[:-given_num]
        test_items = lst[-given_num:]
        row_df = to_row_df(test_items, train_df.columns)
        _start_time = time.time()
        score += predict(row_df, train_matrix, test_items, 10)
        cnt += 1
        print("{}/{} - score = {} - time = {}".format(cnt, i+1, score, time.time() - _start_time))
        if cnt == 10:
            break
    print(time.time() - start_time)
    print('cnt =', cnt)
    return score

score = givenN_evaluate(train_set, test_set, 1) #given 1
print('score =', score)

1/1 - score = 0 - time = 7.0470662117004395
2/4 - score = 0 - time = 6.0130250453948975
3/7 - score = 0 - time = 5.47003698348999
4/8 - score = 0 - time = 5.877780914306641
5/12 - score = 0 - time = 5.945812940597534
6/14 - score = 0 - time = 5.79262900352478
7/16 - score = 0 - time = 5.5284178256988525
8/18 - score = 0 - time = 5.683437824249268
9/19 - score = 0 - time = 6.164126873016357
10/21 - score = 0 - time = 5.478986978530884
77.11662888526917
cnt = 10
score = 0
