# Import data from json

In [150]:
import json
with open("dataset/tradesy.json", "r") as f:
    user_transactions = json.loads(f.read())

# Stat:
- See how many products
- Which fields, and which should be concerned

In [152]:
user_transactions[:3]

[{'uid': '1',
  'lists': {'sold': ['3', '2'], 'selling': [], 'want': [], 'bought': []}},
 {'uid': '2',
  'lists': {'sold': ['104', '103', '102'],
   'selling': [],
   'want': [],
   'bought': ['466', '459', '457', '449']}},
 {'uid': '3',
  'lists': {'sold': ['845', '833', '829'],
   'selling': [],
   'want': [],
   'bought': ['874',
    '861',
    '860',
    '857',
    '852',
    '850',
    '847',
    '842',
    '160',
    '143',
    '142',
    '141',
    '140',
    '139',
    '93',
    '88',
    '87',
    '84',
    '81',
    '70',
    '66',
    '63',
    '61',
    '60']}}]

In [153]:
print("Number of users:", len(user_transactions))

Number of users: 128152


In [154]:
print("Fields of each user records:", ", ".join(user_transactions[0].keys()))
print("Fields of each tx lists:", ", ".join(user_transactions[0]["lists"].keys()))

Fields of each user records: uid, lists
Fields of each tx lists: sold, selling, want, bought


In [155]:
import numpy as np

def get_unique_items(user_transactions):
    item_ids = []
    for tx in user_transactions:
        if not len(tx["lists"]["want"]):
            continue
        item_ids.extend(tx["lists"]["want"])
        item_ids.extend(tx["lists"]["selling"])
    unique_ids = np.unique(item_ids, return_counts=True)
    return unique_ids

unique_ids, counts = get_unique_items(user_transactions)
print("Unique items:", len(unique_ids))
print("=> Size to store: ~", len(unique_ids) * len(user_transactions) / 1024 / 1024, "MB, > 8 GB RAM")

Unique items: 227856
=> Size to store: ~ 27847.482788085938 MB, > 8 GB RAM


In [156]:
print(unique_ids)

['1000001' '1000003' '1000007' ... '999991' '999992' '999998']


# Recommender system focus on "sold"


In [238]:
# Transform data: transaction jsons => table
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder 

userToSold = {}
focus_transactions = user_transactions
sold_list = []
for tx in focus_transactions:
    if len(tx["lists"]["sold"]) == 0:
        continue 
    userToSold[tx["uid"]] = list(map(int, tx["lists"]["sold"]))
    userToSold[tx["uid"]] = list(map(int, tx["lists"]["bought"]))
    sold_list.append(userToSold[tx["uid"]])

# encoder = TransactionEncoder()
# sold_list_matrix = encoder.fit_transform(sold_list).astype("int")

# df = pd.DataFrame(sold_list_matrix, columns=encoder.columns_)

In [243]:
print(len(sold_list))


17489


In [43]:
from mlxtend.frequent_patterns import apriori, association_rules
fq_itemset = apriori(df, min_support=0.1, use_colnames=True)

print(fq_itemset)
print("Number of rules:", len(fq_itemset))
# rules = association_rules(fq_itemset, metric='lift')
# print(rules)

Empty DataFrame
Columns: [support, itemsets]
Index: []
Number of rules: 0


In [131]:
from sklearn.metrics.pairwise import cosine_similarity
new_item = df[56]
# print(new_item)
sim = cosine_similarity([new_item], df.T)
print(sim)
print(len(sim[0]))
sim = pd.Series(sim[0], index=df.columns)
sim_sorted = sim.sort_values(ascending=False)[:5]
print(sim_sorted.index)


[[0. 1. 0. ... 0. 0. 0.]]
7735
Int64Index([682, 767, 814, 810, 807], dtype='int64')


In [249]:
def predict (item, utility_matrix):
    sim_items = cosine_similarity([item], utility_matrix.T)
    sim_items = pd.Series(sim_items[0], index=utility_matrix.columns)
    recommend = sim_items.sort_values(ascending=False)
    result = []
    for item, cosine in recommend.iteritems():
        if cosine > 0:
            result.append(item)
    return result
    


In [251]:
print(len(sold_list))
new = []
for s in sold_list:
    if s:
        new.append(s)
print(len(new))

pivot = int(0.7*len(new))
# print(pivot)
train_set = new[:pivot]
test_set  = new[pivot:]
print(len(train_set))
print(len(test_set))

def givenN_evaluate(train, test, given_num):
    encoder = TransactionEncoder()
    utility_matrix = encoder.fit_transform(train)
    utility_matrix = pd.DataFrame(utility_matrix, columns=encoder.columns_).astype(int)
    # print(utility_matrix.shape)
    # print(utility_matrix.columns)
    # print(utility_matrix)
    score = 0
    for i, t in enumerate(test):
        if len(t) <= given_num: continue 
        items = t[:given_num]
        # print(items)
        suggests = set()
        for i in items:
            if i not in utility_matrix.columns: continue
            suggests.update(predict(utility_matrix[i], utility_matrix))
        
        if suggests:  
            print('\nuser {}: {}'.format(i, t))  
            print('--------->recommend from{}: {}'.format(items, suggests))
    
        for s in suggests:
            if s in t[given_num:]: score += 1

    return score

score = givenN_evaluate(train_set, test_set, 1) #given 1
print(score)

    

17489
3355
2348
1007

userid 683106: [683106, 1453593, 1453585, 1108462, 829993, 707030]
--------->recommend from[683106]: {745952, 683106, 141317, 1343720, 778089, 40458, 774317, 390640, 779506, 899258, 731357, 164511}

userid 698251: [698251, 407686]
--------->recommend from[698251]: {785289, 261514, 698251, 808719, 334752, 1249185, 398882, 769828, 265388, 345905, 181044, 809019, 747330, 254153, 114250, 412495, 1456088, 118108, 890214, 125031, 735846, 751863, 405246}

userid 1354852: [1354852, 1465949, 1314702, 1235765, 1228222, 1227904, 1215486, 1213877, 1180868, 1179223, 1179161, 1080923, 1049035, 695855]
--------->recommend from[1354852]: {1013760, 1195396, 1462661, 1442950, 1533067, 1462033, 1507226, 1206300, 1453728, 1274017, 1462051, 1527209, 1398314, 1473963, 1241902, 1378992, 1371824, 1444538, 855101, 1517374, 1476160, 1163358, 1378916, 1354852, 1465958, 1268585, 513771, 1167214, 1465972, 1195384, 1287931, 1556860}

userid 395086: [395086, 1426677, 1210001, 1179898, 1145309, 