# Sample Code

## 基礎建設

In [27]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [28]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-08 10:14:35--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.21’


2022-01-08 10:14:36 (17.9 MB/s) - ‘All_Beauty.csv.21’ saved [15499476/15499476]

--2022-01-08 10:14:37--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.21’


2022-01-08 10:14:37 (14.3 MB/s) - ‘meta_All_Beauty.json.gz.21’ saved [10329961/10329961]



In [29]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [30]:
metadata.head()


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [31]:
ratings.head()


Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


In [32]:
print(metadata.shape)
print(ratings.shape)

(32892, 19)
(371345, 4)


## 資料整理

In [33]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [34]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [35]:
def get_user_similarity(training_data, remove_obscure_user=True, user_rating_threshold=3):

    # loading data from dataframe
    # user_to_items dict:
    # {
    #   'user': {
    #       'item': ratings...
    #   }...
    # }
    user_to_items = defaultdict(dict)
    for _, row in training_data.iterrows():
        row = dict(row)
        user = row['reviewerID']
        item = row['asin']
        rating = float(row['overall'])

        user_to_items[user][item] = rating

    # print("total users before filtering: ", len(user_to_items))

    # remove obscure user to decrease data size
    # filtering params
    # remove_obscure_user = True
    # user_rating_threshold = 3
    all_users = list(user_to_items.keys())
    for user in all_users:
        ratings = user_to_items[user]
        if remove_obscure_user and len(ratings) < user_rating_threshold:
            del user_to_items[user]

    # print("total users  after filtering: ", len(user_to_items))

    # generate item to user mapping dict
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }
    item_to_users = defaultdict(dict)
    for user, items in user_to_items.items():
        for item, rating in items.items():
            item_to_users[item][user] = rating

    # prepare data of computing user similarity 
    init_sim = lambda: [0 for _ in range(3)]
    factory = lambda: defaultdict(init_sim)
    pre_user_similarity = defaultdict(factory)
    n = len(item_to_users)
    index = 0
    for item, user_ratings in item_to_users.items():
        if len(user_ratings) > 1:
            # print(f"item: {item} have been rated by {len(user_ratings)} users progress: {index}/{n}")
            for user1, user2 in combinations(user_ratings.keys(), 2):
                xy = user_ratings[user1] * user_ratings[user2]
                xx = user_ratings[user1] ** 2
                yy = user_ratings[user2] ** 2
                pre_user_similarity[user1][user2][0] += xy
                pre_user_similarity[user1][user2][1] += xx
                pre_user_similarity[user1][user2][2] += yy

                pre_user_similarity[user2][user1][0] += xy
                pre_user_similarity[user2][user1][1] += xx
                pre_user_similarity[user2][user1][2] += yy
        index += 1

    user_similarity = {}
    for src_user in pre_user_similarity: # 通常分别用SRC（source）和DST（destination）表示源操作数和目的操作数
        user_similarity_order = []
        for dst_user, val in pre_user_similarity[src_user].items():
            xy = val[0]
            xx = val[1]
            yy = val[2]
            div = ((xx*yy) ** 0.5)
            if div == 0:  # 不關心沒交集的
                continue
            similarity = xy / div
            if similarity < 0: # # 不關心不夠相似的
                continue
            for i, s in enumerate(user_similarity_order):
                target_similarity = s[1]
                if target_similarity < similarity:
                    user_similarity_order.insert(i, (dst_user, similarity))
                    break
            else:
                user_similarity_order.append((dst_user, similarity))  # 若都沒break就會被執行 若被break一併跳出 如果沒break則表示最小就加在最後面

        user_similarity[src_user] = user_similarity_order # {src_user:[(dst_user, similarity), (dst_user, similarity)]}
    return user_similarity, user_to_items

In [36]:
def get_item_similarity(training_data):

    # loading data from dataframe
    # item_to_users dict:
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }
    item_to_users = defaultdict(dict)
    for _, row in training_data.iterrows():
        row = dict(row)
        user = row['reviewerID']
        item = row['asin']
        rating = float(row['overall'])
        item_to_users[item][user] = rating
    # print("data converted")

    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
        for user, rating in rating_users.items():
            user_to_items[user][item] = rating

    # print("data inverted")

    init_sim = lambda: [0, 0, 0]
    factory = lambda: defaultdict(init_sim)
    pre_item_similarity = defaultdict(factory)
    for user, items in user_to_items.items():
        if len(items) > 1:
            for i1, i2 in combinations(items.keys(), 2):
                xy = items[i1] * items[i2]
                xx = items[i1] ** 2
                yy = items[i2] ** 2
                pre_item_similarity[i1][i2][0] += xy
                pre_item_similarity[i1][i2][1] += xx
                pre_item_similarity[i1][i2][2] += yy

                pre_item_similarity[i2][i1][0] += xy
                pre_item_similarity[i2][i1][1] += xx
                pre_item_similarity[i2][i1][2] += yy

    # print("sim data prepared")

    item_similarity = {}
    for src_item in pre_item_similarity:
        item_similarity_order = []
        for dst_item, val in pre_item_similarity[src_item].items():
            xy = val[0]
            xx = val[1]
            yy = val[2]
            div = ((xx*yy) ** 0.5)
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0: # 不關心不夠相似的
                continue
            for i, s in enumerate(item_similarity_order):
                target_similarity = s[1]
                if target_similarity < similarity:
                    item_similarity_order.insert(i, (dst_item, similarity))
                    break
            else:
                item_similarity_order.append((dst_item, similarity))
        item_similarity[src_item] = item_similarity_order
    return item_similarity, user_to_items

In [37]:
import time
import datetime
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic
def get_item_similarity_surprise(training_data, k=10, user_based=False, algo=KNNBasic):
    training_data = training_data[(training_data['DATE'] + datetime.timedelta(days = 365)) > '2018-09-01']

    training_data = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )

    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # compute similarities between items
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)
    return algo_impl

In [38]:
import datetime
from itertools import combinations
from collections import defaultdict
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    # 將訓練資料切分為長期和短期兩種不同區間，長期用來計算相似度，短期用來計算近期熱銷商品
    training_data_long = training_data[(training_data['DATE'] + datetime.timedelta(days = 360)) > '2018-09-01']
    training_data_short = training_data[(training_data['DATE'] + datetime.timedelta(days = 30)) > '2018-09-01']
    user_similarity, user_to_items_user_based = get_user_similarity(training_data_long, remove_obscure_user=True, user_rating_threshold=2)
    item_similarity, user_to_items_item_based = get_item_similarity(training_data_long)
    algo_impl = get_item_similarity_surprise(training_data_long)
    print('user_similarity', user_similarity)
    print('len(user_similarity',len(user_similarity))
    print('user from test in train:',set(users)&set(user_similarity.keys()))
    print('user from test in train:',len(set(users)&set(user_similarity.keys())))
    print('item_similarity', item_similarity)
    print('len(item_similarity)',len(item_similarity))

    # 熱銷商品排行
    grouped = training_data_short.groupby('asin').agg({'overall': 'mean', 'reviewerID': 'count'}).rename(columns={'overall': 'avg_score', 'reviewerID': 'sales_count'}).reset_index()  # 以商品分組計算平均評分&購買數量
    top_selling = grouped.sort_values(by=['sales_count', 'avg_score'], ascending=False, ignore_index=True)
    top_selling_k = top_selling['asin'].head(k).tolist()  # 前k名熱銷商品

    recommendations = {}
    for user in users:
        recommend_items = defaultdict(float)
        user_has_rated = set(training_data_long.loc[training_data_long['reviewerID'] == user]['asin'])  # 改成一年內有出現過評論紀錄的使用者都要被item_base推薦
        # 以user base cf推薦
        if user in user_similarity: # 表示至少有當過一次src_user 若無則表示此人每次都跟其他人相反或是與他人無交集的全新客戶
            sim_users = user_similarity[user]
            for sim_user, sim_score in sim_users:
                items_from_sim_user = sorted(list(user_to_items_user_based[sim_user].items()), key=lambda item: item[1]) #[(item, ratings), (item, ratings)]
                for item, ratings in items_from_sim_user:
                    if item not in user_has_rated:# 因為相似使用者所以需要避免重複商品 同時推薦的商品必須受到相似使用者好評
                        recommend_items[item] += sim_score * (ratings-2.5)/2.5  # 評分低的轉化為扣分 
        # 參考過去的購買紀錄推薦
        for item in user_has_rated:
            recommend_items[item] += 1  # 曾購買過的商品先加1分
            # 以item base cf推薦
            if item in item_similarity:
                for sim_item, sim_score in item_similarity[item]:
                    recommend_items[sim_item] += sim_score  # 相似商品也+分
            # # 以surprise item_base的方法推薦
            iid = algo_impl.trainset.to_inner_iid(item)
            recommend_items_iid = algo_impl.get_neighbors(iid, k)
            for sim_item_iid in recommend_items_iid:
                item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                recommend_items[item_raw_id] += 1 
        # 以熱銷商品推薦
        for hot_item in top_selling_k:
            recommend_items[hot_item] += 1
        recommend_items = [k for k, v in sorted(recommend_items.items(), key=lambda item:item[1], reverse=True) if v >= 1]  # 依照得分排序， 刪除<1分的商品， 轉換為list
        # print(len(recommend_items),recommend_items)
        recommendations[user] = recommend_items[:k]
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
# ratings_by_user

Computing the cosine similarity matrix...
Done computing similarity matrix.
user_similarity {'A1T2B5PFIP9TY1': [('A2RL2YV966PEF8', 1.0), ('ASBJ7GTSJWW25', 1.0), ('A219METCL6ZIVC', 1.0), ('AP3KNPYPC9WSH', 1.0), ('A3K5A511O320W5', 1.0), ('A3GAN4X1TF8TPY', 1.0), ('A4BS0C9AEFLCH', 1.0), ('A3J1HDPAR7U1Q7', 1.0), ('A35Q1O4V2WHKT8', 1.0), ('A3TG67K2M6PKFM', 1.0), ('A3FQWGPAXF0JI9', 1.0), ('AAAAWXZ4P3TS9', 1.0), ('A13NJC7UY7T3ZK', 1.0), ('AEN5OVG2K3A83', 1.0)], 'A2RL2YV966PEF8': [('A1T2B5PFIP9TY1', 1.0), ('ASBJ7GTSJWW25', 1.0), ('A219METCL6ZIVC', 1.0), ('AP3KNPYPC9WSH', 1.0), ('A3K5A511O320W5', 1.0), ('A3GAN4X1TF8TPY', 1.0), ('A4BS0C9AEFLCH', 1.0), ('A3J1HDPAR7U1Q7', 1.0), ('A35Q1O4V2WHKT8', 1.0), ('A3TG67K2M6PKFM', 1.0), ('A3FQWGPAXF0JI9', 1.0), ('AAAAWXZ4P3TS9', 1.0), ('A13NJC7UY7T3ZK', 1.0), ('AO6MLL9BTVD3D', 1.0), ('A1MAI0TUIM3R2X', 1.0), ('A36JO8IHKYBMMJ', 1.0), ('A3B6E2WUEKE9WK', 1.0), ('A1ZGHA3IJV01BK', 1.0), ('A2RQOO8VYAEZZG', 1.0), ('A30APMH1BEHJNI', 1.0), ('A29M09QBG9TZLP', 1.0), ('A

## 結果評估

In [39]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.1576271186440678