In [82]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
all_beauty_path = 'data/All_Beauty.csv'
# all_beauty_path = 'http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv'
ratings = pd.read_csv(
    all_beauty_path,
    names=["asin", "reviewerID", "overall", "unixReviewTime"],
    dtype={"asin": np.str_, "reviewerID": np.str_, "overall": np.float32, "unixReviewTime": np.float64 }
)
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,143026860,A1V6B6TNIC10QE,1.0,1424304000.0,2015-02-19
1,143026860,A2F5GHSXFQ0W6J,4.0,1418861000.0,2014-12-18
2,143026860,A1572GUYS7DGSR,4.0,1407629000.0,2014-08-10
3,143026860,A1PSGLFK1NSVO,5.0,1362960000.0,2013-03-11
4,143026860,A6IKXKZMTKGSC,5.0,1324771000.0,2011-12-25


In [30]:
meta_all_beauty_path = 'data/meta_All_Beauty.json.gz'
# meta_all_beauty_path = 'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz'
metadata = pd.read_json(
    meta_all_beauty_path,
    lines=True,
    compression="gzip"
)
metadata.head()
print(metadata.columns)

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'],
      dtype='object')


# 資料切分

In [111]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') &
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())


# EDA

In [54]:
# tech1
metadata[metadata['tech1'].str.len() > 0].head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
1088,[],"class=""a-keyvalue prodDetTable"" role=""present...",[This is the No.1 Precision Knife with 15 Repl...,,No.1 Knife w/No.11 Blade Carded : 15pc Dispenser,"[B0025189VE, B0015AOIYI, B000093IJ1, B0027ROZ9...",,X-Acto,"[Type A, all aluminum handle designed for in-t...","[>#349,378 in Beauty & Personal Care (See top ...","[B005KRSWM6, B01MUUAVDG, B07FYSW47C, B00IR9I1V...",{},All Beauty,,"December 17, 2004",,B000BRJD32,[],[]
3551,[],"class=""a-keyvalue prodDetTable"" role=""present...",[Tuck a Fragrant Drawer Sachet between your sh...,,Camille Beckman Drawer Sachet - Camille,[],,Camille Beckman,[Drawer Sachet],"[>#2,569,884 in Beauty & Personal Care (See to...",[],{},All Beauty,,"January 2, 2009",,B001OW7QI2,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
12130,[],"class=""a-keyvalue prodDetTable"" role=""present...",[This Large Pouch gives you all the storage sp...,,Large Pouch - Full Grain Leather - Black Onyx ...,[],,Leatherology,"[Full Grain Leather, 8.5""H x 12""W x 0.5""D, 2 i...","[>#527,492 in Beauty & Personal Care (See top ...","[B07MHJXZQ2, B07BLTLKGK, B009Z7DXHQ, B01MS4YRT...",{},All Beauty,,"August 31, 2012",$85.00,B00I4R3X96,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
13579,[],"class=""a-keyvalue prodDetTable"" role=""present...",[Perfect decor on formal and casual apparels t...,,Jovana Retro Big Eye Owl Pendant Design Neckla...,[],,JOVANA,"[Material: Alloy+Rhinestone, Pendant Dimension...","[>#188,112 in Beauty & Personal Care (See top ...","[B00AHOS6WS, B07GZPLLVR, B00N9X4XNU, B072PQ6Y4...",{},All Beauty,,"April 24, 2014",$2.30,B00JYGWWIO,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
19087,[],"class=""a-keyvalue prodDetTable"" role=""present...",[This items has an integrated lined fleece int...,,Touch Screen Windproof Waterproof Thermal Glov...,[],,B,[[Finally a glove with WARMTH & PERFORMANCE] M...,"[>#816,754 in Beauty & Personal Care (See top ...","[B077S9DNDT, B0786CKZRT, B07JVJHJ8H, B079Q4HYM...",{},All Beauty,,"August 29, 2017",$8.99,B00T79WJXG,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [55]:
# fit: 沒有 fit 資料
metadata[metadata['fit'].str.len() > 0].head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes


In [56]:
# similar_item
metadata[metadata['similar_item'].str.len() > 0].head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
89,[],,[],,Dual Jet Bath Spa,[],,Conair,[],"1,141,113 in Beauty &amp; Personal Care (","[B005EO2EO2, B00005A44E, B005NWFA5K, B01B0O4KJ...","{'Shipping Weight:': '5.8 pounds (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$144.18,B000067EE4,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
306,[],,[],,Prestige Classic Lipstick PL-37A Baja,[],,,[],"1,851,720 in Beauty &amp; Personal Care (",[],"{'ASIN: ': 'B00021W0IC', 'UPC:': '795827190538...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$12.95,B00021W0IC,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
308,[],,[],,Prestige Classic Lipstick PL-51A Mojave,[],,,[],"1,215,577 in Beauty &amp; Personal Care (",[],"{'Shipping Weight:': '0.6 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$4.00,B00021W0VY,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
313,[],,[],,Prestige Classic Lipstick PL-05A Pebble,[],,Prestige Cosmetics,[],"114,910 in Beauty & Personal Care (",[],"{'  Item Weight: ': '0.64 ounces', 'Sh...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$3.50,B00021XGB2,[],[]


In [57]:
# also_view
metadata[['also_view', 'details']].head()

Unnamed: 0,also_view,details
0,[],{'ASIN: ': '6546546450'}
1,[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':..."
2,"[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':..."
3,[],"{'  Item Weight: ': '1.76 ounces', 'Sh..."
4,"[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ..."


# 內容過濾的推薦算法
## 1. Item representation
計算商品用 **「標題」** 所表示的 tfidf 矩陣

In [64]:
df = metadata.drop_duplicates('title')
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(df['title'])

<32300x29623 sparse matrix of type '<class 'numpy.float64'>'
	with 352299 stored elements in Compressed Sparse Row format>

In [80]:
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [81]:
tf.get_feature_names_out()

array(['00', '000', '0000', ..., 'zyliss', 'zytaze', 'zzzrt'],
      dtype=object)

## 2. Profiling Learning
如何計算使用者的購買/偏好行為
### 2-1 我們計算商品間的相似程度
其實就是計算任兩筆商品之間的 cosine_similarity：

In [83]:
similarity_matrix = cosine_similarity(tfidf_matrix)

In [84]:
similarity_matrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.29881616, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.29881616, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

### 2-2 我們計算商品間的相似程度
找出每個商品回傳 k 個最相近的商品，利用剛剛的 similarity_matrix 對每個商品各自排序即可。

In [87]:
# title -> 商品index 流水號
mapping = pd.Series(df.index, index = df['title'])
mapping

title
Loud 'N Clear&trade; Personal Sound Amplifier                                                                                                                      0
No7 Lift &amp; Luminate Triple Action Serum 50ml by Boots                                                                                                          1
No7 Stay Perfect Foundation Cool Vanilla by No7                                                                                                                    2
Wella Koleston Perfect Hair Colour 44/44 Medium Intense Red Brown 60ml                                                                                             3
Lacto Calamine Skin Balance Oil control 120 ml. (Pack of 2)                                                                                                        4
                                                                                                                                                               ...  
Bari

In [109]:
def recommend_item(item_input, k=2):
    """
    :param item_input: string, 商品名
    :param k: 取出相似個數
    :return: 回傳 list
    """
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []


## 3. Recommendation Generation
這裡的想法是根據每一個使用者所購買的商品，分別把跟這些商品相似的商品加總起來就是最終的推薦。
在這裡的「加總」就是全部考慮，實務上的加總可能會採用 KNN 的方法在進行排序。

In [110]:
def recommend_items(items, k):
    """
    :param items: item title list
    :param k:
    :return:
    """
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

## 開始推薦

In [112]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    ratings_trainings = training_data
    recommendations = {user: recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title'].tolist(), k) for user in users}
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': ['B00AU92V44',
  'B014HILES2',
  'B01DPZJ6JM',
  'B01A6O5KUY',
  'B01CL3IXJI',
  'B008XNX92C',
  'B018OLOHVY',
  'B018OSUOC8',
  'B000QPXD26',
  'B00DG8Q9KS'],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': ['B016Y5KOAA',
  'B016Y

## 結果評估

In [113]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.0