# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-02 14:42:32--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-02 14:42:32 (44.5 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-02 14:42:33--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-02 14:42:33 (34.9 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [3]:
import os 
print(os.getcwd())
metadata = getDF('./meta_All_Beauty.json.gz')
ratings = pd.read_csv('./All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

/content


In [4]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [5]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [6]:
import regex as re
from string import digits

def remove_brand(text):
    res = re.sub(brand_pattern, '', text)
    return res

def find_brand(text):
    res = re.match(brand_pattern, text)
    return res.group(0) if res else ''

def extract_price(cell):
    cell = ''.join(c for c in cell if c in digits)
    return int(cell) if len(cell) > 0 else 0
    
def find_sub_cat(cell):
    if len(cell) > 0:
        cell = str(cell).split('in ')
        cell = re.sub(r"((&amp)|&|;|'|(100)|Top|top|See| )*", '', cell[1])
        return cell.replace("\(", '').replace('(', '').replace(')', '').replace(']', '')
    else:
        return 'without_category'

ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')
brand_list = metadata['brand'].value_counts().keys().tolist()
brand_list = list(filter(lambda x:len(x)>2, brand_list))
brand_pattern = re.compile(r"(?:{})".format('|'.join([re.escape(x) for x in brand_list])))
metadata['brand_from_title'] = metadata['title'].apply(find_brand)
metadata['title_without_brand'] = metadata['title'].apply(remove_brand)
metadata['price'] = metadata['price'].apply(extract_price)
metadata['sub_category'] = metadata['rank'].str.split('in ').str[1]
metadata['sub_category'] = metadata['sub_category'].str.replace(r'&amp;', '', regex=True)
metadata['sub_category'] = metadata['sub_category'].str.replace(r'\(', '', regex=True)
metadata['sub_category'] = metadata['sub_category'].str.replace(r'&', '', regex=True)
metadata['text_data'] = metadata.agg(lambda x: f'{x["title"]}, {" ".join(x["description"])}', axis=1)
metadata['text_data'] = metadata['text_data'].apply(remove_brand)

In [7]:
metadata['sub_category'].value_counts()

Beauty  Personal Care        32380
Grocery  Gourmet Food           38
Health  Household               21
Toys  Games                      3
Sports  Outdoors                 3
Clothing, Shoes  Jewelry         2
Baby                             2
Tools  Home Improvement          1
Home  Kitchen                    1
Automotive                       1
Name: sub_category, dtype: int64

In [8]:
metadata.drop(metadata[(metadata['sub_category'] != 'BeautyPersonalCare')].index)
metadata

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,brand_from_title,title_without_brand,sub_category,text_data
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,0,6546546450,[],[],,Loud 'N Clear&trade; Personal Sound Amplifier,Beauty Personal Care,"Loud 'N Clear&trade; Personal Sound Amplifier,..."
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,4499,7178680776,[],[],No7,Lift &amp; Luminate Triple Action Serum 50ml by,Beauty Personal Care,Lift &amp; Luminate Triple Action Serum 50ml ...
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,2876,7250468162,[],[],No7,Stay Perfect Foundation Cool Vanilla by,Beauty Personal Care,"Stay Perfect Foundation Cool Vanilla by , St..."
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,0,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,Wella,Koleston Perfect Hair Colour 44/44 um Intense...,Beauty Personal Care,Koleston Perfect Hair Colour 44/44 um Intense...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,1215,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,Lacto mine Skin Balance Oil control 120 ml. (P...,Beauty Personal Care,Lacto mine Skin Balance Oil control 120 ml. (P...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32887,[],,[],,"Barielle Pro Textured Grip Cuticle Nipper, Purple",[],,,[],"2,145,325 in Beauty & Personal Care (",[],{'ASIN: ': 'B01HIWLLUK'},All Beauty,,,995,B01HIWLLUK,[],[],,"Barielle Pro Textured Grip Cuticle Nipper, Purple",Beauty Personal Care,"Barielle Pro Textured Grip Cuticle Nipper, Pur..."
32888,[],,[],,(Buy 3 Get 1 Free) Salon Perfect Eye Makeup Co...,[],,Salon Perfect,[],"1,639,713 in Beauty & Personal Care (",[],"{'ASIN: ': 'B01HJ1K3YK', 'UPC:': '671635851871'}",All Beauty,,,0,B01HJ1K3YK,[],[],,(Buy 3 Get 1 Free) Eye Makeup Corrector Stick...,Beauty Personal Care,(Buy 3 Get 1 Free) Eye Makeup Corrector Stick...
32889,[],,[],,NOW D-Mannose 500 mg - 120 Veg Capsules (Pack ...,"[B01KON9B4S, B079X3YFXS, B00M79OYS6, B000JN4CR...",,,[],"207,410 in Beauty & Personal Care (","[B01KON9B4S, B000JN4CR0, B071ZHMRHS, B01HJ84TN...","{'Shipping Weight:': '1 pounds (', 'ASIN: ': '...",All Beauty,,,5563,B01HJ84SGM,[],[],NOW,D-Mannose 500 mg - 120 Veg Capsules (Pack of 3),Beauty Personal Care,D-Mannose 500 mg - 120 Veg Capsules (Pack of ...
32890,[],,[Brand new and high quality<br> Enables fast v...,,12 White Feather Shuttlecocks Birdies Badminto...,[],,GBSTORE,[],"965,673 in Beauty & Personal Care (",[],"{'Shipping Weight:': '4.8 ounces (', 'ASIN: ':...",All Beauty,,,1299,B01HJASD20,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,12 White Shuttlecocks Birdies Badminton Train...,Beauty Personal Care,12 White Shuttlecocks Birdies Badminton Train...


## 資料切分

In [9]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 處理文字資料

In [10]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
# 計算商品用標題所表示的 tfidf 矩陣
df = metadata.drop_duplicates('title')
title_without_brand = df['text_data'].to_list()
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser', 'textcat'])
nlp.Defaults.stop_words |= {'amp', 'perfect'}
for idx, sent in enumerate(title_without_brand):
    title_without_brand[idx] = re.sub('[0-9.,-_]*', '', sent)
title_without_brand = list(nlp.pipe(title_without_brand))

In [12]:
title_without_brand_feature = list()
stop_word = nlp.Defaults.stop_words
pos_tag = set(['NOUN', 'PROPN', 'VERB', 'ADJ'])
for idx, sent in enumerate(title_without_brand):
    title = list()
    for word in sent:
        if  (not((word.norm_ in stop_word) or (word.is_punct))) and word.pos_ in pos_tag:
            title.append(word.norm_)
    title_without_brand_feature.append(' '.join(title))
print(len(title_without_brand_feature))

32300


In [13]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(title_without_brand_feature)

In [14]:
len(vectorizer.get_feature_names())



55668

In [15]:
# 計算商品間的相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(df.index,index = df['title'])

# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res


In [16]:
print(df['title'][5:6].to_list())
item_list = recommend_items(df['title'][2:3].to_list(), k=10)
for item in item_list:
    print(df[(df['asin']==item)]['title'].to_list())

['Mary Kay Satin Hands Hand Cream Travel MINI Size Set of 6']
['No7 Stay Perfect Foundation Cool Vanilla by No7']
['Boots No7 Stay Perfect Foundation 30ml - Calico']
['Boots No7 Stay Perfect Amazing Eyes Pencil, Deep Purple 0.04 oz (1 g)']
['Boots No7 Stay Perfect Blemish Cover, Medium Deep, .15 oz']
['No 7 Stay Perfect Lip Stain 2.5g Ruby']
['2 x Rimmel London Stay Matte Liquid Mousse Foundation 30ml - 103 True Ivory']
['(6 Pack) RIMMEL LONDON Stay Matte Liquid Mousse Foundation - Sand']
["Benefit Cosmetics Stay Don't Stray Stay Put Primer Medium/Deep 0.33 FL OZ"]
['EX1 Cosmetics Invisiwear Liquid Foundation Number F300']
["Victoria's Secret Beauty Rush Stay Awhile"]


In [17]:
def recommender(training_data, time, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    target = training_data[
                (training_data['DATE']>=time)&
                (training_data['overall']>=3)]
    cold_start = target['asin'].value_counts().sort_values(ascending=False).keys()[:k]
    user_set = training_data['reviewerID'].unique()
    recommendations = {}
    ratings_trainings = training_data
    count = 0
    for user in users:
        recommend_item = list()
        if user not in user_set:
            recommendations[user] = cold_start
        else:
            buy_item = metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]
            similar_item = recommend_items(buy_item['title'].tolist(), 2*k)
            buy_item_brand = metadata[metadata['brand_from_title'].isin(buy_item['brand_from_title'].tolist())]['asin'].tolist()
            buy_item_brand_2 = metadata[metadata['brand'].isin(buy_item['brand'].tolist())]['asin'].tolist()
            top_brand_item = training_data[((training_data['asin'].isin(buy_item_brand+buy_item_brand_2))&
                                            (training_data['DATE']>=time)&
                                            (training_data['overall']>=3))|
                                           (training_data['asin'].isin(similar_item))
                                          ]
            top_brand_item = top_brand_item['asin'].value_counts().sort_values(ascending=False).keys()[:k]
            recommendations[user] = top_brand_item.tolist()
            count += 1
    return recommendations

## 結果評估

In [18]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score
time_list = []
time_list = ['2018-07-01','2018-05-01', '2018-01-01', '2016-01-01']
for time in time_list:
    ratings_by_user = recommender(ratings_trainings, time, users)
    score = evaluate(ratings_testings_by_user, ratings_by_user)
    print(f'{time} score: {score}')

2018-07-01 score: 0.13559322033898305
2018-05-01 score: 0.11016949152542373
2018-01-01 score: 0.09661016949152543
2016-01-01 score: 0.08135593220338982
