# Cosine Similarity with Fashion Dataset

Citation for dataset

Justifying recommendations using distantly-labeled reviews and fined-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019

In [1]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

### Import product metadata

In [43]:
for test_size_ in range(10, 70, 5):
    test_size = test_size_ / 100
    df_train, df_test = train_test_split(df_reviews, test_size=test_size, shuffle=False)
    products_to_test = list(set(df_train['asin'].values) & set(df_test['asin'].values))
    print("Test size: ", test_size, "  || No. of products to test: ", len(products_to_test))

Test size:  0.1   || No. of products to test:  8
Test size:  0.15   || No. of products to test:  8
Test size:  0.2   || No. of products to test:  8
Test size:  0.25   || No. of products to test:  8
Test size:  0.3   || No. of products to test:  7
Test size:  0.35   || No. of products to test:  7
Test size:  0.4   || No. of products to test:  6
Test size:  0.45   || No. of products to test:  6
Test size:  0.5   || No. of products to test:  5
Test size:  0.55   || No. of products to test:  5
Test size:  0.6   || No. of products to test:  5
Test size:  0.65   || No. of products to test:  4


In [3]:
test_size = 0.25

In [4]:
data = []
with gzip.open('meta_AMAZON_FASHION.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))

meta_df = pd.DataFrame.from_dict(data)
meta_df

Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,0764443682,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,XCC Qi promise new spider snake preparing men'...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,[For the professional or amateur magician. Ro...,,,,,,,
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186632,JT Women's Elegant Off Shoulder Chiffon Maxi L...,JT,,"9,835,890inClothing,ShoesJewelry(",5 star,B01HJGXL4O,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
186633,Microcosm Retro Vintage Black Crochet Lace One...,Microcosm,[Package Dimensions:\n \n7....,"11,390,771inClothing,ShoesJewelry(",5 star5 star (0%),B01HJHF97K,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
186634,Lookatool Classic Plain Vintage Army Military ...,Lookatool,"[Cotton+Polyester, Imported, Item type:Basebal...","972,275inClothing,ShoesJewelry(",5 star,B01HJGJ9LS,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$8.53,"[B00XLECZMS, B0018MQAOY, B00N833I4Q, B074DQSPP...","[B07BHQ1FXL, B00XLECZMS, B07CJWM5WY, B07CS97C1...","class=""a-normal a-align-center a-spacing-smal...",,,
186635,Edith Windsor Women's Deep V-neck Beaded Sequi...,Edith Windsor,[Product Dimensions:\n \n9....,"1,964,585inClothing,ShoesJewelry(",5 star,B01HJHTH5U,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,[B077ZLGMJ3],,,,


In [5]:
meta_df = meta_df[['title', 'brand', 'feature', 'asin', 'description', 'also_view', 'also_buy', 'similar_item']]

In [6]:
# drop NaN values in title before processing
meta_df.dropna(subset=['title'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df.dropna(subset=['title'], inplace=True)


In [7]:
# Functions for calculating top 15 similar products

def get_title_from_index(index):
    return df_sample[df_sample.index == index]["title"].values

### Import reviews data

In [8]:
data = []
with gzip.open('AMAZON_FASHION_5 (1).json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))

df_reviews = pd.DataFrame.from_dict(data)

In [9]:
df_reviews
df_reviews.sort_values("reviewTime")
df_reviews

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
1,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Tonya B.,Great product and price!,Five Stars,1441324800,,
2,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Tonya B.,Great product and price!,Five Stars,1441324800,,
3,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue (37867...",Tonya B.,Great product and price!,Five Stars,1441324800,,
4,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3171,5.0,True,"07 2, 2018",A2077NII5H62R2,B005AGO4LU,"{'Size:': ' 8.5 B(M) US', 'Color:': ' Green Gl...",Amazon Customer,Perfect fit!,Five Stars,1530489600,,
3172,5.0,True,"06 28, 2018",A2IBS6PIPAGAB5,B005AGO4LU,"{'Size:': ' 5 B(M) US', 'Color:': ' Wolf Grey/...",J. Avila,My favorite cross trainers!,Comfortable,1530144000,,
3173,5.0,True,"06 25, 2018",A1GTC5EVSJNCQ8,B005AGO4LU,"{'Size:': ' 8 B(M) US', 'Color:': ' Blue Tint/...",Amazon Customer,Love them fit perfect,Five Stars,1529884800,,
3174,5.0,True,"06 20, 2018",A311XHHLM12MUT,B005AGO4LU,"{'Size:': ' 9 B(M) US', 'Color:': ' Blue Tint/...",Peter,Favorite Nike shoe ever! The flex sole is exce...,Love them!,1529452800,,


In [10]:
df_train, df_test = train_test_split(df_reviews, test_size=test_size, shuffle=False)

In [11]:
df_reviews_train = df_train[['reviewerID', 'asin']]

### Some titles to be cleaned

In [12]:
boo = meta_df[meta_df['title'].str.contains('var aPageStart')]
boo

Unnamed: 0,title,brand,feature,asin,description,also_view,also_buy,similar_item
4248,var aPageStart = (new Date()).getTime();\nvar ...,,[Package Dimensions:\n \n3....,B0013HNSPS,,,,
4253,var aPageStart = (new Date()).getTime();\nvar ...,UjENA,,B0013NWB3M,,,,
4256,var aPageStart = (new Date()).getTime();\nvar ...,,,B0013PY4MQ,[Excellent Quality Disney Tinkerbell Necklace!...,,,
4257,var aPageStart = (new Date()).getTime();\nvar ...,,,B0013QVHWK,[Tassel Dangle Drop Earrings Jewelry Set of 6 ...,,,
4259,var aPageStart = (new Date()).getTime();\nvar ...,K-Swiss,,B0013T7HSK,,,,
...,...,...,...,...,...,...,...,...
6797,var aPageStart = (new Date()).getTime();\nvar ...,,[Product Dimensions:\n \n0....,B002WRMUSE,,,,
20893,var aPageStart = (new Date()).getTime();\nvar ...,,,B00B975LHK,,,,
23658,var aPageStart = (new Date()).getTime();\nvar ...,,,B00CII1JP2,,,,
23659,var aPageStart = (new Date()).getTime();\nvar ...,,,B00CII1JPM,,,,


In [13]:
# for cleanup
def getNameFromCode(hi):
    splitty = hi.split("\n//-->\n\n\n\n\n\n\n")

    try:
        splitty2 = splitty[1].split("\n")
        title = splitty2[0]
        if splitty2[0].startswith('Amazon.com'):
            title = splitty2[0].split('Amazon.com')[1]

    except IndexError:
        return None
    
    return title.strip()

In [14]:
# we leave in the colons because we will strip punctuation later
def cleanup():
    titles = []
    for i in boo['title']:
        titles.append(getNameFromCode(i))
    return titles

j = cleanup()

In [15]:
boo['title'] = j

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boo['title'] = j


In [16]:
meta_df.loc[meta_df['title'].str.contains('var aPageStart'), 'title'] = j

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


### Joining columns to get overall_info

In [17]:
meta_df["overall_info"] = meta_df["title"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df["overall_info"] = meta_df["title"]


In [18]:
meta_df

Unnamed: 0,title,brand,feature,asin,description,also_view,also_buy,similar_item,overall_info
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,0764443682,,,,,Slime Time Fall Fest [With CDROM and Collector...
1,XCC Qi promise new spider snake preparing men'...,,,1291691480,,,,,XCC Qi promise new spider snake preparing men'...
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,1940280001,[For the professional or amateur magician. Ro...,,,,Magical Things I Really Do Do Too!
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,1940735033,,,,,"Ashes to Ashes, Oranges to Oranges"
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,1940967805,,,,,Aether & Empire #1 - 2016 First Printing Comic...
...,...,...,...,...,...,...,...,...,...
186632,JT Women's Elegant Off Shoulder Chiffon Maxi L...,JT,,B01HJGXL4O,,,,,JT Women's Elegant Off Shoulder Chiffon Maxi L...
186633,Microcosm Retro Vintage Black Crochet Lace One...,Microcosm,[Package Dimensions:\n \n7....,B01HJHF97K,,,,,Microcosm Retro Vintage Black Crochet Lace One...
186634,Lookatool Classic Plain Vintage Army Military ...,Lookatool,"[Cotton+Polyester, Imported, Item type:Basebal...",B01HJGJ9LS,,"[B00XLECZMS, B0018MQAOY, B00N833I4Q, B074DQSPP...","[B07BHQ1FXL, B00XLECZMS, B07CJWM5WY, B07CS97C1...",,Lookatool Classic Plain Vintage Army Military ...
186635,Edith Windsor Women's Deep V-neck Beaded Sequi...,Edith Windsor,[Product Dimensions:\n \n9....,B01HJHTH5U,,,[B077ZLGMJ3],,Edith Windsor Women's Deep V-neck Beaded Sequi...


### Text preprocessing

In [19]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
def text_preprocessing(column):
    #make all words with lower letters
    column = column.str.lower()
    #getting rid of any punctution
    # column = column.str.replace('http\S+|www.\S+|@|%|:|,|', '', case=False)
    # column = column.str.replace('http\S+|www.\S+|@|%|:|,|&|;|~|`|!|\.|-|\+|/|\(|\)|\*|#|_|=|\'|\"|\?|\[|\]|\||\<|\>|', ' ', case=False)
    column = column.str.replace('http\S+|www.\S+', '', case=False)
    column = column.str.replace('[^\w\d\s]+', ' ', case=False)
    #spliting each sentence to words to apply previous funtions on them 
    word_tokens = column.str.split()
    keywords = word_tokens.apply(lambda x: [item for item in x if item not in stop])
    #assemble words of each sentence again and assign them in new column

    for words in keywords:
        if words is None:
            print(words)
            words = ''

    for i in range(len(keywords)):
        if keywords.get(i) is not None:
            keywords[i] = " ".join(keywords.get(i))
        else: 
            keywords[i] = ""
        column = keywords
    return column

In [21]:
# drop NaN values in title before processing
meta_df.dropna(subset=['title'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df.dropna(subset=['title'], inplace=True)


In [22]:
 meta_df['cleaned_infos'] = text_preprocessing(meta_df['overall_info'])

  column = column.str.replace('http\S+|www.\S+', '', case=False)
  column = column.str.replace('[^\w\d\s]+', ' ', case=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df['cleaned_infos'] = text_preprocessing(meta_df['overall_info'])


### Joining the Reviews and Metadata Tables

In [23]:
df_train = df_train[['reviewerID', 'asin']]
df_train

Unnamed: 0,reviewerID,asin
0,ALJ66O1Y6SLHA,B000K2PJ4K
1,ALJ66O1Y6SLHA,B000K2PJ4K
2,ALJ66O1Y6SLHA,B000K2PJ4K
3,ALJ66O1Y6SLHA,B000K2PJ4K
4,ALJ66O1Y6SLHA,B000K2PJ4K
...,...,...
2377,A2IBS6PIPAGAB5,B010RRWKT4
2378,A1GTC5EVSJNCQ8,B010RRWKT4
2379,A311XHHLM12MUT,B010RRWKT4
2380,A135SGOQMVWABQ,B010RRWKT4


In [24]:
# Map asin to product titles
k = []
asin_title_dict = {}
for asin in df_train.asin:
    if asin not in asin_title_dict:
        asin_title_dict[asin] = meta_df[meta_df.asin == asin]['title'].values[0]
        print(asin_title_dict)
        
    k.append(asin_title_dict[asin])

k
df_train["cleaned_infos"] = k

{'B000K2PJ4K': "Calvin Klein Boy's Assorted Boxer Briefs (Pack of 2)"}
{'B000K2PJ4K': "Calvin Klein Boy's Assorted Boxer Briefs (Pack of 2)", 'B000KPIHQ4': 'Powerstep Pinnacle Orthotic Shoe Insoles'}
{'B000K2PJ4K': "Calvin Klein Boy's Assorted Boxer Briefs (Pack of 2)", 'B000KPIHQ4': 'Powerstep Pinnacle Orthotic Shoe Insoles', 'B000V0IBDM': 'Powerstep Pinnacle Orthotic Shoe Insoles'}
{'B000K2PJ4K': "Calvin Klein Boy's Assorted Boxer Briefs (Pack of 2)", 'B000KPIHQ4': 'Powerstep Pinnacle Orthotic Shoe Insoles', 'B000V0IBDM': 'Powerstep Pinnacle Orthotic Shoe Insoles', 'B000YFSR5G': 'Hanes Mens EcoSmart Fleece Sweatpant'}
{'B000K2PJ4K': "Calvin Klein Boy's Assorted Boxer Briefs (Pack of 2)", 'B000KPIHQ4': 'Powerstep Pinnacle Orthotic Shoe Insoles', 'B000V0IBDM': 'Powerstep Pinnacle Orthotic Shoe Insoles', 'B000YFSR5G': 'Hanes Mens EcoSmart Fleece Sweatpant', 'B000YFSR4W': 'Hanes Mens EcoSmart Fleece Sweatpant'}
{'B000K2PJ4K': "Calvin Klein Boy's Assorted Boxer Briefs (Pack of 2)", 'B000K

In [25]:
df_train

Unnamed: 0,reviewerID,asin,cleaned_infos
0,ALJ66O1Y6SLHA,B000K2PJ4K,Calvin Klein Boy's Assorted Boxer Briefs (Pack...
1,ALJ66O1Y6SLHA,B000K2PJ4K,Calvin Klein Boy's Assorted Boxer Briefs (Pack...
2,ALJ66O1Y6SLHA,B000K2PJ4K,Calvin Klein Boy's Assorted Boxer Briefs (Pack...
3,ALJ66O1Y6SLHA,B000K2PJ4K,Calvin Klein Boy's Assorted Boxer Briefs (Pack...
4,ALJ66O1Y6SLHA,B000K2PJ4K,Calvin Klein Boy's Assorted Boxer Briefs (Pack...
...,...,...,...
2377,A2IBS6PIPAGAB5,B010RRWKT4,NIKE Women's Flex Supreme TR 4 Cross Trainer
2378,A1GTC5EVSJNCQ8,B010RRWKT4,NIKE Women's Flex Supreme TR 4 Cross Trainer
2379,A311XHHLM12MUT,B010RRWKT4,NIKE Women's Flex Supreme TR 4 Cross Trainer
2380,A135SGOQMVWABQ,B010RRWKT4,NIKE Women's Flex Supreme TR 4 Cross Trainer


### Run from here with saved data

In [26]:
# df_reviews_info.to_csv('Amazon Fashion Reviews Titles.csv')

In [27]:
df_train.to_csv('Amazon Fashion Reviews Titles Train.csv')

In [28]:
df_reviews_info = pd.read_csv('Amazon Fashion Reviews Titles Train.csv')

### Group by ReviewerID

We use this to build a vector for each unique reviewer

In [29]:
df_grouped = df_reviews_info.groupby('reviewerID')['cleaned_infos'].apply(' '. join).reset_index()

In [30]:
df_grouped

Unnamed: 0,reviewerID,cleaned_infos
0,A10RXRZE0TAKPU,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
1,A10WPKF2VH1JWY,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
2,A10X6EMVZGVX9E,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
3,A10Z8PKPUJ7YHO,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
4,A11D8CLU9H24YK,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
...,...,...
401,AZ3KLARHNWMNZ,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
402,AZ7LUSY20ZFSC,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
403,AZ7P3NH09B66S,NIKE Women's Flex Supreme TR 4 Cross Trainer N...
404,AZRZ2FB7CFNOE,NIKE Women's Flex Supreme TR 4 Cross Trainer N...


In [31]:
df_grouped['reviewerID'].nunique

<bound method IndexOpsMixin.nunique of 0      A10RXRZE0TAKPU
1      A10WPKF2VH1JWY
2      A10X6EMVZGVX9E
3      A10Z8PKPUJ7YHO
4      A11D8CLU9H24YK
            ...      
401     AZ3KLARHNWMNZ
402     AZ7LUSY20ZFSC
403     AZ7P3NH09B66S
404     AZRZ2FB7CFNOE
405     AZYHPRWLMSY9O
Name: reviewerID, Length: 406, dtype: object>

### Calculate Cosine Similarity

In [32]:
CV = CountVectorizer()
vectorised = CV.fit_transform(df_grouped['cleaned_infos'])
vectorised

<406x132 sparse matrix of type '<class 'numpy.int64'>'
	with 3325 stored elements in Compressed Sparse Row format>

In [33]:
cosine_sim = cosine_similarity(vectorised)
cosine_sim

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

### Functions for Recommending

In [34]:
def get_user_from_index(index, df_reviews_info):
    return df_reviews_info[df_reviews_info.index == index]["reviewerID"].values

def get_similar_users(index, cosine_sim, df_reviews_info):
    # print(df_reviews_info.iloc[index])
    #print()
    #print("Top 15: ")
    similar = list(enumerate(cosine_sim[index]))
    sorted_similar = sorted(similar, key = lambda x:x[1], reverse = True)
    # print(sorted_similar)
    i=0
    user_score = []
    for user in sorted_similar:
        user_score.append([get_user_from_index(user[0], df_grouped)[0], user[1]])
        # user_score.append(get_user_from_index(user[0], df_grouped)[0])
        i = i+1
        if i>15:
            break

    return user_score

def remove_duplicates(input_list):
    seen = {}
    result = []

    for sublist in input_list:
        key = sublist[0]

        if key not in seen:
            seen[key] = True
            result.append(sublist)

    return result

In [35]:
def recommend_users_for_product(product_asin, cosine_sim, df_reviews_info, df_grouped):
    users_who_bought = df_reviews_info.loc[df_reviews_info['asin'] == product_asin]['reviewerID'].values
    all_users = []
    for user in users_who_bought:
        # get user index
        user_idx = df_grouped.index[df_grouped['reviewerID'] == user]
        similar_users_list = get_similar_users(user_idx[0], cosine_sim, df_grouped)
        for sim_user in similar_users_list:
            all_users.append(sim_user)
    
    sorted_users = sorted(all_users, key=lambda x:x[1], reverse=True)
    result = remove_duplicates(sorted_users)
    return [x[0] for x in result[:15]]
    
    

### Testing for accuracy

In [36]:
df_test

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
2382,5.0,True,"05 22, 2018",A1T708NWV60HSO,B010RRWKT4,"{'Size:': ' 6 M US', 'Color:': ' Blue Tint/Gre...",Beatrice Garcia,Super comfortable and fit my small feet perfec...,LOVE my new sneakers!,1526947200,2,
2383,5.0,True,"05 16, 2018",ASECGGNQIT2RN,B010RRWKT4,"{'Size:': ' 10 M US', 'Color:': ' Black/White/...",Debra DeGrand,Love these shoes!,Very confortable!,1526428800,,
2384,5.0,True,"05 16, 2018",A39LISAY8SNHM,B010RRWKT4,"{'Size:': ' 10.5 M US', 'Color:': ' Black/Whit...",k,Comfortable,Five Stars,1526428800,,
2385,3.0,True,"05 15, 2018",A1A8RABZPTD7HL,B010RRWKT4,"{'Size:': ' 8.5 M US', 'Color:': ' Blue Tint/G...",Amazon Customer,Fit fine...did not like color in person,Three Stars,1526342400,,
2386,3.0,True,"05 15, 2018",A2BM6XH033ZQ3I,B010RRWKT4,"{'Size:': ' 9 M US', 'Color:': ' Blue Tint/Gre...",amazonuser,The shoe is too large. When you do lunges it h...,somewhat large fit,1526342400,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3171,5.0,True,"07 2, 2018",A2077NII5H62R2,B005AGO4LU,"{'Size:': ' 8.5 B(M) US', 'Color:': ' Green Gl...",Amazon Customer,Perfect fit!,Five Stars,1530489600,,
3172,5.0,True,"06 28, 2018",A2IBS6PIPAGAB5,B005AGO4LU,"{'Size:': ' 5 B(M) US', 'Color:': ' Wolf Grey/...",J. Avila,My favorite cross trainers!,Comfortable,1530144000,,
3173,5.0,True,"06 25, 2018",A1GTC5EVSJNCQ8,B005AGO4LU,"{'Size:': ' 8 B(M) US', 'Color:': ' Blue Tint/...",Amazon Customer,Love them fit perfect,Five Stars,1529884800,,
3174,5.0,True,"06 20, 2018",A311XHHLM12MUT,B005AGO4LU,"{'Size:': ' 9 B(M) US', 'Color:': ' Blue Tint/...",Peter,Favorite Nike shoe ever! The flex sole is exce...,Love them!,1529452800,,


In [37]:
df_test = df_test[['reviewerID', 'asin']]
df_test

Unnamed: 0,reviewerID,asin
2382,A1T708NWV60HSO,B010RRWKT4
2383,ASECGGNQIT2RN,B010RRWKT4
2384,A39LISAY8SNHM,B010RRWKT4
2385,A1A8RABZPTD7HL,B010RRWKT4
2386,A2BM6XH033ZQ3I,B010RRWKT4
...,...,...
3171,A2077NII5H62R2,B005AGO4LU
3172,A2IBS6PIPAGAB5,B005AGO4LU
3173,A1GTC5EVSJNCQ8,B005AGO4LU
3174,A311XHHLM12MUT,B005AGO4LU


We test using only products that appear in both the test and train set

In [38]:
products_to_test = list(set(df_train['asin'].values) & set(df_test['asin'].values))
products_to_test

['B001LNSY2Q',
 'B001IKJOLW',
 'B0058YEJ5K',
 'B0014F7B98',
 'B0092UF54A',
 'B009MA34NY',
 'B005AGO4LU',
 'B010RRWKT4']

In [39]:
accs = []
for product_asin in products_to_test:
    print("product: ", product_asin)
    rec_users = recommend_users_for_product(product_asin, cosine_sim, df_train, df_grouped)
    actual_users = set(df_test[df_test.asin == product_asin]["reviewerID"].values)
    matches = 0
    for rec_user in rec_users:
        if rec_user in actual_users:
            matches += 1
    acc = matches / len(rec_users)
    print("acc: ", acc)
    accs.append(acc)

avg_acc = sum(accs) / len(accs)
print("avg acc: ", avg_acc)

product:  B001LNSY2Q
acc:  0.13333333333333333
product:  B001IKJOLW
acc:  0.0
product:  B0058YEJ5K
acc:  0.0
product:  B0014F7B98
acc:  0.0
product:  B0092UF54A
acc:  0.0
product:  B009MA34NY
acc:  0.0
product:  B005AGO4LU
acc:  0.0
product:  B010RRWKT4
acc:  0.6666666666666666
avg acc:  0.09999999999999999


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2ca87312-dd22-4033-aec7-588defb6d391' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>