<img src="http://www.codeheroku.com/static/blog/images/pid14_rs_diff.png">

使用scikit-learn library進行基於內容的推薦系統。

Suppose, you are given the following two texts:

Text A: London Paris London

Text B: Paris Paris London

How would you find the similarity between Text A and Text B?

1. Text A: Contains the word “London” 2 times and the word “Paris” 1 time.
2. Text B: Contains the word “London” 1 time and the word “Paris” 2 times.
<img src="http://www.codeheroku.com/static/blog/images/pid14_find_cos_theta.png">

In [1]:
text = ["London Paris London","Paris Paris London"]

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(text)

print(cv.get_feature_names())
print(count_matrix.toarray())

['london', 'paris']
[[2 1]
 [1 2]]


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(count_matrix)
print(similarity_scores)

[[1.  0.8]
 [0.8 1. ]]


=====================================================

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None 

In [4]:
# Step 1:Read CSV File
data = pd.read_csv('./data_file/product.csv')
data.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [5]:
df = data[:10000]
df['index'] = pd.Series(range(1,10001) )
df['PRODUCT_ID'] = 'P' + (df['PRODUCT_ID'].astype(str))

In [6]:
df

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT,index
0,P25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1
1,P26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,2
2,P26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,3
3,P26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,4
4,P26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,5
...,...,...,...,...,...,...,...,...
9995,P855165,1411,DRUG GM,National,HOSIERY/SOCKS,NO-NONSENSE,,9996
9996,P855179,2390,SPIRITS,National,LIQUOR,BOURBON/TN WHISKEY,750ML,9997
9997,P855192,1251,GROCERY,National,SOUP,RTS SOUP: CHUNKY/HOMESTYLE ET,18.8 OZ,9998
9998,P855202,794,GROCERY,National,COLD CEREAL,ADULT CEREAL,17.7 OZ,9999


In [7]:
# Step 2: Select Features
# 選擇產品特徵
features = ['DEPARTMENT','BRAND','COMMODITY_DESC','SUB_COMMODITY_DESC']

In [8]:
# Step 3: Create a column in DF which combines all selected features
def combine_features(row):
    return row['DEPARTMENT']+" "+row['BRAND']+" "+row['COMMODITY_DESC']+" "+row['SUB_COMMODITY_DESC']

for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string

df["combined_features"] = df.apply(combine_features,axis=1) #applying combined_features() method over each rows of dataframe and storing the combined string in "combined_features" column


In [9]:
# Step 4: Create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])

In [10]:
print(cv.get_feature_names()[:100])
print(count_matrix.toarray())
print(len(count_matrix.toarray()))

['10', '100', '12', '15', '15lbs', '15pk', '16', '18', '18in', '20pk', '24pk', '42', '42p', '4pk', '50', '5lt', '6pk', '750ml', '8pc', '90', '99', 'abrasives', 'acc', 'access', 'accessiores', 'accessories', 'accss', 'acids', 'acne', 'action', 'activewear', 'activity', 'add', 'added', 'additi', 'additives', 'adhesives', 'adidas', 'adult', 'adults', 'aerosol', 'aerosols', 'aftersun', 'age', 'aid', 'aids', 'air', 'albums', 'alcoholic', 'ale', 'ales', 'alkaline', 'all', 'allergy', 'allieds', 'almay', 'alpo', 'alternative', 'alternatives', 'aluminum', 'american', 'ammonia', 'analgesics', 'and', 'angel', 'animal', 'anjou', 'annuals', 'ant', 'antacids', 'anti', 'antibiotic', 'antiperspirants', 'apparel', 'appetizers', 'apple', 'apples', 'appliances', 'applicators', 'aquarium', 'area', 'aseptic', 'asian', 'asparagus', 'aspic', 'audio', 'australian', 'authentic', 'auto', 'automatic', 'automobile', 'automotive', 'aviation', 'avocado', 'ba', 'baby', 'babyfood', 'back', 'bacon', 'bag']
[[0 0 0 ...

In [12]:
# Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)

In [13]:
print(cosine_sim)

[[1.         0.09245003 0.         ... 0.21081851 0.23570226 0.        ]
 [0.09245003 1.         0.         ... 0.0877058  0.09805807 0.0836242 ]
 [0.         0.         1.         ... 0.         0.         0.10660036]
 ...
 [0.21081851 0.0877058  0.         ... 1.         0.2236068  0.        ]
 [0.23570226 0.09805807 0.         ... 0.2236068  1.         0.        ]
 [0.         0.0836242  0.10660036 ... 0.         0.         1.        ]]


In [14]:
def get_title_from_index(index):
    return df[df.index == index]["PRODUCT_ID"].values[0]
def get_index_from_title(title):
    return df[df.PRODUCT_ID == title]["index"].values[0]

In [15]:
# Step 6: Get index of this product from its title
product_user_likes = "P25671"
product_index = get_index_from_title(product_user_likes)
sim_products = list(enumerate(cosine_sim[product_index])) 

In [16]:
sim_products[:5]

[(0, 0.09245003270420485), (1, 1.0), (2, 0.0), (3, 0.0), (4, 0.0)]

In [17]:
# Step 7: Get a list of similar product in descending order of similarity score
sorted_sim_products = sorted(sim_products,key=lambda x:x[1],reverse=True)[1:]


In [18]:
# 只抓前10個
i=0
print("Top 10 similar products to "+product_user_likes+" are:\n")
for product in sorted_sim_products:
    print(get_title_from_index(product[0]))
    i=i+1
    if i>10:
        break

Top 10 similar products to P25671 are:

P821675
P821773
P822647
P824919
P828324
P833349
P834547
P837401
P840426
P840851
P842544
