In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.linear_model import LinearRegression

# Collaborative Filtering

In [2]:
tea_df = pd.read_csv('data/tea_data.csv')
tea_df['Tea ID'] = tea_df['Tea Name'] + tea_df['Brand']
tea_df['Tea ID'] = tea_df['Tea ID'].astype('category').cat.codes

user_item_matrix = tea_df.pivot_table(
    index='Account Id',
    columns='Tea ID',
    values='Total Rating')
user_item_matrix_filled = user_item_matrix.fillna(0)
print(user_item_matrix_filled.shape)
user_item_matrix_filled.head()

(49, 3547)


Tea ID,0,1,2,3,4,5,6,7,8,9,...,3537,3538,3539,3540,3541,3542,3543,3544,3545,3546
Account Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,90.0,82.0,0.0,84.0,0.0,0.0,0.0,80.0,...,0.0,0.0,53.0,0.0,0.0,0.0,0.0,67.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,67.0,0.0,77.0,0.0,0.0


In [3]:
# uu similarity 
user_similarity = cosine_similarity(user_item_matrix_filled)
user_sim_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix_filled.index,
    columns=user_item_matrix_filled.index)

print(user_sim_df.iloc[:5, :5])

Account Id        1         20        21        23        25
Account Id                                                  
1           1.000000  0.030373  0.061106  0.095619  0.119010
20          0.030373  1.000000  0.089132  0.000000  0.015417
21          0.061106  0.089132  1.000000  0.127890  0.061058
23          0.095619  0.000000  0.127890  1.000000  0.100955
25          0.119010  0.015417  0.061058  0.100955  1.000000


In [4]:
# predict tea rating
def predict_rating_collab(user_id, tea_name, similarity, user_item_ratings, k=5):
    if tea_name not in user_item_ratings.columns:
        return np.nan
    sim_scores = similarity.loc[user_id]
    tea_ratings = user_item_ratings[tea_name]
    mask = tea_ratings != 0
    sim_scores = sim_scores[mask]
    tea_ratings = tea_ratings[mask]
    top_k_users = sim_scores.sort_values(ascending=False).head(k)
    top_k_ratings = tea_ratings[top_k_users.index]
    if top_k_users.sum() == 0: return np.nan
    return np.dot(top_k_users, top_k_ratings) / top_k_users.sum()

# Content-Based Filtering

In [5]:
embeds = pd.read_csv("data/content_embeddings.csv", header = None)
embeds['Tea ID'] = tea_df['Tea ID']
embeds = embeds.drop_duplicates(subset='Tea ID')
embeds = embeds.set_index('Tea ID')

In [6]:
tea_matrix_content = embeds
tea_names_content = embeds.index

# cosine similarity 
tea_similarity_content = cosine_similarity(tea_matrix_content)
tea_similarity_content = 1 - tea_similarity_content
tea_similarity_content_df = pd.DataFrame(
    tea_similarity_content,
    index=embeds.index,
    columns=embeds.index)

In [7]:
def predict_rating_content(user_id, tea_name, similarity, user_item_ratings, k=5):       
    sim_scores = similarity.loc[tea_name]
    user_ratings = user_item_ratings.loc[user_id, :]
    mask = user_ratings != 0
    sim_scores = sim_scores[mask]
    user_ratings = user_ratings[mask]
    top_k_items = sim_scores.sort_values(ascending=False).head(k)
    top_k_ratings = user_ratings[top_k_items.index]
    if top_k_items.sum() == 0: return np.nan
    return np.dot(top_k_items, top_k_ratings) / top_k_items.sum()

# Hybrid

Prepare Data for Training and Testing

In [8]:
# get ratings, tea, user data
ratings_data = []
for user in user_item_matrix_filled.index:
    for tea in user_item_matrix_filled.columns:
        rating = user_item_matrix_filled.loc[user, tea]
        if rating != 0:
            ratings_data.append((user, tea, rating))


# train/test split
random.seed(42)
random.shuffle(ratings_data)
split_idx = int(len(ratings_data) * 0.8)

train_ratings = ratings_data[:split_idx]
test_ratings = ratings_data[split_idx:]
train_user_item_matrix = user_item_matrix_filled.copy()

# prepare user_item rating matrix for training
for user, tea, _ in test_ratings:
    train_user_item_matrix.loc[user, tea] = 0


train_similarity = cosine_similarity(train_user_item_matrix)
train_sim_df = pd.DataFrame(
    train_similarity,
    index=train_user_item_matrix.index,
    columns=train_user_item_matrix.index)

Generate Test Predictions

In [9]:
def combine_ratings_simple(collab_rating, content_rating, collab_w = .5):
    return collab_rating * collab_w + content_rating * (1-collab_w)

In [10]:
# finding rmse
actuals = []
preds = []
preds_coll = []
preds_cont = []
for user, tea, actual_rating in test_ratings:
    # get collaborative, content, and hybrid ratings
    collab_pred_rating = predict_rating_collab(user, tea, train_sim_df, train_user_item_matrix)
    content_pred_rating = predict_rating_content(user, tea, tea_similarity_content_df, train_user_item_matrix)
    hybrid_pred_rating = combine_ratings_simple(collab_pred_rating, content_pred_rating, collab_w=.5)
    
    if not np.isnan(collab_pred_rating):
        actuals.append(actual_rating)
        preds.append(hybrid_pred_rating)
        preds_coll.append(collab_pred_rating)
        preds_cont.append(content_pred_rating)

In [11]:
rmse_collab = sqrt(mean_squared_error(actuals, preds_coll))
rmse_content = sqrt(mean_squared_error(actuals, preds_cont))
rmse_hybrid = sqrt(mean_squared_error(actuals, preds))
print(f"RMSE on test collab: {rmse_collab:.4f}")
print(f"RMSE on test content: {rmse_content:.4f}")
print(f"RMSE on test hybrid: {rmse_hybrid:.4f}")

RMSE on test collab: 17.0660
RMSE on test content: 18.3908
RMSE on test hybrid: 15.4167


# Tea Recommendation

In [12]:
tea_name_lookup = tea_df[['Tea Name', 'Tea ID']].drop_duplicates()
tea_name_lookup = tea_name_lookup.set_index('Tea ID')

In [13]:
# recommending top n teas
def recommend_top_teas(user_id, similarity_coll, similarity_cont, user_item_ratings, n=5):
    user_ratings = user_item_ratings.loc[user_id]
    unrated_teas = user_ratings[user_ratings == 0].index
    predicted_scores = {}
    for tea in unrated_teas:
        collab_pred_rating = predict_rating_collab(user, tea, similarity_coll, user_item_ratings)
        content_pred_rating = predict_rating_content(user, tea, similarity_cont, user_item_ratings)
        pred = combine_ratings_simple(collab_pred_rating, content_pred_rating)
            
        if not np.isnan(pred):
            predicted_scores[tea] = pred
    sorted_recs = sorted(predicted_scores.items(), key=lambda x: x[1], reverse=True)
    teas = sorted_recs[:n]
    for t in range(len(teas)):
        teas[t] = (tea_name_lookup.loc[teas[t][0]].item(), round(teas[t][1], 3))
    return teas

In [14]:
teas = recommend_top_teas(1, user_sim_df, tea_similarity_content_df, user_item_matrix_filled)
teas

[('Berries and Cherries', 89.334),
 ('Orchid Vanilla', 88.425),
 ('Green Tea Pomegranate', 88.202),
 ('Jinggu Golden Strand Pure Bud Yunnan Black Tea - Spring 2014', 88.175),
 ('Crane Monk - Light Oolong', 88.172)]