In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from math import sqrt

In [2]:
# Raw inputs. Only tea_df feeds the matrix built in this cell.
tea_df = pd.read_csv('data/tea_data.csv')
user_tea_df = pd.read_csv('data/user_tea_pairs.csv')

# One row per account, one column per tea, cell = 'Total Rating'.
# pivot_table's default aggregation (mean) collapses repeated
# (account, tea) pairs into a single value.
user_item_matrix = pd.pivot_table(
    tea_df,
    values='Total Rating',
    index='Account Id',
    columns='Tea Name',
)

# Missing (account, tea) combinations become 0, so from here on
# a 0 in the filled matrix stands for "no rating recorded".
user_item_matrix_filled = user_item_matrix.fillna(value=0)
print(user_item_matrix_filled.shape)
user_item_matrix_filled.head()

(49, 3239)


Tea Name,Kokyu Sen-cha Green Tea (2020),Kona-cha,Organic Assam Tea (TGFOP),"""Ara Cha"" Premium Japanese Green Tea","""Ye Zhu Tang"" Wild Arbor Raw Pu-erh","""Yu Luo"" High Mountain Pure Bud Green Tea",#12 Shira-Ore Stem Tea With Matcha,#146 - Organic Gunpowder,#209 - Lady Londonderry,'Wu Yi' Water Fairy Oolong,...,Zhu Shan Jin Xuan,Zhu Xiang Ji brand Yu Lei dark tea,Zingiber Ginger Coconut Rooibos Tea,Zocolatte Spice™ Herbal Tea,get clean - No. 7,get heart - No.12,get lost - No. 6,get relaxed - No.14,get soothed - No. 8,laohuangpian sheng (raw) pu-erh from ancient tea tree 2014 spring
Account Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,90.0,82.0,0.0,84.0,0.0,0.0,0.0,80.0,...,0.0,0.0,53.0,0.0,0.0,0.0,0.0,67.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,67.0,0.0,77.0,0.0,0.0


In [3]:
# Rebuild the zero-filled matrix so this cell can be re-run on its own,
# then standardise it with StandardScaler (fitted on the same data it
# transforms). The result keeps the original row/column labels.
user_item_matrix_filled = user_item_matrix.fillna(0)
scaler = StandardScaler()
user_item_matrix_scaled = pd.DataFrame(
    data=scaler.fit_transform(user_item_matrix_filled),
    columns=user_item_matrix_filled.columns,
    index=user_item_matrix_filled.index,
)

In [4]:
# uu similarity
# Pairwise cosine similarity between the users' (zero-filled, unscaled)
# rating rows, wrapped in a DataFrame labelled by account id on both axes
# so similarities can be looked up with .loc[user_a, user_b].
user_similarity = cosine_similarity(user_item_matrix_filled)
user_ids = user_item_matrix_filled.index
user_sim_df = pd.DataFrame(data=user_similarity, index=user_ids, columns=user_ids)

# Peek at the top-left 5x5 corner as a sanity check.
print(user_sim_df.iloc[:5, :5])

Account Id        1         20        21        23        25
Account Id                                                  
1           1.000000  0.042975  0.064191  0.100258  0.128174
20          0.042975  1.000000  0.127626  0.000000  0.015546
21          0.064191  0.127626  1.000000  0.127890  0.061571
23          0.100258  0.000000  0.127890  1.000000  0.134679
25          0.128174  0.015546  0.061571  0.134679  1.000000


In [5]:
# predict tea rating
def predict_rating(user_id, tea_name, k=5):
    """Predict ``user_id``'s rating of ``tea_name`` from similar users' ratings.

    The estimate is the similarity-weighted mean of the ratings given to
    ``tea_name`` by the ``k`` users most similar to ``user_id`` (taken from
    the module-level ``user_sim_df``), restricted to *other* users that have
    a non-zero entry for that tea in ``user_item_matrix_filled``.

    Args:
        user_id: Row label of the target user.
        tea_name: Column label of the tea to score.
        k: Maximum number of neighbours to average over.

    Returns:
        The predicted rating, or ``np.nan`` when no prediction can be made:
        the tea or the user is unknown, no other user has rated the tea, or
        the selected neighbours' similarities sum to zero.
    """
    # Unknown tea or unknown user: nothing to base a prediction on. Both
    # cases return NaN so callers can filter them the same way.
    if tea_name not in user_item_matrix_filled.columns:
        return np.nan
    if user_id not in user_sim_df.index:
        return np.nan

    sim_scores = user_sim_df.loc[user_id]
    tea_ratings = user_item_matrix_filled[tea_name]

    # 0 is the fill value for "not rated", so only non-zero entries are real
    # ratings. The target user is excluded as well: with self-similarity 1.0
    # they would otherwise be their own top neighbour and leak their own
    # rating into the prediction.
    is_neighbour = (tea_ratings != 0) & (tea_ratings.index != user_id)
    sim_scores = sim_scores[is_neighbour]
    tea_ratings = tea_ratings[is_neighbour]

    top_k_users = sim_scores.sort_values(ascending=False).head(k)
    top_k_ratings = tea_ratings.loc[top_k_users.index]

    # Covers both "no neighbours at all" (empty sum is 0) and "neighbours
    # exist but none is similar"; either way the weighted mean is undefined.
    weight_total = top_k_users.sum()
    if weight_total == 0:
        return np.nan
    return np.dot(top_k_users, top_k_ratings) / weight_total

In [6]:
# recommending top n teas
def recommend_top_teas(user_id, n=5):
    """Return the ``n`` best-scoring teas that ``user_id`` has not rated yet.

    Every tea whose entry for ``user_id`` in ``user_item_matrix_filled`` is 0
    is scored with ``predict_rating``; teas whose prediction is NaN are
    dropped, and the rest are ranked from highest to lowest predicted rating.

    Args:
        user_id: Row label of the user in ``user_item_matrix_filled``.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(tea_name, predicted_rating)`` tuples, best first, with
        at most ``n`` entries.
    """
    own_row = user_item_matrix_filled.loc[user_id]
    candidate_teas = own_row.loc[own_row == 0].index

    scored = {}
    for tea_name in candidate_teas:
        estimate = predict_rating(user_id, tea_name)
        if np.isnan(estimate):
            # No usable neighbours for this tea; it cannot be ranked.
            continue
        scored[tea_name] = estimate

    # Stable sort: teas with equal scores keep their column order.
    ranked = list(scored.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [7]:
# train/test split
SPLIT_SEED = 42        # fixed seed so the split is reproducible
TRAIN_FRACTION = 0.8   # share of observed ratings kept for training

# Collect every observed (user, tea, rating) triple. 0 is the fill value for
# "not rated", so only non-zero cells count. np.nonzero scans the array in
# row-major order (user by user, teas in column order), which is the same
# order a nested loop over index and columns would produce, so the seeded
# shuffle below yields the same split, without one .loc lookup per cell.
rating_values = user_item_matrix_filled.to_numpy()
user_labels = list(user_item_matrix_filled.index)
tea_labels = list(user_item_matrix_filled.columns)
rated_rows, rated_cols = np.nonzero(rating_values)
ratings = [
    (user_labels[r], tea_labels[c], rating_values[r, c])
    for r, c in zip(rated_rows, rated_cols)
]

random.seed(SPLIT_SEED)
random.shuffle(ratings)
split_idx = int(len(ratings) * TRAIN_FRACTION)
train_ratings = ratings[:split_idx]
test_ratings = ratings[split_idx:]

# Hide the held-out ratings from the training matrix by resetting them to
# the "not rated" value; .at is the scalar setter for a single cell.
train_matrix = user_item_matrix_filled.copy()
for user, tea, _held_out_rating in test_ratings:
    train_matrix.at[user, tea] = 0

# Similarities are recomputed from the training matrix only, so held-out
# ratings cannot influence which neighbours are chosen during evaluation.
train_similarity = cosine_similarity(train_matrix)
train_sim_df = pd.DataFrame(
    train_similarity,
    index=train_matrix.index,
    columns=train_matrix.index)

In [8]:
# training set prediction
def predict_on_train(user_id, tea_name, k=5):
    """Predict ``user_id``'s rating of ``tea_name`` using training data only.

    The estimate is the similarity-weighted mean of the ratings given to
    ``tea_name`` by the ``k`` users most similar to ``user_id`` (taken from
    the module-level ``train_sim_df``), restricted to *other* users that have
    a non-zero entry for that tea in ``train_matrix``.

    Args:
        user_id: Row label of the target user.
        tea_name: Column label of the tea to score.
        k: Maximum number of neighbours to average over.

    Returns:
        The predicted rating, or ``np.nan`` when no prediction can be made:
        the tea or the user is unknown, no other user has a training rating
        for the tea, or the selected neighbours' similarities sum to zero.
    """
    # Unknown tea or unknown user: nothing to base a prediction on. Both
    # cases return NaN so the evaluation loop can skip them uniformly.
    if tea_name not in train_matrix.columns:
        return np.nan
    if user_id not in train_sim_df.index:
        return np.nan

    sim_scores = train_sim_df.loc[user_id]
    tea_ratings = train_matrix[tea_name]

    # 0 is the fill value for "not rated" (and for held-out ratings), so only
    # non-zero entries are usable. The target user is excluded as well: with
    # self-similarity 1.0 they would otherwise be their own top neighbour and
    # leak their own rating into the prediction.
    is_neighbour = (tea_ratings != 0) & (tea_ratings.index != user_id)
    sim_scores = sim_scores[is_neighbour]
    tea_ratings = tea_ratings[is_neighbour]

    top_k_users = sim_scores.sort_values(ascending=False).head(k)
    top_k_ratings = tea_ratings.loc[top_k_users.index]

    # Covers both "no neighbours at all" (empty sum is 0) and "neighbours
    # exist but none is similar"; either way the weighted mean is undefined.
    weight_total = top_k_users.sum()
    if weight_total == 0:
        return np.nan
    return np.dot(top_k_users, top_k_ratings) / weight_total

# finding rmse
# Score each held-out (user, tea) pair with the training-only predictor.
# Pairs with no usable prediction (NaN) are left out of the error, so the
# RMSE is computed only over the pairs the model could actually score.
scored_pairs = []
for user, tea, actual_rating in test_ratings:
    pred = predict_on_train(user, tea)
    if np.isnan(pred):
        continue
    scored_pairs.append((actual_rating, pred))

actuals = [actual for actual, _ in scored_pairs]
preds = [predicted for _, predicted in scored_pairs]

mse = mean_squared_error(actuals, preds)
rmse = sqrt(mse)
print(f"RMSE on test: {rmse:.4f}")

RMSE on test: 17.9071
