In [1]:
import pandas as pd
import numpy as np
from fine_food_review import FineFoodReview
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

## Dataset

For this project, we will use the [Amazon Fine Food Reviews](https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews/) dataset. It contains reviews of fine foods from Amazon, including text reviews and ratings. Here's the list of columns in the dataset:

- `Id`: Unique identifier for each review
- `ProductId`: Unique identifier for the product
- `UserId`: Unique identifier for the user
- `ProfileName`: Name of the user profile
- `HelpfulnessNumerator`: Number of helpful votes
- `HelpfulnessDenominator`: Total number of votes
- `Score`: Rating given by the user (1 to 5)
- `Time`: Timestamp of the review
- `Summary`: Summary of the review
- `Text`: Full text of the review

In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
# FFR = FineFoodReview()

FFR = FineFoodReview(dataset_file_name="fine_food_reviews_tokenized.csv", dataset_path="./data")
df = FFR.dataset

INFO:dataset:Checking if dataset is downloaded...
INFO:dataset:Dataset already downloaded.
INFO:dataset:Loading reviews dataset...
INFO:dataset:Loaded 568454 reviews.


In [None]:
df.head()

In [None]:
df.info()

In [None]:
FFR.grouped_by_user

In [None]:
FFR.grouped_by_product

In [None]:
print("Number of unique users:", FFR.grouped_by_user.size)
print("Number of unique products:", FFR.grouped_by_product.size)

In [None]:

print("User Reviews Statistics:")
display(FFR.stats["user"])

print("Product Reviews Statistics:")
display(FFR.stats["product"])

In [None]:
sns.set_theme(style="whitegrid")

fig, axes = plt.subplots(3, 2, figsize=(16, 18))
fig.suptitle("FineFoodReview Dataset Analysis with Mean & Median", fontsize=18)

# 1. Histogram of reviews per user
mean_u = FFR.grouped_by_user.mean()
median_u = FFR.grouped_by_user.median()
sns.histplot(FFR.grouped_by_user.to_numpy(), bins=50, ax=axes[0,0], color="skyblue", kde=False)
axes[0,0].axvline(mean_u, color="red", linestyle="--", label=f"Mean: {mean_u:.2f}")
axes[0,0].axvline(median_u, color="green", linestyle="-.", label=f"Median: {median_u}")
axes[0,0].set_title("Distribution of Number of Reviews per User")
axes[0,0].set_xlabel("Number of Reviews")
axes[0,0].set_ylabel("Number of Users")
axes[0,0].legend()

# 2. Histogram of reviews per product
mean_p = FFR.grouped_by_product.mean()
median_p = FFR.grouped_by_product.median()
sns.histplot(FFR.grouped_by_product.to_numpy(), bins=50, ax=axes[0,1], color="lightcoral", kde=False)
axes[0,1].axvline(mean_p, color="red", linestyle="--", label=f"Mean: {mean_p:.2f}")
axes[0,1].axvline(median_p, color="green", linestyle="-.", label=f"Median: {median_p}")
axes[0,1].set_title("Distribution of Number of Reviews per Product")
axes[0,1].set_xlabel("Number of Reviews")
axes[0,1].set_ylabel("Number of Products")
axes[0,1].legend()

# 3. CDF for users
sns.ecdfplot(FFR.grouped_by_user.to_numpy(), ax=axes[1,0], color="blue")
axes[1,0].set_title("CDF of Number of Reviews per User")
axes[1,0].set_xlabel("Number of Reviews")
axes[1,0].set_ylabel("Cumulative Fraction of Users")
axes[1,0].grid(True)


# 4. CDF for products
sns.ecdfplot(FFR.grouped_by_product.to_numpy(), ax=axes[1,1], color="darkorange")
axes[1,1].set_title("CDF of Number of Reviews per Product")
axes[1,1].set_xlabel("Number of Reviews")
axes[1,1].set_ylabel("Cumulative Fraction of Products")
axes[1,1].grid(True)

# 5. Boxplot for users
sns.boxplot(x=FFR.grouped_by_user, ax=axes[2,0], color="lightblue", showmeans=True,
            meanprops={"marker":"o","markerfacecolor":"red","markeredgecolor":"black"})
axes[2,0].set_title("Boxplot of Number of Reviews per User")
axes[2,0].set_xlabel("Number of Reviews")

# 6. Boxplot for products
sns.boxplot(x=FFR.grouped_by_product, ax=axes[2,1], color="salmon", showmeans=True,
            meanprops={"marker":"o","markerfacecolor":"red","markeredgecolor":"black"})
axes[2,1].set_title("Boxplot of Number of Reviews per Product")
axes[2,1].set_xlabel("Number of Reviews")

plt.tight_layout(rect=(0, 0, 1, 0.97))
plt.show()




From the user and product statistics, we can see that the average number of reviews per user is around 2.22, with a maximum of 448 reviews by a single user. For products, the average number of reviews is about 7.66, with a maximum of 913 reviews for a single product. This indicates that while most users review only a few products, some users are very active in reviewing multiple products. Also, looking at the 75%, we can see that 75% of users have reviewed 2 or fewer products, and 75% of products have received 5 or fewer reviews. This suggests a long tail distribution where a few users and products are highly active while the majority are not. It's important to consider these statistics when training models, as they can impact the model's ability to generalize and the importance of certain users or products in the dataset. 


In [4]:

FFR.tokenize_reviews()

INFO:fine_food_review:Tokenizing 568454 reviews...
INFO:fine_food_review:TokenizedText column already exists, skipping tokenization.


["['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'product', 'found', 'good', 'quality', 'product', 'look', 'like', 'stew', 'processed', 'meat', 'smell', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']",
 '[\'product\', \'arrived\', \'labeled\', \'jumbo\', \'salted\', \'peanutsthe\', \'peanut\', "\'re", \'actually\', \'small\', \'sized\', \'unsalted\', \'sure\', \'error\', \'vendor\', \'intended\', \'represent\', \'product\', \'jumbo\']',
 "['confection', 'around', 'century', 'light', 'pillowy', 'citrus', 'gelatin', 'nut', 'case', 'filbert', 'cut', 'tiny', 'square', 'liberally', 'coated', 'powdered', 'sugar', 'tiny', 'mouthful', 'heaven', 'chewy', 'flavorful', 'highly', 'recommend', 'yummy', 'treat', 'familiar', 'story', 'c', 'lewis', 'lion', 'witch', 'wardrobe', 'treat', 'seduces', 'edmund', 'selling', 'brother', 'sister', 'witch']",
 "['looking', 'secret', 'ingredient', 'robitussin', 'believe', 'found', 'got', 'addition', 'root', 'beer', 'extract', 'or

In [None]:
doc_vector_tfidf = FFR.vectorize_reviews(method="tfidf", )

In [5]:
doc_vector_gensim = FFR.vectorize_reviews(method="gensim")

INFO:gensim.models.keyedvectors:loading projection weights from /Users/mark/gensim-data/glove-wiki-gigaword-100/glove-wiki-gigaword-100.gz
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 100) matrix of type float32 from /Users/mark/gensim-data/glove-wiki-gigaword-100/glove-wiki-gigaword-100.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-07-11T07:37:00.308741', 'gensim': '4.3.3', 'python': '3.10.16 (main, Dec 11 2024, 10:22:29) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:fine_food_review:Vectorizing reviews using gensim...
INFO:fine_food_review:Vectorization complete. Shape: (568454, 100)


In [None]:
doc_vector_transformers = FFR.vectorize_reviews(method="sentence_transformers")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:fine_food_review:Vectorizing reviews using sentence_transformers...


In [None]:
# store the new dataset with tokenized reviews to save time in future runs
df.to_csv("data/fine_food_reviews_tokenized.csv", index=False)

In [None]:
from usl_clustering import USLClustering
usl = USLClustering(random_state=42)
X = FFR.dataset["TokenizedText"].values
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)
usl.fit(X_vectorized)

In [None]:
cluster_count = 10
texts = FFR.dataset["TokenizedText"].apply(lambda tokens: " ".join(tokens)).to_list()

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
X = tfidf_vectorizer.fit_transform(texts)

kmeans = KMeans(n_clusters=cluster_count, n_init=20, random_state=42)
kmeans.fit(X)

FFR.dataset["Cluster"] = kmeans.labels_

In [None]:
FFR.dataset["Cluster"]

In [None]:
terms = tfidf_vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(kmeans.n_clusters):
    print(f"\nCluster {i}:")
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print("Top terms:", ", ".join(top_terms))

In [None]:
for i in range(cluster_count):
    print(f"\nCluster {i} examples:")
    print(FFR.dataset[FFR.dataset["Cluster"] == i]["TokenizedText"].head(3).to_list())

In [None]:
print(FFR.dataset["Cluster"].value_counts())

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, alpha=0.5)
plt.title("PCA visualization of clusters")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()