In [1]:
import pandas as pd
import numpy as np
from fine_food_review import FineFoodReview
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
FFR = FineFoodReview()
df = FFR.dataset

INFO:dataset:Checking if dataset is downloaded...
INFO:dataset:Dataset already downloaded.
INFO:dataset:Loading reviews dataset...
INFO:dataset:Loaded 568454 reviews.


In [None]:
df.head()

In [None]:
df.info()

In [None]:
FFR.grouped_by_user

In [None]:
FFR.grouped_by_product

In [None]:
print("Number of unique users:", FFR.grouped_by_user.size)
print("Number of unique products:", FFR.grouped_by_product.size)

In [None]:

print("User Reviews Statistics:")
display(FFR.stats["user"])

print("Product Reviews Statistics:")
display(FFR.stats["product"])

In [None]:
sns.set_theme(style="whitegrid")

fig, axes = plt.subplots(3, 2, figsize=(16, 18))
fig.suptitle("FineFoodReview Dataset Analysis with Mean & Median", fontsize=18)

# 1. Histogram of reviews per user
mean_u = FFR.grouped_by_user.mean()
median_u = FFR.grouped_by_user.median()
sns.histplot(FFR.grouped_by_user.to_numpy(), bins=50, ax=axes[0,0], color="skyblue", kde=False)
axes[0,0].axvline(mean_u, color="red", linestyle="--", label=f"Mean: {mean_u:.2f}")
axes[0,0].axvline(median_u, color="green", linestyle="-.", label=f"Median: {median_u}")
axes[0,0].set_title("Distribution of Number of Reviews per User")
axes[0,0].set_xlabel("Number of Reviews")
axes[0,0].set_ylabel("Number of Users")
axes[0,0].legend()

# 2. Histogram of reviews per product
mean_p = FFR.grouped_by_product.mean()
median_p = FFR.grouped_by_product.median()
sns.histplot(FFR.grouped_by_product.to_numpy(), bins=50, ax=axes[0,1], color="lightcoral", kde=False)
axes[0,1].axvline(mean_p, color="red", linestyle="--", label=f"Mean: {mean_p:.2f}")
axes[0,1].axvline(median_p, color="green", linestyle="-.", label=f"Median: {median_p}")
axes[0,1].set_title("Distribution of Number of Reviews per Product")
axes[0,1].set_xlabel("Number of Reviews")
axes[0,1].set_ylabel("Number of Products")
axes[0,1].legend()

# 3. CDF for users
sns.ecdfplot(FFR.grouped_by_user.to_numpy(), ax=axes[1,0], color="blue")
axes[1,0].set_title("CDF of Number of Reviews per User")
axes[1,0].set_xlabel("Number of Reviews")
axes[1,0].set_ylabel("Cumulative Fraction of Users")
axes[1,0].grid(True)


# 4. CDF for products
sns.ecdfplot(FFR.grouped_by_product.to_numpy(), ax=axes[1,1], color="darkorange")
axes[1,1].set_title("CDF of Number of Reviews per Product")
axes[1,1].set_xlabel("Number of Reviews")
axes[1,1].set_ylabel("Cumulative Fraction of Products")
axes[1,1].grid(True)

# 5. Boxplot for users
sns.boxplot(x=FFR.grouped_by_user, ax=axes[2,0], color="lightblue", showmeans=True,
            meanprops={"marker":"o","markerfacecolor":"red","markeredgecolor":"black"})
axes[2,0].set_title("Boxplot of Number of Reviews per User")
axes[2,0].set_xlabel("Number of Reviews")

# 6. Boxplot for products
sns.boxplot(x=FFR.grouped_by_product, ax=axes[2,1], color="salmon", showmeans=True,
            meanprops={"marker":"o","markerfacecolor":"red","markeredgecolor":"black"})
axes[2,1].set_title("Boxplot of Number of Reviews per Product")
axes[2,1].set_xlabel("Number of Reviews")

plt.tight_layout(rect=(0, 0, 1, 0.97))
plt.show()




From the user and product statistics, we can see that the average number of reviews per user is around 2.22, with a maximum of 448 reviews by a single user. For products, the average number of reviews is about 7.66, with a maximum of 913 reviews for a single product. This indicates that while most users review only a few products, some users are very active in reviewing multiple products. Also, looking at the 75%, we can see that 75% of users have reviewed 2 or fewer products, and 75% of products have received 5 or fewer reviews. This suggests a long tail distribution where a few users and products are highly active while the majority are not. It's important to consider these statistics when training models, as they can impact the model's ability to generalize and the importance of certain users or products in the dataset. 


In [None]:
df[df['HelpfulnessDenominator'] != df['HelpfulnessNumerator']]

In [4]:

FFR.tokenize_reviews()


INFO:fine_food_review:Tokenizing 568454 reviews...


['bought several vitality canned dog food product found good quality product look like stew processed meat smell better labrador finicky appreciates product better',
 'product arrived labeled jumbo salted peanutsthe peanut actually small sized unsalted sure error vendor intended represent product jumbo',
 'confection around century light pillowy citrus gelatin nut case filbert cut tiny square liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story c lewis lion witch wardrobe treat seduces edmund selling brother sister witch',
 'looking secret ingredient robitussin believe found got addition root beer extract ordered good made cherry soda flavor medicinal',
 'great taffy great price wide assortment yummy taffy delivery quick taffy lover deal',
 'got wild hair taffy ordered five pound bag taffy enjoyable many flavor watermelon root beer melon peppermint grape etc complaint bit much redblack licoriceflavored piece particular favorit

In [5]:
FFR.dataset["TokenizedText"]

0         [bought, several, vitality, canned, dog, food,...
1         [product, arrived, labeled, jumbo, salted, pea...
2         [confection, around, century, light, pillowy, ...
3         [looking, secret, ingredient, robitussin, beli...
4         [great, taffy, great, price, wide, assortment,...
                                ...                        
568449    [great, sesame, chickenthis, good, better, res...
568450    [im, disappointed, flavor, chocolate, note, es...
568451    [star, small, give, 1015, one, training, sessi...
568452    [best, treat, training, rewarding, dog, good, ...
568453    [satisfied, product, advertised, use, cereal, ...
Name: TokenizedText, Length: 568454, dtype: object

In [6]:
cluster_count = 10
texts = FFR.dataset["TokenizedText"].apply(lambda tokens: " ".join(tokens)).to_list()

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
X = tfidf_vectorizer.fit_transform(texts)

kmeans = KMeans(n_clusters=cluster_count, n_init=20, random_state=42)
kmeans.fit(X)

FFR.dataset["Cluster"] = kmeans.labels_

In [7]:
FFR.dataset["Cluster"]

0         9
1         3
2         3
3         0
4         3
         ..
568449    3
568450    1
568451    9
568452    9
568453    3
Name: Cluster, Length: 568454, dtype: int32

In [8]:
terms = tfidf_vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(kmeans.n_clusters):
    print(f"\nCluster {i}:")
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print("Top terms:", ", ".join(top_terms))


Cluster 0:
Top terms: taste, like, flavor, good, br, great, really, water, sugar, drink

Cluster 1:
Top terms: chocolate, cooky, cookie, taste, like, dark, good, hot, flavor, love

Cluster 2:
Top terms: tea, green, taste, flavor, drink, like, good, bag, great, love

Cluster 3:
Top terms: product, great, love, amazon, price, good, store, time, buy, order

Cluster 4:
Top terms: coffee, cup, like, flavor, strong, taste, good, roast, bold, great

Cluster 5:
Top terms: bar, snack, chocolate, taste, protein, like, good, kind, great, nut

Cluster 6:
Top terms: chip, potato, bag, flavor, salt, like, taste, great, love, snack

Cluster 7:
Top terms: sauce, pasta, hot, like, taste, flavor, br, good, great, noodle

Cluster 8:
Top terms: cat, food, eat, love, treat, like, br, dry, chicken, vet

Cluster 9:
Top terms: dog, treat, food, love, like, product, chew, great, br, eat


In [9]:
for i in range(cluster_count):
    print(f"\nCluster {i} examples:")
    print(FFR.dataset[FFR.dataset["Cluster"] == i]["TokenizedText"].head(3).to_list())


Cluster 0 examples:
[['looking', 'secret', 'ingredient', 'robitussin', 'believe', 'found', 'got', 'addition', 'root', 'beer', 'extract', 'ordered', 'good', 'made', 'cherry', 'soda', 'flavor', 'medicinal'], ['taffy', 'good', 'soft', 'chewy', 'flavor', 'amazing', 'would', 'definitely', 'recommend', 'buying', 'satisfying'], ['love', 'eating', 'good', 'watching', 'tv', 'looking', 'movie', 'sweet', 'like', 'transfer', 'zip', 'lock', 'baggie', 'stay', 'fresh', 'take', 'time', 'eating']]

Cluster 1 examples:
[['got', 'mum', 'diabetic', 'need', 'watch', 'sugar', 'intake', 'father', 'simply', 'chooses', 'limit', 'unnecessary', 'sugar', 'intake', 'shes', 'one', 'sweet', 'tooth', 'loved', 'toffee', 'would', 'never', 'guess', 'theyre', 'sugarfree', 'great', 'eat', 'pretty', 'much', 'guilt', 'free', 'impressed', 'ive', 'ordered', 'w', 'dark', 'chocolate', 'take', 'office', 'ill', 'eat', 'instead', 'snacking', 'sugary', 'sweetsbr', 'excellent'], ['chocolate', 'say', 'great', 'variety', 'everything'

In [10]:
print(FFR.dataset["Cluster"].value_counts())

Cluster
3    230910
0    126523
4     51227
9     39142
2     36724
1     25311
8     17849
6     14224
5     13536
7     13008
Name: count, dtype: int64


In [11]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, alpha=0.5)
plt.title("PCA visualization of clusters")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

: 