# DATA 612 FINAL PROJECT
Amazon Product Recommender Model Using Reviews

* Farhod Ibragimov
* Gillian McGovern

## Objective

Create an offline Amazon product (specifically Amazon appliances) recommender model using user ratings and reviews.

## Data Sources

Source: https://amazon-reviews-2023.github.io/

This is a large-scale Amazon Reviews dataset, collected in 2023 by McAuley Lab, and it includes rich features such as:

*   User Reviews (ratings, text, helpfulness votes, etc.);
*   Item Metadata (descriptions, price, raw image, etc.);
*   Links (user-item / bought together graphs).


User review structure can be found [here](https://amazon-reviews-2023.github.io/#for-user-reviews) and item metadata structure can be found [here](https://amazon-reviews-2023.github.io/#for-item-metadata).

We will be specifically looking at the Appliances category of products, which includes:

* 1.8M Users
* 94.3K Appliances
* 2.1M Ratings/Reviews

The original data is in JSON format.

## Read in the Data

In [None]:
import os
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import torch
import pyarrow as pa
import pyarrow.parquet as pq
from pyspark.sql import SparkSession
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
from sklearn.neighbors import NearestNeighbors
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import floor, round, monotonically_increasing_id, col

In [None]:


if False:
    # CONFIG 
    REVIEW_PATH = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\Appliances.jsonl"
    META_PATH   = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\meta_Appliances.jsonl"
    OUT_DIR     = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output"
    CHUNK_SIZE  = 200_000   # tune based on your machine's RAM
    SEED        = 42


    os.makedirs(OUT_DIR, exist_ok=True)

    #LOAD METADATA 
    meta_pd = pd.read_json(
        r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\meta_Appliances.jsonl",
        lines=True
    )[["parent_asin", "average_rating", "rating_number"]]

    writers = {"train": None, "valid": None, "test": None}

    # STREAM, FILTER, SPLIT, AND WRITE
    for chunk in pd.read_json(
        r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\Appliances.jsonl",
        lines=True,
        chunksize=CHUNK_SIZE
    ):
        # 1) keep needed cols + timestamp
        chunk = chunk[["user_id", "parent_asin", "rating", "timestamp", "text"]]

        # 2) filter to years 2021–2023
        dt = pd.to_datetime(chunk["timestamp"], unit="ms")
        mask_year = dt.dt.year.between(2021, 2023)
        chunk = chunk.loc[mask_year]
        if chunk.empty:
            continue

        # 3) assign random float for splitting
        rng = np.random.RandomState(SEED)
        chunk["_rand"] = rng.rand(len(chunk))

        # 4) merge metadata
        chunk = chunk.merge(meta_pd, on="parent_asin", how="left")

        # 5) define split masks
        masks = {
            "train": chunk["_rand"] <  0.8,
            "valid": (chunk["_rand"] >= 0.8) & (chunk["_rand"] < 0.9),
            "test":  chunk["_rand"] >= 0.9
        }

        # 6) write each split to its Parquet
        for split, m in masks.items():
            sub = chunk.loc[m, [
                "user_id",
                "parent_asin",
                "rating",
                "text",
                "average_rating",
                "rating_number"
            ]]
            if sub.empty:
                continue
            tbl = pa.Table.from_pandas(sub, preserve_index=False)
            path = os.path.join(OUT_DIR, f"{split}.parquet")
            if writers[split] is None:
                writers[split] = pq.ParquetWriter(path, schema=tbl.schema)
            writers[split].write_table(tbl)

    # close Parquet writers
    for w in writers.values():
        if w:
            w.close()

    print("Finished writing filtered splits to:", OUT_DIR)
    print("  •", os.path.join(OUT_DIR, "train.parquet"))
    print("  •", os.path.join(OUT_DIR, "valid.parquet"))
    print("  •", os.path.join(OUT_DIR, "test.parquet"))


In [2]:
if False:    
    import pyarrow.parquet as pq

    for split in ("train", "valid", "test"):
        path = rf"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\{split}.parquet"
        pf = pq.ParquetFile(path)
        print(f"{split.capitalize()} split: {pf.metadata.num_rows} reviews")


In [3]:
    
if False:
    spark = SparkSession.builder \
        .appName("CheckRatingRange") \
        .config("spark.driver.memory","4g") \
        .getOrCreate()

    # Load just the rating column from the train split
    train = spark.read.parquet(
        r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\train.parquet"
    ).select("rating")

    # 1) min & max
    train.selectExpr("min(rating) AS min_rating", "max(rating) AS max_rating") \
        .show()

    # 2) basic summary (mean, stddev, etc.)
    train.describe("rating").show()

    # 3) breakdown by rating value
    train.groupBy("rating").count().orderBy("rating").show(5, truncate=False)

    spark.stop()


## BERT Content Based Recommender Model

### Create BERT Embeddings

In [4]:
#DO NOT RUN THIS CELL UNLESS EMBEDINGS ARE NEEDED
if False:
    import os
    import torch
    import pyarrow as pa
    import pyarrow.parquet as pq
    from pyspark.sql import SparkSession
    from sentence_transformers import SentenceTransformer
    from huggingface_hub import hf_hub_download

    HF_REPO      = "glavvrach79/my-recsys-data"
    HF_SUBFOLDER = "output/embeddings"
    FNAME        = "item_embeddings.parquet"

    LOCAL_TRAIN  = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\train.parquet"
    EMB_OUT      = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\embeddings\item_embeddings.parquet"

    DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
    SAMPLE_PER   = 5
    BATCH_SIZE   = 64

    os.makedirs(os.path.dirname(EMB_OUT), exist_ok=True)
    print(f"Using device: {DEVICE}\n")

    def is_good_parquet(path: str) -> bool:
        """Try a light PyArrow read to confirm the file is a valid Parquet."""
        try:
            pq.read_table(path, columns=[])  # only footer
            return True
        except Exception:
            return False

    downloaded = False

    # if a local copy already exists & is valid, use it 
    if os.path.exists(EMB_OUT) and is_good_parquet(EMB_OUT):
        print(f"Found valid local embeddings → {EMB_OUT}\n")
        downloaded = True

    # else try downloading from HF 
    if not downloaded:
        try:
            hf_path = hf_hub_download(
                repo_id=HF_REPO,
                repo_type="dataset",
                subfolder=HF_SUBFOLDER,
                filename=FNAME,
            )
            print(f"📦 Downloaded HF embeddings → {hf_path}")
            if is_good_parquet(hf_path):
                print("HF parquet loads OK. Using HF version.\n")
                EMB_OUT = hf_path
                downloaded = True
            else:
                print("HF parquet is corrupt—will recompute locally.\n")
        except Exception as e:
            print(f"Could not fetch from HF ({e!r}); will compute locally.\n")

    # if still not obtained, recompute locally 
    if not downloaded:
        print("▶️ Computing embeddings locally…\n")
        spark = SparkSession.builder \
            .appName("ComputeItemEmbeddings") \
            .config("spark.driver.memory","16g") \
            .getOrCreate()

        df = spark.read.parquet(LOCAL_TRAIN).select("parent_asin", "text")
        pdf = df.toPandas()
        spark.stop()

        model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
        writer = None

        for pid, group in pdf.groupby("parent_asin", sort=False):
            samples = group["text"].sample(
                n=min(len(group), SAMPLE_PER),
                random_state=42
            ).tolist()

            emb_batch = model.encode(
                samples,
                batch_size=BATCH_SIZE,
                show_progress_bar=False,
                convert_to_numpy=True
            )
            mean_emb = emb_batch.mean(axis=0)

            data = {"parent_asin": [pid]}
            for i, val in enumerate(mean_emb):
                data[f"emb_{i}"] = [float(val)]
            table = pa.Table.from_pydict(data)

            if writer is None:
                writer = pq.ParquetWriter(EMB_OUT, schema=table.schema)
            writer.write_table(table)

        if writer:
            writer.close()

        print(f"✅ Wrote item embeddings locally → {EMB_OUT}\n")

    # preview final 
    print(f" Final embeddings path → {EMB_OUT}\n")
    spark = SparkSession.builder \
        .appName("PreviewEmbeddings") \
        .config("spark.driver.memory","4g") \
        .getOrCreate()

    def try_spark_read(path: str):
        try:
            return spark.read.parquet(path)
        except Exception as e:
            print(f"Spark read failed on `{path}`:\n   {e}")
            return None

    emb_df = try_spark_read(EMB_OUT)

    # ── If Spark failed on the HF‐cached file, fall back to pandas → Spark ──────
    if emb_df is None:
        import pandas as pd
        print("Falling back to pandas.read_parquet …")
        pdf = pd.read_parquet(EMB_OUT)
        emb_df = spark.createDataFrame(pdf)

    emb_df.show(5, truncate=50)
    print("Total items:", emb_df.count())
    spark.stop()


In [None]:
# CONFIG 
HF_REPO      = "glavvrach79/my-recsys-data"
HF_SUBFOLDER_KNN = "output"
HF_SUBFOLDER = "output/embeddings"
FNAME        = "item_embeddings.parquet"
TRAIN_NAME   = "train.parquet"
TEST_NAME  = "test.parquet"
VALID_NAME = "valid.parquet"
FULL_REVIEW_NAME = "full_review.parquet"
KNN_NAME = "knn_cache.parquet"

META_PATH    = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\meta_Appliances.jsonl"
TOP_K        = 5
OUT_PATH     = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\knn_cache.parquet"

#os.makedirs(os.path.dirname(LOCAL_OUT), exist_ok=True)

def load_or_build_cache():
    # try HF download + read
    try:
        hf_cache = hf_hub_download(
            repo_id=HF_REPO,
            repo_type="dataset",
            subfolder=HF_SUBFOLDER_KNN,
            filename=KNN_NAME,
        )
        print("Downloaded KNN cache from HF:", hf_cache)
        cache = pd.read_parquet(hf_cache)
        print("Loaded cache from HF!")
        return cache
    except Exception as e:
        print(f"HF cache unavailable ({e}); building locally…")

    
    # download embeddings
    hf_emb = hf_hub_download(
        repo_id=HF_REPO,
        repo_type="dataset",
        subfolder=HF_SUBFOLDER,
        filename=FNAME,
    )
    df_emb = pd.read_parquet(hf_emb)

    # metadata
    df_meta = (
        pd.read_json(META_PATH, lines=True)
          .loc[:, ["parent_asin", "title"]]
          .rename(columns={"parent_asin":"item_id", "title":"product_title"})
    )

    df = (
        df_emb.rename(columns={"parent_asin":"item_id"})
              .merge(df_meta, on="item_id", how="left")
    )

    X        = df.filter(regex="^emb_").values
    item_ids = df["item_id"].to_numpy()
    titles   = df["product_title"].to_numpy()

    nn = NearestNeighbors(n_neighbors=TOP_K+1, metric="cosine")
    nn.fit(X)
    distances, indices = nn.kneighbors(X)

    title_map = df.set_index("item_id")["product_title"]

    cache = pd.DataFrame({
        "item_id":        np.repeat(item_ids, TOP_K),
        "item_title":     np.repeat(title_map[item_ids].values, TOP_K),      # ← add this
        "rank":           np.tile(np.arange(1, TOP_K+1), len(item_ids)),
        "neighbor_id":    item_ids[indices[:, 1:]].ravel(),
        "neighbor_title": titles[indices[:, 1:]].ravel(),
        "distance":       distances[:, 1:].ravel(),
    })
    cache.to_parquet(OUT_PATH, index=False)
    print(f"file saved to {OUT_PATH}")
    
    return cache


knn_cache = load_or_build_cache()

Downloaded KNN cache from HF: C:\Users\farho\.cache\huggingface\hub\datasets--glavvrach79--my-recsys-data\snapshots\216e868cbbb098622e859a3b1275a215acc20122\output\knn_cache.parquet
Loaded cache from HF!


### Create Content Based Model Using BERT Embeddings

In [6]:
import pandas as pd


cache = knn_cache

# load the meta table so we can lookup item titles
meta = (
    pd.read_json(META_PATH, lines=True)
      .rename(columns={"parent_asin":"item_id", "title":"product_title"})
      .set_index("item_id")["product_title"]
)

# pick a random item_id
random_id = cache["item_id"].drop_duplicates().sample(1, random_state=42).iloc[0]
random_title = meta[random_id]

# pull its top‑K neighbors from the cache
neighbors = (
    cache[cache["item_id"] == random_id]
      .sort_values("rank")
      .loc[:, ["neighbor_id","neighbor_title","distance"]]
)

print(f"🎲 Randomly selected: {random_id} → {random_title}\n")
print("Top similar items:")
print(neighbors.to_string(index=False))


🎲 Randomly selected: B07BTQKG5J → [UPGRADED] DC47-00019A Samsung Dryer Heating Element & DC96-00887A Thermal Fuse & DC47-00018A Thermostat COMPLETE Dryer Repair Kit Replacement by BlueStars - Exact Fit For Samsung & Kenmore Dryers

Top similar items:
neighbor_id                                                                                                                                                                            neighbor_title  distance
 B0B8MFHWT2    279973 & 3392519 & 8577274 Dryer Thermal Cut-Off Fuse Kit with Thermistor Control and Thermal Fuse by Blutoget -Compatible for Whirlpool, Ken-more, Sam-sung, May-tag electric Dryers.  0.248161
 B09TT3WN9H                                                279838 Heating Element for Kenmore Dryer Compatible Whirlpool Wed4815ew1 Heating Elements Roper Dryer Parts 500 600 70 80 Series Model 110  0.259680
 B078RF452M                      MAYITOP Compatible Dryer Heating Element for Samsung DV42H5200EP/A3-0000, Samsung DV361EWBEW

## Collaborative Filtering Spark Model (ALS)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from huggingface_hub import hf_hub_download

spark = SparkSession.builder \
    .appName("ALSPrep") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# download the reviews parquet from HF
hf_path = hf_hub_download(
    repo_id="glavvrach79/my-recsys-data",
    repo_type="dataset",
    filename="full_review.parquet",
)

# read it directly with spark.read.parquet
als_full_df = spark.read.parquet(hf_path).cache()

#sanity‐check the schema and small sample
als_full_df.printSchema()
als_full_df.limit(5).show()


root
 |-- user_id: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: long (nullable = true)

+--------------------+-----------+------+
|             user_id|parent_asin|rating|
+--------------------+-----------+------+
|AGKHLEW2SOWHNMFQI...| B01N0TQ0OH|     5|
|AHWWLSPCJMALVHDDV...| B07DD37QPZ|     5|
|AHZIJGKEWRTAEOZ67...| B082W3Z9YK|     5|
|AFGUPTDFAWOHHL4LZ...| B078W2BJY8|     5|
|AELFJFAXQERUSMTXJ...| B08C9LPCQV|     5|
+--------------------+-----------+------+



In [8]:
# distinct users + assign stable integer IDs
users = (als_full_df
         .select("user_id")
         .distinct()
         .coalesce(1)
         .withColumn("userIntId", monotonically_increasing_id())
         .cache())

# distinct products + assign stable integer IDs
products = (als_full_df
            .select("parent_asin")
            .distinct()
            .coalesce(1)
            .withColumn("productIntId", monotonically_increasing_id())
            .cache())

# join them back
als_df_int_ids = (als_full_df
                  .join(users,    "user_id",     "left")
                  .join(products, "parent_asin", "left"))

# show a small sample
als_df_int_ids.limit(5).show()


+-----------+--------------------+------+---------+------------+
|parent_asin|             user_id|rating|userIntId|productIntId|
+-----------+--------------------+------+---------+------------+
| B0053F9NY6|AE5AWKLOJHY2E5HEW...|     5|    12645|       27016|
| B0C6FDT9L2|AEIMWAEJUVTMJRYIQ...|     1|    87136|       55207|
| B00Q4X2GDQ|AEVEMRP62HZ7JWQ3N...|     5|    36587|       55622|
| B00Q4X2FSM|AEVEMRP62HZ7JWQ3N...|     5|    36587|       51534|
| B07GRB1BKD|AEXOHXABEMAGG225Y...|     5|   135632|       11256|
+-----------+--------------------+------+---------+------------+



In [9]:
# Split the ratings into training and test data
als_df_final = als_df_int_ids.select(col("userIntId").alias("userId"), col("productIntId").alias("productId"), col("rating"), col('parent_asin'))
als_df_final_cached = als_df_final.cache()

(training_data, test_data) = als_df_final_cached.randomSplit([0.7, 0.3], seed=42)
test_data_cached = test_data.cache()
training_data_cached = training_data.cache()

In [10]:
# Create model without any hyperparameter tuning

# Set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(userCol="userId", itemCol="productId", ratingCol="rating", rank = 10, maxIter = 15, regParam = .1,
          coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)

# Fit the model to the training_data
model = als.fit(training_data_cached)

# Generate predictions on the test_data
test_predictions = model.transform(test_data_cached)
preds = test_predictions.select("userId","productId","rating","prediction")

# preview only the first 10 rows
preds.show(10, truncate=False)

# now it’s safe to shut Spark down
spark.stop()

+------+---------+------+----------+
|userId|productId|rating|prediction|
+------+---------+------+----------+
|28    |4748     |4     |3.6311698 |
|28    |5684     |5     |3.3890977 |
|28    |70604    |4     |1.3954598 |
|148   |3331     |5     |3.846614  |
|148   |21264    |5     |3.665117  |
|155   |70608    |1     |3.215338  |
|183   |53764    |4     |2.090805  |
|211   |10331    |5     |3.868464  |
|385   |36806    |4     |3.0781858 |
|496   |139      |4     |1.5207388 |
+------+---------+------+----------+
only showing top 10 rows


In [11]:
# TO DO: hyperparameter tuning

# # Hyperparameter Tuning
#
# # Use pyspark grid search
# param_grid = ParamGridBuilder() \
#            .addGrid(als.rank, [10, 50, 75, 100]) \
#            .addGrid(als.maxIter, [10]) \
#            .addGrid(als.regParam, [.05, .1, .15]) \
#            .build()
#
# # Create RMSE evaluator
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
#
# # Use cross validation
# cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, collectSubModels=True)
#
# # Checkpoint the training data to truncate its lineage.
# # This is a lazy operation, it will be triggered by the .fit() call.
# training_data_chkp = training_data_cached.checkpoint()
#
# # Fit the cross validator on the CHECKPOINTED DataFrame.
# model = cv.fit(training_data_chkp)
#
# # Best model
# best_model = model.bestModel
#
# # Average RMSE for each model
# avg_rmse_models = model.avgMetrics
#
# display(f"{len(param_grid)} models tested")