# DATA 612 FINAL PROJECT
Amazon Product Recommender Model Using Reviews

* Farhod Ibragimov
* Gillian McGovern

## Objective

Create an offline Amazon product (specifically Amazon appliances) recommender model using user ratings and reviews.

## Data Sources

Source: https://amazon-reviews-2023.github.io/

This is a large-scale Amazon Reviews dataset, collected in 2023 by McAuley Lab, and it includes rich features such as:

*   User Reviews (ratings, text, helpfulness votes, etc.);
*   Item Metadata (descriptions, price, raw image, etc.);
*   Links (user-item / bought together graphs).


User review structure can be found [here](https://amazon-reviews-2023.github.io/#for-user-reviews) and item metadata structure can be found [here](https://amazon-reviews-2023.github.io/#for-item-metadata).

We will be specifically looking at the Appliances category of products, which includes:

* 1.8M Users
* 94.3K Appliances
* 2.1M Ratings/Reviews

The original data is in JSON format.

## Read in the Data

In [180]:
import os
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import torch
import pyarrow as pa
import pyarrow.parquet as pq
from pyspark.sql import SparkSession
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
from sklearn.neighbors import NearestNeighbors
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import floor, round, monotonically_increasing_id, col
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:


if False:
    # One-off preprocessing (kept behind `if False:` so a normal run skips it):
    # stream the raw reviews in chunks, keep 2021–2023 reviews, attach item
    # metadata, and append every chunk to train/valid/test Parquet files (80/10/10).

    # CONFIG
    REVIEW_PATH = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\Appliances.jsonl"
    META_PATH   = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\meta_Appliances.jsonl"
    OUT_DIR     = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output"
    CHUNK_SIZE  = 200_000   # tune based on your machine's RAM
    SEED        = 42

    os.makedirs(OUT_DIR, exist_ok=True)

    # LOAD METADATA (only the columns that are written to the splits)
    meta_pd = pd.read_json(META_PATH, lines=True)[
        ["parent_asin", "average_rating", "rating_number"]
    ]

    # One lazily-created ParquetWriter per split; created on the first non-empty chunk.
    writers = {"train": None, "valid": None, "test": None}

    # Create the generator ONCE. Re-seeding inside the loop would give every chunk
    # the exact same random sequence, so row i of each chunk would always land in
    # the same split.
    rng = np.random.RandomState(SEED)

    try:
        # STREAM, FILTER, SPLIT, AND WRITE
        for chunk in pd.read_json(REVIEW_PATH, lines=True, chunksize=CHUNK_SIZE):
            # 1) keep needed cols + timestamp
            chunk = chunk[["user_id", "parent_asin", "rating", "timestamp", "text"]]

            # 2) filter to years 2021–2023 (timestamp is parsed as epoch milliseconds)
            dt = pd.to_datetime(chunk["timestamp"], unit="ms")
            mask_year = dt.dt.year.between(2021, 2023)
            # .copy() so the column assignment below works on an independent frame
            # instead of a slice of the original chunk
            chunk = chunk.loc[mask_year].copy()
            if chunk.empty:
                continue

            # 3) assign random float for splitting
            chunk["_rand"] = rng.rand(len(chunk))

            # 4) merge metadata (left join keeps reviews without metadata)
            chunk = chunk.merge(meta_pd, on="parent_asin", how="left")

            # 5) define split masks
            masks = {
                "train": chunk["_rand"] <  0.8,
                "valid": (chunk["_rand"] >= 0.8) & (chunk["_rand"] < 0.9),
                "test":  chunk["_rand"] >= 0.9
            }

            # 6) write each split to its Parquet
            for split, m in masks.items():
                sub = chunk.loc[m, [
                    "user_id",
                    "parent_asin",
                    "rating",
                    "text",
                    "average_rating",
                    "rating_number"
                ]]
                if sub.empty:
                    continue
                tbl = pa.Table.from_pandas(sub, preserve_index=False)
                path = os.path.join(OUT_DIR, f"{split}.parquet")
                if writers[split] is None:
                    writers[split] = pq.ParquetWriter(path, schema=tbl.schema)
                writers[split].write_table(tbl)
    finally:
        # close Parquet writers even if a chunk fails, so the files get their footer
        for w in writers.values():
            if w:
                w.close()

    print("Finished writing filtered splits to:", OUT_DIR)
    print("  •", os.path.join(OUT_DIR, "train.parquet"))
    print("  •", os.path.join(OUT_DIR, "valid.parquet"))
    print("  •", os.path.join(OUT_DIR, "test.parquet"))


In [2]:
if False:
    # Report the number of reviews in each split, taken from the Parquet file metadata.
    import pyarrow.parquet as pq

    for split_name in ("train", "valid", "test"):
        split_file = pq.ParquetFile(
            rf"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\{split_name}.parquet"
        )
        n_reviews = split_file.metadata.num_rows
        print(f"{split_name.capitalize()} split: {n_reviews} reviews")


In [3]:
    
if False:
    # Sanity-check the rating column of the train split: range, summary stats, value counts.
    spark = (
        SparkSession.builder
        .appName("CheckRatingRange")
        .config("spark.driver.memory","4g")
        .getOrCreate()
    )

    # Only the rating column is needed for these checks
    train_path = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\train.parquet"
    train = spark.read.parquet(train_path).select("rating")

    # 1) min & max
    rating_range = train.selectExpr("min(rating) AS min_rating", "max(rating) AS max_rating")
    rating_range.show()

    # 2) basic summary (mean, stddev, etc.)
    rating_summary = train.describe("rating")
    rating_summary.show()

    # 3) breakdown by rating value
    rating_counts = train.groupBy("rating").count().orderBy("rating")
    rating_counts.show(5, truncate=False)

    spark.stop()


## BERT Content Based Recommender Model

### Create BERT Embeddings

In [4]:
# DO NOT RUN THIS CELL UNLESS EMBEDDINGS ARE NEEDED
# Obtains one embedding vector per item, trying three sources in order:
#   1) a valid local Parquet file at EMB_OUT,
#   2) the copy stored in the Hugging Face dataset repo,
#   3) recomputing from the review text of the train split.
# The result is then previewed through Spark.
if False:
    import os
    import torch
    import pyarrow as pa
    import pyarrow.parquet as pq
    from pyspark.sql import SparkSession
    from sentence_transformers import SentenceTransformer
    from huggingface_hub import hf_hub_download

    # Location of the published embeddings inside the HF dataset repo
    HF_REPO      = "glavvrach79/my-recsys-data"
    HF_SUBFOLDER = "output/embeddings"
    FNAME        = "item_embeddings.parquet"

    # Local input (train split) and output (embeddings) files
    LOCAL_TRAIN  = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\train.parquet"
    EMB_OUT      = r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\embeddings\item_embeddings.parquet"

    DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
    SAMPLE_PER   = 5    # max number of review texts sampled per item
    BATCH_SIZE   = 64   # batch size passed to model.encode

    os.makedirs(os.path.dirname(EMB_OUT), exist_ok=True)
    print(f"Using device: {DEVICE}\n")

    def is_good_parquet(path: str) -> bool:
        """Try a light PyArrow read to confirm the file is a valid Parquet.

        Returns True when `pq.read_table` succeeds on `path` with no columns
        selected, and False when that read raises any exception.
        """
        try:
            pq.read_table(path, columns=[])  # only footer
            return True
        except Exception:
            return False

    # True once a usable embeddings file has been located (locally or via HF)
    downloaded = False

    # if a local copy already exists & is valid, use it
    if os.path.exists(EMB_OUT) and is_good_parquet(EMB_OUT):
        print(f"Found valid local embeddings → {EMB_OUT}\n")
        downloaded = True

    # else try downloading from HF
    if not downloaded:
        try:
            hf_path = hf_hub_download(
                repo_id=HF_REPO,
                repo_type="dataset",
                subfolder=HF_SUBFOLDER,
                filename=FNAME,
            )
            print(f"📦 Downloaded HF embeddings → {hf_path}")
            if is_good_parquet(hf_path):
                print("HF parquet loads OK. Using HF version.\n")
                # From here on EMB_OUT points at the HF cache file, not the local output path
                EMB_OUT = hf_path
                downloaded = True
            else:
                print("HF parquet is corrupt—will recompute locally.\n")
        except Exception as e:
            # Best effort: any download failure falls through to local computation
            print(f"Could not fetch from HF ({e!r}); will compute locally.\n")

    # if still not obtained, recompute locally
    if not downloaded:
        print("▶️ Computing embeddings locally…\n")
        spark = SparkSession.builder \
            .appName("ComputeItemEmbeddings") \
            .config("spark.driver.memory","16g") \
            .getOrCreate()

        # Pull (item, review text) pairs of the train split into pandas
        df = spark.read.parquet(LOCAL_TRAIN).select("parent_asin", "text")
        pdf = df.toPandas()
        spark.stop()

        model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
        writer = None

        # One embedding per item: mean of the embeddings of up to SAMPLE_PER sampled reviews
        for pid, group in pdf.groupby("parent_asin", sort=False):
            samples = group["text"].sample(
                n=min(len(group), SAMPLE_PER),
                random_state=42
            ).tolist()

            emb_batch = model.encode(
                samples,
                batch_size=BATCH_SIZE,
                show_progress_bar=False,
                convert_to_numpy=True
            )
            mean_emb = emb_batch.mean(axis=0)

            # Single-row table: parent_asin plus one emb_<i> column per vector component
            data = {"parent_asin": [pid]}
            for i, val in enumerate(mean_emb):
                data[f"emb_{i}"] = [float(val)]
            table = pa.Table.from_pydict(data)

            # The writer is created lazily so its schema comes from the first item.
            # NOTE(review): one write_table call per item — presumably one row group
            # per item; consider batching if file size/read speed matters.
            if writer is None:
                writer = pq.ParquetWriter(EMB_OUT, schema=table.schema)
            writer.write_table(table)

        if writer:
            writer.close()

        print(f"✅ Wrote item embeddings locally → {EMB_OUT}\n")

    # preview final
    print(f" Final embeddings path → {EMB_OUT}\n")
    spark = SparkSession.builder \
        .appName("PreviewEmbeddings") \
        .config("spark.driver.memory","4g") \
        .getOrCreate()

    def try_spark_read(path: str):
        """Read `path` as Parquet with Spark; print the error and return None on failure."""
        try:
            return spark.read.parquet(path)
        except Exception as e:
            print(f"Spark read failed on `{path}`:\n   {e}")
            return None

    emb_df = try_spark_read(EMB_OUT)

    # If Spark failed on the HF-cached file, fall back to pandas -> Spark
    if emb_df is None:
        import pandas as pd
        print("Falling back to pandas.read_parquet …")
        pdf = pd.read_parquet(EMB_OUT)
        emb_df = spark.createDataFrame(pdf)

    emb_df.show(5, truncate=50)
    print("Total items:", emb_df.count())
    spark.stop()


In [169]:
# Peek at the raw item metadata (JSON Lines file, one record per line).
df_meta = pd.read_json(META_PATH, lines=True)
df_meta.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Industrial & Scientific,"ROVSUN Ice Maker Machine Countertop, Make 44lb...",3.7,61,[【Quick Ice Making】This countertop ice machine...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Our Point of View on the Euhomy Ic...,ROVSUN,"[Appliances, Refrigerators, Freezers & Ice Mak...","{'Brand': 'ROVSUN', 'Model Name': 'ICM-2005', ...",B08Z743RRD,,,
1,Tools & Home Improvement,"HANSGO Egg Holder for Refrigerator, Deviled Eg...",4.2,75,"[Plastic, Practical Kitchen Storage - Our egg ...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': '10 Eggs Egg Holder for Refrigerato...,HANSGO,"[Appliances, Parts & Accessories, Refrigerator...","{'Manufacturer': 'HANSGO', 'Part Number': 'HAN...",B097BQDGHJ,,,
2,Tools & Home Improvement,"Clothes Dryer Drum Slide, General Electric, Ho...",3.5,18,[],"[Brand new dryer drum slide, replaces General ...",,[{'thumb': 'https://m.media-amazon.com/images/...,[],GE,"[Appliances, Parts & Accessories]","{'Manufacturer': 'RPI', 'Part Number': 'WE1M33...",B00IN9AGAE,,,
3,Tools & Home Improvement,154567702 Dishwasher Lower Wash Arm Assembly f...,4.5,26,[MODEL NUMBER:154567702 Dishwasher Lower Wash ...,[MODEL NUMBER:154567702 Dishwasher Lower Wash ...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],folosem,"[Appliances, Parts & Accessories, Dryer Parts ...","{'Manufacturer': 'folosem', 'Part Number': '15...",B0C7K98JZS,,,
4,Tools & Home Improvement,Whirlpool W10918546 Igniter,3.8,12,[This is a Genuine OEM Replacement Part.],[Whirlpool Igniter],25.07,[{'thumb': 'https://m.media-amazon.com/images/...,[],Whirlpool,"[Appliances, Parts & Accessories]","{'Manufacturer': 'Whirlpool', 'Part Number': '...",B07QZHQTVJ,,,


In [5]:
# CONFIG
# Hugging Face dataset repo and the files/subfolders read from it
HF_REPO      = "glavvrach79/my-recsys-data"
HF_SUBFOLDER_KNN = "output"
HF_SUBFOLDER = "output/embeddings"
FNAME        = "item_embeddings.parquet"
TRAIN_NAME   = "train.parquet"
TEST_NAME  = "test.parquet"
VALID_NAME = "valid.parquet"
FULL_REVIEW_NAME = "full_review.parquet"
KNN_NAME = "knn_cache.parquet"

# Item metadata file; override with the META_PATH environment variable.
# Raw string: the Windows default contains backslashes (\C, \D, \d, ...) that are
# invalid escape sequences in a normal string literal.
META_PATH = os.getenv('META_PATH', r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\data\meta_Appliances.jsonl")

# Number of nearest neighbors stored per item in the KNN cache
TOP_K        = 5
# Where a locally built KNN cache is written; override with the OUT_PATH environment variable.
OUT_PATH = os.getenv('OUT_PATH', r"C:\CUNY_MSDS\DATA612\AMAZON_PROJECT\output\knn_cache.parquet")

#os.makedirs(os.path.dirname(LOCAL_OUT), exist_ok=True)

def load_or_build_cache():
    """Return the item-to-item nearest-neighbor cache as a pandas DataFrame.

    First tries to download and read the prebuilt cache (``KNN_NAME``) from the
    Hugging Face dataset repo. If that fails for any reason, the cache is rebuilt
    locally: item embeddings are downloaded, joined with item titles from
    ``META_PATH``, and the ``TOP_K`` cosine nearest neighbors of every item are
    computed and written to ``OUT_PATH``.

    Returns
    -------
    pandas.DataFrame
        For the locally built cache: one row per (item, neighbor) pair with
        columns ``item_id``, ``item_title``, ``rank`` (1..TOP_K),
        ``neighbor_id``, ``neighbor_title`` and ``distance``. The downloaded
        cache is returned as read from the Parquet file.
    """
    # try HF download + read
    try:
        hf_cache = hf_hub_download(
            repo_id=HF_REPO,
            repo_type="dataset",
            subfolder=HF_SUBFOLDER_KNN,
            filename=KNN_NAME,
        )
        print("Downloaded KNN cache from HF:", hf_cache)
        cache = pd.read_parquet(hf_cache)
        print("Loaded cache from HF!")
        return cache
    except Exception as e:
        # Deliberate best effort: any failure falls back to building locally
        print(f"HF cache unavailable ({e}); building locally…")

    # download embeddings
    hf_emb = hf_hub_download(
        repo_id=HF_REPO,
        repo_type="dataset",
        subfolder=HF_SUBFOLDER,
        filename=FNAME,
    )
    df_emb = pd.read_parquet(hf_emb)

    # metadata — pass the path itself (the previous f'r{META_PATH}' prepended a
    # literal "r" to the path, so the file could never be found)
    df_meta = (
        pd.read_json(META_PATH, lines=True)
          .loc[:, ["parent_asin", "title"]]
          .rename(columns={"parent_asin":"item_id", "title":"product_title"})
    )

    df = (
        df_emb.rename(columns={"parent_asin":"item_id"})
              .merge(df_meta, on="item_id", how="left")
    )

    # Row-aligned arrays: embedding matrix, item ids and titles share the row order of df
    X        = df.filter(regex="^emb_").values
    item_ids = df["item_id"].to_numpy()
    titles   = df["product_title"].to_numpy()

    # TOP_K + 1 neighbors because the first hit is dropped below as the item itself
    # NOTE(review): assumes each row's nearest neighbor is itself — confirm for
    # items with identical embeddings.
    nn = NearestNeighbors(n_neighbors=TOP_K+1, metric="cosine")
    nn.fit(X)
    distances, indices = nn.kneighbors(X)

    cache = pd.DataFrame({
        "item_id":        np.repeat(item_ids, TOP_K),
        # positional repeat keeps titles aligned with item_ids even when an
        # item_id occurs more than once (a label lookup would expand such rows)
        "item_title":     np.repeat(titles, TOP_K),
        "rank":           np.tile(np.arange(1, TOP_K+1), len(item_ids)),
        "neighbor_id":    item_ids[indices[:, 1:]].ravel(),
        "neighbor_title": titles[indices[:, 1:]].ravel(),
        "distance":       distances[:, 1:].ravel(),
    })

    # Make sure the target folder exists before writing
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    cache.to_parquet(OUT_PATH, index=False)
    print(f"file saved to {OUT_PATH}")

    return cache


# Neighbor table used by the content-based recommender (downloaded from HF, or rebuilt locally as a fallback)
knn_cache = load_or_build_cache()

Downloaded KNN cache from HF: /Users/gillianmcgovern/.cache/huggingface/hub/datasets--glavvrach79--my-recsys-data/snapshots/216e868cbbb098622e859a3b1275a215acc20122/output/knn_cache.parquet
Loaded cache from HF!


### Create Content Based Model Using BERT Embeddings

In [168]:
# NOTE: `meta` is only assigned in the next cell; on a fresh kernel run that cell first.
meta.head()

item_id
B08Z743RRD    ROVSUN Ice Maker Machine Countertop, Make 44lb...
B097BQDGHJ    HANSGO Egg Holder for Refrigerator, Deviled Eg...
B00IN9AGAE    Clothes Dryer Drum Slide, General Electric, Ho...
B0C7K98JZS    154567702 Dishwasher Lower Wash Arm Assembly f...
B07QZHQTVJ                          Whirlpool W10918546 Igniter
Name: product_title, dtype: object

In [140]:
import pandas as pd
from tabulate import tabulate

# Alias read by `predict_using_bert`
CACHE = knn_cache

# Title lookup: a Series of product titles indexed by item id
meta = (
    pd.read_json(META_PATH, lines=True)
      .rename(columns={"parent_asin":"item_id", "title":"product_title"})
      .set_index("item_id")
      .loc[:, "product_title"]
)

def predict_using_bert(item_id, cache=None):
    """Return the cached nearest neighbors of ``item_id``, best match first.

    Parameters
    ----------
    item_id
        Value to match against the ``item_id`` column of the cache.
    cache : pandas.DataFrame, optional
        Neighbor table with at least the columns ``item_id``, ``rank``,
        ``neighbor_id``, ``neighbor_title`` and ``distance``. When omitted,
        the module-level ``CACHE`` is used (the original behavior).

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``rank``, restricted to ``neighbor_id``,
        ``neighbor_title`` and ``distance``. Empty when ``item_id`` is not in
        the cache.
    """
    if cache is None:
        cache = CACHE

    # pull its top-K neighbors from the cache
    neighbors = (
        cache[cache["item_id"] == item_id]
            .sort_values("rank")
            .loc[:, ["neighbor_id","neighbor_title","distance"]]
    )
    return neighbors

# Pick one cached item reproducibly (fixed seed), then fetch its title and neighbors
candidate_ids = CACHE["item_id"].drop_duplicates()
random_id = candidate_ids.sample(n=1, random_state=42).iloc[0]
random_title = meta[random_id]
neighbors = predict_using_bert(random_id)

print(f"🎲 Randomly selected: {random_id} → {random_title}\n")
print("Top similar items:")
neighbors_text = neighbors.to_string(
    index=False,
    float_format="{:.4f}".format,
    justify='left',
    max_colwidth=150,
)
print(neighbors_text)

# Keep the neighbor table on disk
neighbors.to_parquet('BERT_similar_items.parquet', engine='pyarrow', compression='snappy')

🎲 Randomly selected: B07BTQKG5J → [UPGRADED] DC47-00019A Samsung Dryer Heating Element & DC96-00887A Thermal Fuse & DC47-00018A Thermostat COMPLETE Dryer Repair Kit Replacement by BlueStars - Exact Fit For Samsung & Kenmore Dryers

Top similar items:
neighbor_id neighbor_title                                                                                                                                          distance
B0B8MFHWT2  279973 & 3392519 & 8577274 Dryer Thermal Cut-Off Fuse Kit with Thermistor Control and Thermal Fuse by Blutoget -Compatible for Whirlpool, Ken-more,... 0.2482   
B09TT3WN9H              279838 Heating Element for Kenmore Dryer Compatible Whirlpool Wed4815ew1 Heating Elements Roper Dryer Parts 500 600 70 80 Series Model 110 0.2597   
B078RF452M  MAYITOP Compatible Dryer Heating Element for Samsung DV42H5200EP/A3-0000, Samsung DV361EWBEWR/A3-0001, Samsung DV219AEB/XAA-0000, Samsung DV393ETPA... 0.2622   
B082PJ9BWH           137114000 Dryer Heating Element Kit,

In [170]:
# (rows, columns) of the neighbor table for the sampled item
neighbors.shape

(5, 3)

## Collaborative Filtering Spark Model (ALS)

### Create Spark Session

In [183]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import monotonically_increasing_id
from huggingface_hub import hf_hub_download
conf = SparkConf()
conf.setMaster("local[*]").setAppName("SparkALS")
# With a local[*] master the work runs inside the driver JVM, so the heap has to be
# sized through spark.driver.memory; spark.executor.memory alone does not enlarge it.
# NOTE(review): this only takes effect when no Spark JVM is already running in the
# kernel (getOrCreate reuses an existing session) — restart the kernel if unsure.
conf.set("spark.driver.memory", "16g")
conf.set("spark.executor.memory", "16g")

spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Checkpoint directory for truncating long lineages (used by the tuning cell's checkpoint())
spark.sparkContext.setCheckpointDir("checkpoint_dir_als")

# download the reviews parquet from HF
hf_path = hf_hub_download(
    repo_id="glavvrach79/my-recsys-data",
    repo_type="dataset",
    filename="full_review.parquet",
)

# read it directly with spark.read.parquet
als_full_df = spark.read.parquet(hf_path).cache()

# sanity-check the schema and small sample
als_full_df.printSchema()
als_full_df.limit(5).show()


25/07/17 20:51:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- user_id: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: long (nullable = true)



[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+-----------+------+
|             user_id|parent_asin|rating|
+--------------------+-----------+------+
|AGKHLEW2SOWHNMFQI...| B01N0TQ0OH|     5|
|AHWWLSPCJMALVHDDV...| B07DD37QPZ|     5|
|AHZIJGKEWRTAEOZ67...| B082W3Z9YK|     5|
|AFGUPTDFAWOHHL4LZ...| B078W2BJY8|     5|
|AELFJFAXQERUSMTXJ...| B08C9LPCQV|     5|
+--------------------+-----------+------+



                                                                                

In [184]:
# Item id + title as a Spark DataFrame.
# NOTE: this rebinds META_PATH and `meta`, which earlier cells assign differently.
META_PATH = r"data/meta_Appliances.jsonl"
meta_pdf = pd.read_json(META_PATH, lines=True)
meta = spark.createDataFrame(meta_pdf[['parent_asin', 'title']])

### Set Up Data for Spark ALS

In [185]:
# ALS is fitted on integer user/product ids later on, so the string ids are mapped to integers here.

# distinct users + assign stable integer IDs
# coalesce(1) collapses the distinct ids into a single partition before numbering;
# with one partition the generated ids are expected to run consecutively from 0.
users = (als_full_df
         .select("user_id")
         .distinct()
         .coalesce(1)
         .withColumn("userIntId", monotonically_increasing_id())
         .cache())

# distinct products + assign stable integer IDs (same approach as for users)
products = (als_full_df
            .select("parent_asin")
            .distinct()
            .coalesce(1)
            .withColumn("productIntId", monotonically_increasing_id())
            .cache())

# join them back
als_df_int_ids = (als_full_df
                  .join(users,    "user_id",     "left")
                  .join(products, "parent_asin", "left"))

als_df_int_ids_cached = als_df_int_ids.cache()
# Attach product titles. This is an inner join (default), so ratings whose
# parent_asin is missing from `meta` are dropped. Note that the name is rebound to
# the join result, which itself is not cached here.
als_df_int_ids_cached = als_df_int_ids_cached.join(meta.select(col("parent_asin"), col("title")), on=['parent_asin'])

# show a small sample
als_df_int_ids_cached.limit(5).show()


25/07/17 20:52:21 WARN TaskSetManager: Stage 19 contains a task of very large size (1361 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+-----------+--------------------+------+---------+------------+--------------------+
|parent_asin|             user_id|rating|userIntId|productIntId|               title|
+-----------+--------------------+------+---------+------------+--------------------+
| B00002N7HY|AF7LBPZDD75YWNVPX...|     5|   115273|        4824|Leviton 5050 50 A...|
| B00002N7HY|AEHB3PLJAATO6456H...|     5|  1063587|        4824|Leviton 5050 50 A...|
| B00002N7HY|AEVQF4T2AFLPYBO25...|     5|  1092242|        4824|Leviton 5050 50 A...|
| B00002N7HY|AF7B74NC2WNXBHEVA...|     5|  1362929|        4824|Leviton 5050 50 A...|
| B00002N7HY|AE25KDKKEX3QMWVQ3...|     5|  1434114|        4824|Leviton 5050 50 A...|
+-----------+--------------------+------+---------+------------+--------------------+



### Split the Data for Spark ALS

In [186]:
# Rename the integer id columns to the names ALS is configured with, then split 70/30
als_df_final = als_df_int_ids_cached.select(
    col("userIntId").alias("userId"),
    col("productIntId").alias("productId"),
    "rating",
    "title",
    "parent_asin",
)
als_df_final_cached = als_df_final.cache()

# Fixed seed keeps the train/test split reproducible
training_data, test_data = als_df_final_cached.randomSplit([0.7, 0.3], seed=42)
training_data_cached = training_data.cache()
test_data_cached = test_data.cache()

### Create Simple ALS Model (No Tuning)

In [187]:
# Baseline ALS model with fixed hyperparameters (no tuning)

# coldStartStrategy="drop" removes rows with NaN predictions so evaluation metrics stay defined
als = ALS(
    userCol="userId",
    itemCol="productId",
    ratingCol="rating",
    rank=10,
    maxIter=15,
    regParam=.1,
    coldStartStrategy="drop",
    nonnegative=True,
    implicitPrefs=False,
)

# Train on the training split
model = als.fit(training_data_cached)

# Score the held-out split
test_predictions = model.transform(test_data_cached)
test_predictions_cached = test_predictions.cache()

# Show just the first 10 predictions
test_predictions_cached.limit(10).show(truncate=False)

25/07/17 20:52:44 WARN TaskSetManager: Stage 30 contains a task of very large size (1361 KiB). The maximum recommended task size is 1000 KiB.

+------+---------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+----------+
|userId|productId|rating|title                                                                                                                                                                                                  |parent_asin|prediction|
+------+---------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+----------+
|471   |38208    |4     |Waterdrop DA29-00020B NSF 53&42 Certified Refrigerator Water Filter, Replacement for Samsung DA29-00020B, DA29-00020A, HAF-CIN/EXP, 46-9101, WDS-F27, 1 Filter                                         |B0BVFQY9HH |3.7422144 |
|410

                                                                                

#### Hyperparameter Tuning

In [188]:
# Takes a while with current data
# # Hyperparameter tuning
#
# # Use pyspark grid search
# param_grid = ParamGridBuilder() \
#            .addGrid(als.rank, [10, 50, 75, 100]) \
#            .addGrid(als.maxIter, [10]) \
#            .addGrid(als.regParam, [.05, .1, .15]) \
#            .build()
#
# # Create RMSE evaluator
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
#
# # Use cross validation
# cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, collectSubModels=True)
#
# # Checkpoint the training data to truncate its lineage.
# # This is a lazy operation, it will be triggered by the .fit() call.
# training_data_chkp = training_data_cached.checkpoint()
#
# # Fit the cross validator on the CHECKPOINTED DataFrame.
# model = cv.fit(training_data_chkp)
#
# # Best model
# best_model = model.bestModel
#
# # Average RMSE for each model
# avg_rmse_models = model.avgMetrics
#
# display(f"{len(param_grid)} models tested")

[Stage 1081:>                                                      (0 + 8) / 10]

KeyboardInterrupt: 

#### Best ALS Model

In [None]:
# `best_model` is only created by the hyperparameter-tuning cell, whose code is
# currently commented out; guard the lookup so Restart & Run All does not stop
# here with a NameError.
if "best_model" in globals():
    display(f"Best Model Rank: {best_model.rank}")
    display(f"Best Model Params: {best_model.params}")
else:
    display("Hyperparameter tuning was not run; no best model to report.")

#### Get ALS Recommendations

In [104]:
# Get recommendations for a single user

# Generate the top product recommendations for the chosen user
TARGET_USER_ID = 21220
N_RECOMMENDATIONS = 10

user_ratings = als_df_final_cached.filter(col("userId") == TARGET_USER_ID)
# Only the (distinct) user id is needed to request recommendations
users = user_ratings.select("userId").distinct()
userSubsetRecs = model.recommendForUserSubset(users, N_RECOMMENDATIONS)
userSubsetRecs.show(truncate=False)

# Products the user already rated; a set gives constant-time membership checks
rated_items = {
    row["productId"]
    for row in user_ratings.select("productId").distinct().collect()
}

# The result can be empty (e.g. the user is unknown to the fitted model), so do
# not index [0] blindly.
rec_rows = userSubsetRecs.collect()
recommendations = rec_rows[0]["recommendations"] if rec_rows else []
filtered_recs = [rec for rec in recommendations if rec.productId not in rated_items]

# productId -> (parent_asin, title) lookup used to label the recommendations
als_df_final_cached_mapping = als_df_final_cached.select('productId', 'parent_asin', 'title').dropDuplicates()

if filtered_recs:
    filtered_recs_df = spark.createDataFrame(filtered_recs)
    filtered_recs_df = filtered_recs_df.join(als_df_final_cached_mapping, 'productId', "left").sort("rating", ascending = False)
    filtered_recs_df.show()
else:
    # createDataFrame cannot infer a schema from an empty list
    print(f"No new recommendations available for user {TARGET_USER_ID}.")

                                                                                

+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                              |
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|21220 |[{79921, 6.7020807}, {42789, 6.698961}, {86059, 6.668425}, {43595, 6.6322174}, {7816, 6.581626}, {79073, 6.560703}, {30237, 6.558498}, {42427, 6.556854}, {92458, 6.5219}, {81490, 6.520079}]|
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [130]:
# Products user 21220 has already rated
user_product_ids = als_df_final_cached.where(col("userId") == 21220).select("productId")
previously_rated_items = user_product_ids.rdd.map(lambda row: row[0]).collect()

# Keep only recommendations the user has not reviewed yet
first_user_recs = userSubsetRecs.collect()[0]["recommendations"]
ALS_filtered_recommendations = []
for rec in first_user_recs:
    if rec.productId in previously_rated_items:
        continue
    ALS_filtered_recommendations.append(rec)
ALS_filtered_recommendations_df = spark.createDataFrame(ALS_filtered_recommendations)

# Attach parent_asin and title to each recommended productId
als_df_final_cached_mapping = (
    als_df_final_cached
    .select('productId', 'parent_asin', 'title')
    .dropDuplicates()
)
als_df_final_cached_mapping_cached = als_df_final_cached_mapping.cache()
ALS_filtered_recommendations_df = (
    ALS_filtered_recommendations_df
    .join(als_df_final_cached_mapping_cached, 'productId', "left")
    .sort("rating", ascending = False)
)
ALS_filtered_recommendations_df_cached = ALS_filtered_recommendations_df.cache()
ALS_filtered_recommendations_df_cached.show()

                                                                                

+---------+------------------+-----------+--------------------+
|productId|            rating|parent_asin|               title|
+---------+------------------+-----------+--------------------+
|    79921| 6.702080726623535| B002XILY7K|BUNN REGFILTER 12...|
|    42789| 6.698960781097412| B003EAHXO4|Whirlpool 3300163...|
|    86059|   6.6684250831604| B00CWSZJD6|GE JT5500SFSS Ele...|
|    43595|6.6322174072265625| B00WMUJ1LU|PS1485610 - Repla...|
|     7816| 6.581625938415527| B0145SOZ5S|316557237 Range O...|
|    79073| 6.560702800750732| B06X9QJFSF|GE APPLIANCE PART...|
|    30237| 6.558497905731201| B00XN3DSBE|GENUINE Frigidair...|
|    42427| 6.556853771209717| B000ZIMP9G|Whirlpool Part Nu...|
|    92458| 6.521900177001953| B07L522PW2|Value 4 pack Kali...|
|    81490| 6.520079135894775| B08C4VMP74|Ecumfy 4681EA2002...|
+---------+------------------+-----------+--------------------+



In [132]:
# Persist the labelled recommendations as Parquet, replacing any previous output
(
    ALS_filtered_recommendations_df_cached
    .write
    .mode("overwrite")
    .parquet(r"output/ALS_item_recs.parquet")
)

In [189]:
# Shut down the Spark session; the recommendations were already written to disk above
spark.stop()

25/07/17 21:05:04 ERROR Instrumentation: org.apache.spark.SparkException: Job 126 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1301)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:1299)
	at scala.collection.mutable.HashSet$Node.foreach(HashSet.scala:450)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:376)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:1299)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:3234)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:85)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$stop$3(DAGScheduler.scala:3120)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1300)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:3120)
	at org.apache.spark.SparkContext.$anonfun

## Simple Hybrid Model

The purpose of this simple hybrid model is to take the recommendations from ALS (collaborative filtering) and rerank them by their BERT similarity/distance score (content-based recommender) relative to the top ALS recommendation.

In [237]:
# Load the ALS recommendations saved by the Spark section.
# Sort by the raw predicted rating BEFORE clipping: after clipping to [0, 5] many
# rows can tie at 5.0, and the row order of the Parquet read is not relied upon
# to match the order Spark sorted them in.
ALS_item_recommendations = (
    pd.read_parquet("output/ALS_item_recs.parquet")
      .sort_values("rating", ascending=False, ignore_index=True)
)
# Predictions can fall outside the 0–5 rating scale; clip them for display
ALS_item_recommendations['rating'] = ALS_item_recommendations['rating'].clip(lower=0, upper=5)
print(ALS_item_recommendations)
# Find top item recommended (positional lookup, independent of index labels)
top_rec_item = ALS_item_recommendations['parent_asin'].iloc[0]

   productId  rating parent_asin                                              title
0      79921    5.00  B002XILY7K  BUNN REGFILTER 12-Cup Regular Filters, Use wit...
1      42789    5.00  B003EAHXO4                           Whirlpool 33001634 Timer
2      86059    5.00  B00CWSZJD6            GE JT5500SFSS Electric Double Wall Oven
3      43595    5.00  B00WMUJ1LU  PS1485610 - Replacement Washer Washing Machine...
4       7816    5.00  B0145SOZ5S  316557237 Range Oven Control Board and Clock G...
5      79073    5.00  B06X9QJFSF  GE APPLIANCE PARTS Smooth KIT WR12X22148 GE Ap...
6      30237    5.00  B00XN3DSBE  GENUINE Frigidaire 241881318 Refrigerator Door...
7      42427    5.00  B000ZIMP9G       Whirlpool Part Number JEA7000ADW: CARTRIDGE-
8      92458    5.00  B07L522PW2  Value 4 pack Kalita Wave Filters, 185, Pack of...
9      81490    5.00  B08C4VMP74  Ecumfy 4681EA2002H Drain Pump Compatible with ...


In [162]:
# Look up the cached BERT nearest neighbors of the top ALS recommendation
top_item_neighbors = predict_using_bert(top_rec_item)
top_item_neighbors.head()

Unnamed: 0,neighbor_id,neighbor_title,distance
2715,B0B8MFHWT2,279973 & 3392519 & 8577274 Dryer Thermal Cut-O...,0.25
2716,B09TT3WN9H,279838 Heating Element for Kenmore Dryer Compa...,0.26
2717,B078RF452M,MAYITOP Compatible Dryer Heating Element for S...,0.26
2718,B082PJ9BWH,"137114000 Dryer Heating Element Kit,137032600 ...",0.26
2719,B0BWCTGXFT,(2023 Update) 3392519 Dryer Thermal Fuse Repla...,0.27


In [167]:
# Flag each ALS recommendation that also appears among the BERT neighbors of the top item
bert_neighbor_ids = top_item_neighbors['neighbor_id']
bert_containing_ALS_item = ALS_item_recommendations['parent_asin'].isin(bert_neighbor_ids)

print("Are any items in the BERT recommender?")
print(bert_containing_ALS_item)

Are any items in the BERT recommender?
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: parent_asin, dtype: bool


In [203]:
# # Hold off until we have new data/larger KNN?
#
# # Need to match on parent_asin
# top_item_neighbors = top_item_neighbors.rename(columns={'neighbor_id': 'parent_asin'})
#
# # Add in distance score and rerank
# ALS_item_recs_BERT_rerank = ALS_item_recommendations.join(top_item_neighbors, 'parent_asin', "left").sort("distance", ascending = False)

## Neural Network PyTorch?

In [241]:
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
import random
import warnings
import zipfile
from pathlib import Path

import pandas as pd
import tensorflow as tf
import tqdm
from libreco.algorithms import NCF
warnings.filterwarnings("ignore")