# **Step 1: Initialize Spark**




First, set up your Spark environment to handle large-scale data processing:

Set Spark Settings to handle large data of about 20GB

In [1]:
pip install sparse

Collecting sparse
  Downloading sparse-0.15.1-py2.py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sparse
Successfully installed sparse-0.15.1


In [2]:
import time
import dask.array as da
import numpy as np
import sys
from dask.distributed import Client
from dask import compute
import sparse
import seaborn as sns
import matplotlib.pyplot as plt
import os
from datetime import date


In [3]:
from dask.distributed import Client
import pandas as pd


In [4]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=4617f0800909eff1ab66f955f044ee5e93533ba87a82981eceb1280d42abb26a
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Sentiment Detection") \
    .master("local[*]") \
    .config("spark.executor.memory", "10g") \
    .config("spark.driver.memory", "10g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "10g") \
    .config("spark.default.parallelism", "200") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC") \
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# **Step 2: Load the data**

Read the dataset from a file, which could be stored on your Google Drive:

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Adding schematic view while loading to speaden up the process rather than inferring a schema

In [19]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),  # Change to IntegerType if applicable
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), True),
    StructField("product_title", StringType(), True),
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", DateType(), True)
])

Loading the data and adding it to a data frame

In [21]:
from pyspark.sql import SparkSession

# Start Spark session
spark = SparkSession.builder \
    .appName("Load Multiple TSV Files") \
    .getOrCreate()

# List of file paths
data_paths = [
    '/content/drive/MyDrive/archive/amazon_reviews_us_Apparel_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Automotive_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Baby_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Beauty_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Books_v1_02.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Camera_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Electronics_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Furniture_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Sports_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Grocery_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv',
    '/content/drive/MyDrive/archive/amazon_reviews_us_Music_v1_00.tsv'
]

# Read and sample from each dataset
sampled_dfs = []
for path in data_paths:
    df = spark.read.option("treatEmptyValuesAsNulls", "true").option("sep", "\t").csv(path, schema=schema, sep='\t', header=True)
    sampled_df = df.limit(17000)  # Takes the first 17,000 rows
    sampled_dfs.append(sampled_df)

# Union all the sampled dataframes into one
df = sampled_dfs[0]
for dataframe in sampled_dfs[1:]:
    df = df.union(dataframe)

# Show some data
df.show()

# You can now work with 'final_df' which contains 17,000 rows from each file

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   32158956|R1KKOXHNI8MSXU|B01KL6O72Y|      24485154|Easy Tool Stainle...|         Apparel|          4|            0|          0|   N|                Y|★ THESE REALLY DO...|These Really Do W...| 2013-01-14|
|         US|    2714559|R26SP2OPDK4HT7|B01ID3ZS5W|     363128556|V28 Women Cowl Ne...|         Apparel|          5|    

# **Step 3: Data Preprocessing**

Implement steps to clean and preprocess the data, ensuring it is ready for analysis or machine learning.

In [22]:
from pyspark.sql.functions import col, lower, regexp_replace, concat_ws, udf, substring, row_number

# Remove duplicates
df = df.dropDuplicates()

# Handle missing values for both review_body and review_headline simultaneously
df = df.na.fill({
    "review_body": "No review text",
    "review_headline": "No review headline"
})

# Concatenate cleaned review headline and cleaned review text
df = df.withColumn("Full_Review", concat_ws(". ", "review_headline", "review_body"))

# Truncate full_text to the first 512 characters
df = df.withColumn("Full_Review", substring(col("Full_Review"), 1, 512))

df = df.drop("marketplace", "review_id", "helpful_votes", "total_votes", "vine", "verified_purchase", "review_body","review_headline", "review_date", "product_parent","Full_Review")

df.cache()

DataFrame[customer_id: string, product_id: string, product_title: string, product_category: string, star_rating: int]

In [23]:
df = df.repartition(200)
df.cache()

DataFrame[customer_id: string, product_id: string, product_title: string, product_category: string, star_rating: int]

Converting the Dataset to a Panda's Dataframe

In [24]:
local_data = df.toPandas()

In [25]:
local_data.head()

Unnamed: 0,customer_id,product_id,product_title,product_category,star_rating
0,43638923,B0052SCU8U,AmazonBasics High Speed HDMI Cable,Electronics,4.0
1,43828045,B00A1EGOXM,AHB Chicago Pub Wall Table,Furniture,5.0
2,44326177,B007B5WHTE,Philips Bluetooth Soundbar Speaker with Subwoo...,Electronics,5.0
3,42024336,B00B25P27S,Emergency AM/FM/WX Crank Radio 20-576,Electronics,5.0
4,470490,B00VM5HIC0,Music for My Friends,Music,5.0


In [26]:
import pandas as pd

# Assuming 'df' is your DataFrame
# Convert columns to the appropriate types
local_data['customer_id'] = local_data['customer_id'].astype(int)

local_data['star_rating'] = local_data['star_rating'].astype(float)

# Show the first few rows to verify the changes
(local_data.head())

Unnamed: 0,customer_id,product_id,product_title,product_category,star_rating
0,43638923,B0052SCU8U,AmazonBasics High Speed HDMI Cable,Electronics,4.0
1,43828045,B00A1EGOXM,AHB Chicago Pub Wall Table,Furniture,5.0
2,44326177,B007B5WHTE,Philips Bluetooth Soundbar Speaker with Subwoo...,Electronics,5.0
3,42024336,B00B25P27S,Emergency AM/FM/WX Crank Radio 20-576,Electronics,5.0
4,470490,B00VM5HIC0,Music for My Friends,Music,5.0


In [66]:
local_data['product_id_encoded'], _ = pd.factorize(local_data['product_id'])

# Show the first few rows to verify the changes
print(local_data.head())

# Optionally, check how many unique product IDs you have encoded
print("Unique product IDs:", local_data['product_id_encoded'].nunique())

Unique product IDs: 134135


In [28]:

local_data['star_rating'] = local_data['star_rating'].fillna(0.0)


print(local_data['star_rating'].isnull().sum())

0


In [29]:
local_data.count()

customer_id           204000
product_id            204000
product_title         204000
product_category      203997
star_rating           204000
product_id_encoded    204000
dtype: int64

In [30]:
from datetime import datetime

# **FUNK SVD ALGORITHM**

In [83]:
#SVD Algorithm
import time
import dask.array as da
import numpy as np
import sys
from dask.distributed import Client
from dask import compute
import sparse
import seaborn as sns
import matplotlib.pyplot as plt
import os
from datetime import datetime

class FunkSVD:
  def __init__(self, client: Client):
    self.client = client

  def __preprocess_data(self, df, user_col, item_col, rating_col, chunk_size, n_factors):
    start_time = time.time()
    self.__print_status(0, 100, start_time, "Preprocessing data...")
    self.chunk_size = chunk_size
    self.n_factors = n_factors
    self.user_col = user_col
    self.item_col = item_col
    self.rating_col = rating_col
    self.train_df = df

    self.u_ids = df[user_col].unique()
    self.i_ids = df[item_col].unique()

    self.u_mapping = { x: i for i, x in enumerate(self.u_ids) }
    self.i_mapping = { x: i for i, x in enumerate(self.i_ids) }
    df['u_encodings'] = df[user_col].map(self.u_mapping)
    df['i_encodings'] = df[item_col].map(self.i_mapping)
    self.__print_status(50, 100, start_time, "Preprocessing data...")

    self.n_users = len(self.u_ids)
    self.n_items = len(self.i_ids)
    self.n_ratings = df.shape[0]

    self.min_rating = np.min(df[rating_col])
    self.max_rating = np.max(df[rating_col])
    self.mean_rating = np.mean(df[rating_col])

    df = df[["u_encodings", "i_encodings", rating_col]]
    self.df = df
    self.__print_status(100, 100, start_time, "Preprocessing data...")
    print()
    return df

  def __create_sparse_chunked_matrix(self, df):
    start_time = time.time()
    df_val = df.values
    sparse_df = sparse.COO(df_val[:, :2].T.astype(int), df_val[:, 2], shape=((self.n_users, self.n_items)))

    chunks = []
    for i in range(0, self.n_users, self.chunk_size):
      sub_chunks=[]
      self.__print_status(i + self.chunk_size, self.n_users, start_time, "Creating sparse-chunked matrix...")
      for j in range(0, self.n_items, self.chunk_size):
        sub_chunks.append(sparse_df[i: i + self.chunk_size, j: j + self.chunk_size])
      chunks.append(sub_chunks)

    self.__print_status(self.n_users, self.n_users, start_time, "Creating sparse-chunked matrix...")
    x = da.block(chunks)
    x_mask = da.sign(x).map_blocks(lambda x: x.todense(), dtype=np.ndarray) == 1
    print()

    return x, x_mask

  def __init_biases(self):
    u_biases = da.zeros((self.n_users, 1), chunks=(self.chunk_size,1))
    i_biases = da.zeros(self.n_items, chunks=(self.chunk_size,))
    return u_biases, i_biases

  def __init_latent_vectors(self):
    u_factors = da.random.normal(0, 0.1, (self.n_users, self.n_factors), chunks=(self.chunk_size, self.n_factors))
    i_factors = da.random.normal(0, 0.1, (self.n_items, self.n_factors), chunks=(self.chunk_size, self.n_factors))
    return u_factors, i_factors

  def __get_training_errors(self, error):
    mae = da.sum(da.absolute(error)) / self.n_ratings
    mse = da.sum(error ** 2) / self.n_ratings
    rmse = da.sqrt(mse)
    return (mae, mse, rmse)

  def __plot_training_errors(self, errors):
    if not os.path.exists('res/'):
        os.mkdir('res/')

    mapped_errors = {
      "MAE": [],
      "MSE": [],
      "RMSE": [],
    }

    for error in errors:
      mapped_errors["MAE"].append(error[0])
      mapped_errors["MSE"].append(error[1])
      mapped_errors["RMSE"].append(error[2])

    sns.set_style("darkgrid")
    start_time = time.time()
    plt.figure()
    plt.subplots(figsize=(30, 5))
    for index, (error, error_values) in enumerate(mapped_errors.items()):
      self.__print_status(index + 1, len(mapped_errors), start_time, "Ploting training errors...")
      plt.subplot(130 + index + 1)
      plt.xlabel("Epoch", fontsize=24)
      plt.ylabel(error, fontsize=24)
      plt.plot(error_values)

    plt.savefig("res/{}-{}-{}.pdf".format(type(self).__name__, "training-errors", datetime.today().strftime('%Y-%m-%d-%H:%M:%S')))

    print()

  def __mae(self, a, b):
    return (np.abs(np.subtract(a, b))).mean()

  def __mse(self, a, b):
    return (np.square(np.subtract(a, b))).mean()

  def __rmse(self, a, b):
    return np.sqrt(((np.subtract(a, b))**2).mean())

  def __print_status(self, iter, max_iter, start_time, status, step=False):
    elapsed_time = time.time() - start_time
    bar_length = 70
    j = iter / max_iter
    sys.stdout.write('\r')
    if step:
      sys.stdout.write(f"[{'=' * int(bar_length * j):{bar_length}s}] {int(100 * j)}% Elapsed time: {round(elapsed_time, 3)} s - {status} ({iter}/{max_iter})")
    else:
      sys.stdout.write(f"[{'=' * int(bar_length * j):{bar_length}s}] {int(100 * j)}% Elapsed time: {round(elapsed_time, 3)} s - {status}")
    sys.stdout.flush()

  def fit(self,
          n_factors,
          train_df,
          chunk_size,
          epochs=50,
          lr=0.001,
          reg=0.001,
          collect_errors=False,
          plot_errors=False,
          user_col="customer_id",
          item_col="product_id",
          rating_col="star_rating",
  ):
    fit_start_time = time.time()
    df = self.__preprocess_data(train_df, user_col, item_col, rating_col, chunk_size, n_factors)
    x, x_mask = self.__create_sparse_chunked_matrix(df)
    u_biases, i_biases = self.__init_biases()
    u_factors, i_factors = self.__init_latent_vectors()

    start_time_epoch = time.time()
    train_errors = []
    for epoch in range(epochs):
      self.__print_status(epoch + 1, epochs, start_time_epoch, "Creating epochs", step=True)

      pred = self.mean_rating + u_biases + u_factors @ i_factors.T + i_biases
      error = x - pred * x_mask

      u_biases = u_biases + lr * da.sum(error - reg * u_biases, axis=1, keepdims=True)
      i_biases = i_biases + lr * da.sum(error - reg * i_biases, axis=0, keepdims=True)

      u_factors = u_factors + lr * (error @ i_factors - reg * u_factors)
      i_factors = i_factors + lr * ((u_factors.T @ error).T - reg * i_factors)

      if collect_errors:
        train_errors.append(self.__get_training_errors(error))

    print("\nComputing in parallel...")
    compute_start_time = time.time()
    self.u_biases, self.i_biases, self.u_factors, self.i_factors, self.train_errors = compute(
      u_biases,
      i_biases,
      u_factors,
      i_factors,
      train_errors
    )

    self.u_biases = self.u_biases.T
    compute_end_time = time.time()

    print("Compute parallel time: {} s".format(round(compute_end_time - compute_start_time, 3)))
    print("Compute parallel time per epoch: {} s".format(round((compute_end_time - compute_start_time) / epochs, 3)))
    print("Total fitting time: {} s".format(round(compute_end_time - fit_start_time, 3)))

    if collect_errors and plot_errors:
      self.__plot_training_errors(self.train_errors)

  def predict(self, test_df, user_col=None, item_col=None):
    if user_col is None: user_col = self.user_col
    if item_col is None: item_col = self.item_col

    predictions = []
    start_time = time.time()
    df = test_df[[user_col, item_col]].values
    df_len = len(df)

    for i in range(df_len):
      user, item = df[i][0], df[i][1]
      self.__print_status(i, df_len, start_time, "Predicting...")
      pred = self.mean_rating

      if user in self.u_mapping and item in self.i_mapping:
        u_id = self.u_mapping[user]
        i_id = self.i_mapping[item]

        pred += self.u_biases[0][u_id] + self.i_biases[0][i_id] + self.u_factors[u_id] @ self.i_factors[i_id]
        pred = min(max(self.min_rating, pred), self.max_rating)

      predictions.append(pred)
    print()

    return predictions

  def eval(self, ground_truths, predictions):
    mae = self.__mae(ground_truths, predictions)
    mse = self.__mse(ground_truths, predictions)
    rmse = self.__rmse(ground_truths, predictions)
    return mae, mse, rmse

**Training the Model**

In [None]:
from dask.distributed import Client
import pandas as pd

train = local_data.sample(frac=0.7)
test = local_data.drop(train.index.tolist())

client = Client(n_workers=2)
model = FunkSVD(client)
model.fit(
        n_factors=30,
        train_df=train,
        epochs=10,
        chunk_size=5000,
        collect_errors=True,
        plot_errors=False
    )
predictions = model.predict(test)

**Plots for MAE,MSE AND RMSE**

In [85]:
gt = test["star_rating"].to_numpy()

eval = model.eval(gt, predictions)
print(eval)



(0.9880305546005825, 1.5913406404976782, 1.2614835078183457)


In [173]:
print(f"MAE: {eval[0]}")
print(f"MSE: {eval[1]}")
print(f"RMSE: {eval[2]}")

MAE: 0.9880305546005825
MSE: 1.5913406404976782
RMSE: 1.2614835078183457


In [86]:
client.shutdown()

INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:44097'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:37565'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:52042; closing.
INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:46535', name: 0, status: closing, memory: 0, processing: 0> (stimulus_id='handle-worker-cleanup-1714888584.7778902')
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:52030; closing.
INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:36907', name: 1, status: closing, memory: 0, processing: 0> (stimulus_id='handle-worker-cleanup-1714888584.784753')
INFO:distributed.scheduler:Lost all workers
INFO:distributed.scheduler:Scheduler closing due to unknown reason...
INFO:distributed.scheduler:Sc

# **Product Recommendation**

In [202]:
def recommend_products(model, user_id, N=10):
    # Check if user_id exists in the mapping
    if user_id not in model.u_mapping:
        return "User ID does not exist in the training data."

    user_index = model.u_mapping[user_id]

    # Compute scores using dot product of user factors with all item factors
    user_factors = model.u_factors[user_index]
    scores = user_factors @ model.i_factors.T + model.i_biases[0]

    # Get the top N items with the highest scores
    top_items_indices = np.argsort(-scores)[:N]  # Get indices of top scores

    # Map back to item IDs
    top_items_ids = [model.i_ids[idx] for idx in top_items_indices]

    return top_items_ids


In [226]:
# Example usage
user_id = 41959978 # example user ID from your dataset
top_products = recommend_products(model, user_id, N=20)
print("Top 20 recommended products for user {}: {}".format(user_id, top_products))

Top 20 recommended products for user 41959978: ['B00HES9CMS', 'B0006VJ6TO', 'B00P6TUO5G', 'B003L1ZYYM', 'B011HWM106', 'B001QAZARG', 'B0012P30OK', 'B00JHLJSME', 'B00CBTCDEE', 'B0016BFR4G', 'B001CCAISE', 'B0046I3I8I', 'B00OYRW4UE', 'B00OTXUNYW', 'B003FXXOK2', 'B004UGMW1K', 'B004O25RJ4', 'B00D1KBG5Y', 'B00VQLAT2Q', 'B0050R67U0']


In [227]:
# Sample recommended product IDs
recommended_product_ids = top_products

# Convert to DataFrame for easier merging
recommended_df = pd.DataFrame(recommended_product_ids, columns=['product_id'])


In [228]:
results = pd.merge(recommended_df, local_data[['product_id', 'product_title']], on='product_id', how='left').drop_duplicates()
results.head(10)

Unnamed: 0,product_id,product_title
0,B00HES9CMS,Viva Naturals #1 Best Selling Certified Organi...
849,B0006VJ6TO,Body Back Company’s Body Back Buddy Trigger Po...
1471,B00P6TUO5G,"Viva Naturals Organic Non-GMO Cacao Powder, 2 ..."
1618,B003L1ZYYM,AmazonBasics High-Speed HDMI Cable - 6.5 Feet ...
1714,B011HWM106,Women's Vogue Shoulder Off Wide Hem Design Top...
1715,B001QAZARG,B Is For Bob [ECOPAK]
1716,B0012P30OK,Fisher-Price Rock-A-Stack - Pink
1717,B00JHLJSME,Adidas Chelsea Home Youth Jersey [CHEBLU/CORBL...
1718,B00CBTCDEE,Scarlet Red Satin Sash Set
1720,B0016BFR4G,Uncle Lee's Organic Green Tea -- 100 Tea Bags ...


# **Sentiment Analysis for Recommendations**

In [53]:
!pip install pandas pyarrow




In [235]:
import pandas as pd

# Replace 'path_to_file' with the actual path within your Google Drive
file_path = '/content/drive/My Drive/Big Data Project/senti.parquet'

# Read the Parquet file
df_sentiment= pd.read_parquet(file_path)

# Display the first few rows of the DataFrame
df_sentiment.head(10)


Unnamed: 0,customer_id,product_id,product_parent,product_title,product_category,star_rating,full_text,language,sentiment
0,32035145,B00CO2UY6C,546440076,Safavieh Lyndhurst Collection LNH214A Traditio...,Furniture,5.0,Five Stars. Perfect!,en,4
1,9213870,B00WE2W2A8,319585048,New Wayzon Mini Clip Metal Screen MP3 Music Me...,Electronics,1.0,This item just last for oneday.. This item jus...,en,0
2,12190192,B00NEZ6OW6,721363676,Polaroid Cube HD 1080p Lifestyle Action Video ...,Camera,2.0,5 minutes videos Max.. Although the camera is ...,en,1
3,4176674,B00GJQN89O,934898207,2 PINK Droplet Latex Free Blender Sponges Liqu...,Beauty,1.0,One Star. So disappointed on them😭,en,0
4,22633251,B00AYZB6Z4,551463410,"40 Inch LED Light Bar DR 14,400 Lumens",Automotive,3.0,"deal for the money, but you'll have do some wo...",en,2
5,5235011,B0013RD4FC,158658119,MTM Predator Shooting Rest,Sports,5.0,Great buy. Outstanding value.,en,4
6,14455407,B00RC7GPS0,994525813,GuruNanda Ghee - Clarified butter- 14 Oz (Pack...,Grocery,5.0,Good Ghee!. Wonderful! It's so nice not to ha...,en,4
7,50881246,1591161304,931591751,"Ranma 1/2, Vol. 8",Books,4.0,The cute fiancee.... Ever since arriving at To...,en,3
8,51159438,B00DW475M2,399347886,Zeimax Q38i Super Bass Inear Headphone Earphon...,Electronics,1.0,"Not only did they sound terrible, but they are...",en,0
9,43207259,B0001XARA4,951469761,Motown #1's,Music,5.0,Five Stars. I was very happy with my purchase ...,en,4


In [220]:
# Group by 'product_id' and calculate the mean sentiment
average_sentiment = df_sentiment.groupby('product_id')['sentiment'].mean().reset_index()

# Rename columns for clarity
average_sentiment.columns = ['product_id', 'avg_sentiment']

# Display the resulting DataFrame
average_sentiment.head()


Unnamed: 0,product_id,avg_sentiment
0,2214407,3.0
1,2231344,3.0
2,5995043,3.0
3,6379702,3.0
4,6483895,3.0


In [233]:
merged_df = pd.merge(results, average_sentiment, on='product_id', how='left')

# Filter out products where the average sentiment is greater than 3.5
filtered_recommendations = merged_df[merged_df['avg_sentiment'] >=3.5]




In [234]:
# Display the filtered recommendations
print(f"User {user_id} may like the following products as well:")

# Initialize a counter starting from 1
counter = 1

# Loop through each row in the DataFrame and print the product details
for _, row in filtered_recommendations.iterrows():
    print(f"{counter}. {row['product_title']} ")
    counter += 1  # Increment the counter for each product


User 41959978 may like the following products as well:
1. Viva Naturals #1 Best Selling Certified Organic Cacao Powder from Superior Criollo Beans, 1 LB Bag 
2. Body Back Company’s Body Back Buddy Trigger Point Therapy Self Massage Tool - PARENT 
3. Viva Naturals Organic Non-GMO Cacao Powder, 2 Pound Bag 
4. AmazonBasics High-Speed HDMI Cable - 6.5 Feet (2 Meters) Supports Ethernet, 3D, 4K and Audio Return 
5. Women's Vogue Shoulder Off Wide Hem Design Top Shirt 
6. B Is For Bob [ECOPAK] 
7. Fisher-Price Rock-A-Stack - Pink 
8. Adidas Chelsea Home Youth Jersey [CHEBLU/CORBLU/WHITE] (S) 
9. Scarlet Red Satin Sash Set 
10. Uncle Lee's Organic Green Tea -- 100 Tea Bags net wt 5.64 oz (160g) 
11. Canon Speedlite 430EX II Flash for Canon Digital SLR Cameras 
12. Elite Sportz Exercise Sliders are Double Sided and Work Smoothly on Any Surface. Wide Variety of Low Impact Exercise’s You Can Do. Full Body Workout, Compact for Travel or Home Ab Workout 
13. Britax Frontier G1.1 Clicktight Harness

In [236]:
!apt-get update
!apt-get install texlive texlive-xetex texlive-latex-extra pandoc
!pip install pypandoc

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.125.190.39)] [1 InRele0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.125.190.39)] [Connecti                                                                                                    Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [830 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadconten

In [239]:
!jupyter nbconvert --output-dir='/content' --to latex  '/content/drive/My Drive/Colab Notebooks/Final_Recommender_System_SVD_MAIN.ipynb'

[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/Final_Recommender_System_SVD_MAIN.ipynb to latex
[NbConvertApp] Support files will be in Final_Recommender_System_SVD_MAIN_files/
[NbConvertApp] Making directory /content/Final_Recommender_System_SVD_MAIN_files
[NbConvertApp] Writing 157230 bytes to /content/Final_Recommender_System_SVD_MAIN.tex


In [240]:
!buf_size=1000000 xelatex --interaction=nonstopmode 'Final_Recommender_System_SVD_MAIN.tex'

This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2022/dev/Debian) (preloaded format=xelatex)
 restricted \write18 enabled.
entering extended mode
(./Final_Recommender_System_SVD_MAIN.tex
LaTeX2e <2021-11-15> patch level 1
L3 programming layer <2022-01-21>
(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
(/usr/share/texlive/texmf-dist/tex/latex/base/size11.clo))
(/usr/share/texlive/texmf-dist/tex/latex/tcolorbox/tcolorbox.sty
(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty
(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty
(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex
(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-common-lists.t
ex)) (/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def
) (/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex
(/usr/share/texlive/texmf-dist/tex/generi