# Work Done by: [Nicholas Tan Qin Sheng] and [Muhammad Hafiz Bin Abdul Halim]

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, ndcg_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [18]:
# Download required NLTK data (if you haven't already)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# 1. Load and Prepare Data
def load_and_prepare_data(filepath, sample_size=None):
    """Loads JSONL data, preprocesses review text, and adds a sample size for testing."""
    df = pd.read_json(filepath, lines=True)

    if sample_size:
        df = df.sample(sample_size, random_state=42) # For reproducibility
    df = df[['asin', 'text', 'rating']] # Select relevant columns
    df.rename(columns={'text': 'review_text'}, inplace=True)  # Rename columns
    df.dropna(subset=['review_text'], inplace=True) # Drop rows with NaN review text
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HexMa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HexMa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HexMa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# 2. Preprocess Review Text
def preprocess_text(text):
    """Cleans and lemmatizes the review text."""
    text = re.sub(r'[^\w\s]', '', text, re.UNICODE) # Remove punctuation
    text = text.lower() # Lowercase
    stop_words = set(stopwords.words('english')) # Stop words
    text = [w for w in text.split() if not w in stop_words]
    lemmatizer = WordNetLemmatizer() # Lemmatize
    text = [lemmatizer.lemmatize(token) for token in text]
    text = " ".join(text)
    return text

In [21]:
# 3. Feature Engineering
def feature_engineering(df):
    """Creates TF-IDF vectors from review text and adds a simple sentiment score."""
    df['cleaned_review_text'] = df['review_text'].apply(preprocess_text) # Clean text

    vectorizer = TfidfVectorizer(max_features=5000)  # Limit vocabulary size
    tfidf_matrix = vectorizer.fit_transform(df['cleaned_review_text'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=df.index)  # TF-IDF to DataFrame

    df['sentiment'] = df['rating'].apply(lambda x: 1 if x > 3 else (-1 if x < 3 else 0))  # Simplified sentiment
    return df, tfidf_df

In [8]:
# 4. Build Content-Based Recommendation System
def create_recommendation_system(df, tfidf_df):
    """Calculates cosine similarity and recommends products based on content."""
    cosine_sim = cosine_similarity(tfidf_df, tfidf_df)
    return cosine_sim

def recommend_products(asin, df, cosine_sim, top_n=5):
    """Recommends similar products based on a given ASIN."""
    try:
        idx = df[df['asin'] == asin].index[0] # Get index of product
    except IndexError:
        return f"Product '{asin}' not found."
    sim_scores = list(enumerate(cosine_sim[idx])) # Similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # Sort
    sim_scores = sim_scores[1:top_n+1] # Get top N similar
    product_indices = [i[0] for i in sim_scores] # Product indices
    return df['asin'].iloc[product_indices].tolist() # Return ASINs

In [None]:
# 5. Evaluate the Recommendation System (Simplified)
def evaluate_recommendation_system(df, cosine_sim, test_size=0.2):
  """Splits data, makes recommendations, and evaluates using basic metrics."""
  train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

  def get_relevant_asins(asin, train_df, cosine_sim, top_n=5):
    """Gets a list of relevant ASINs for a given ASIN."""
    try:
        idx = train_df[train_df['asin'] == asin].index[0]
    except IndexError:
        return []
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    product_indices = [i[0] for i in sim_scores]
    return train_df['asin'].iloc[product_indices].tolist()

In [None]:
# 5. Evaluate the Recommendation System (Simplified)
def evaluate_recommendation_system(df, cosine_sim, test_size=0.2):
  """Splits data, makes recommendations, and evaluates using basic metrics."""
  train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

  def get_relevant_asins(asin, df, train_df, cosine_sim, top_n=5):
    """Gets a list of relevant ASINs for a given ASIN."""
    try:
        idx = df[df['asin'] == asin].index[0]
    except IndexError:
        return []
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Filter similar products to only those in the training set
    recommended_asins = []
    for i, _ in sim_scores[1:]:
        asin_i = df['asin'].iloc[i]
        if asin_i in train_df['asin'].values:
            recommended_asins.append(asin_i)
        if len(recommended_asins) >= top_n:
            break

    return recommended_asins

  # Make recommendations for test set
  recommendations[asin] = get_relevant_asins(asin, df, train_df, cosine_sim)

  # Evaluate using simple metrics
  precision_list = []
  recall_list = []

  for asin, recommended_asins in recommendations.items():
    # Try to find the rating in train_df
    try:
        test_index = test_df[test_df['asin'] == asin].index[0]
        rating_value = test_df['rating'].iloc[test_index]
    except IndexError:
        rating_value = 0
    # Look if similar products in recommendations can also be found on rating
    asin_with_rating = train_df[train_df['rating'] == rating_value]['asin'].tolist()

    relevant_count = len(set(recommended_asins) & set(asin_with_rating)) # Find with recommendation and rating
    if len(recommended_asins) > 0:
        precision_list.append(relevant_count / len(recommended_asins))
    else:
        precision_list.append(0.0)  # Avoid division by zero

    if len(asin_with_rating) > 0:
        recall_list.append(relevant_count / len(asin_with_rating))
    else:
        recall_list.append(0.0)  # Avoid division by zero

  # Handle empty lists
  precision = sum(precision_list) / len(precision_list) if precision_list else 0.0
  recall = sum(recall_list) / len(recall_list) if recall_list else 0.0

  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")

In [None]:
filepath = 'data/Video_Games_with_sentiment.jsonl'  # Replace with your actual file path
sample_size = 5000  # Adjust for testing (remove for full dataset)

# Load, prepare, and engineer features
df = load_and_prepare_data(filepath, sample_size)


KeyError: 'text'

In [None]:
df, tfidf_df = feature_engineering(df)

IndexError: index 3695993 is out of bounds for axis 0 with size 5000

In [27]:
# Create and evaluate the recommendation system
cosine_sim = create_recommendation_system(df, tfidf_df)
evaluate_recommendation_system(df, cosine_sim)

# Make recommendations for a specific product
asin_to_recommend = 'B00005B158'  # Example ASIN (replace with a real one)
recommendations = recommend_products(asin_to_recommend, df, cosine_sim)

if isinstance(recommendations, str):  # Error message
    print(recommendations)
else:
    print(f"Recommended products for {asin_to_recommend}: {recommendations}")

TypeError: evaluate_recommendation_system.<locals>.get_relevant_asins() missing 1 required positional argument: 'cosine_sim'