In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.special import softmax
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, dcg_score, recall_score
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud

# Load the Amazon product reviews dataset


In [None]:
reviews_df = pd.read_json('reviews.json', lines=True)
qa_df = pd.read_json('qa.json', lines=True)

# Combine the two datasets based on 'asin' (product ID).
# Joining the raw Q&A table would repeat every review once per question asked
# about its product. Only the first Q&A row per review survives the
# review-level de-duplication performed in the preprocessing step, so reduce
# the Q&A table to one row per product up front and let `validate` enforce it.
qa_per_product = qa_df.drop_duplicates(subset='asin', keep='first')
merged_df = pd.merge(
    reviews_df,
    qa_per_product,
    on='asin',
    how='left',
    validate='many_to_one',
)

# Data Preprocessing


In [None]:
# Drop duplicates to avoid multiple reviews from the same user for the same product.
# Re-assigning (instead of inplace=True) keeps this cell safe to re-run.
merged_df = merged_df.drop_duplicates(
    subset=['reviewerID', 'asin', 'reviewTime', 'reviewText']
).copy()

# Fill missing values with empty string, but only in text (object) columns:
# writing '' into numeric columns would turn them into object dtype and break
# the rating-based steps and the correlation heatmap further down.
text_columns = merged_df.select_dtypes(include='object').columns
merged_df = merged_df.fillna({column: '' for column in text_columns})

# Shared resources used by text_preprocessing().
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def text_preprocessing(text):
    """Normalise a review string into a space-separated sequence of stems.

    The text is lower-cased, HTML-like tags and digit runs are removed,
    punctuation is replaced by spaces, the result is tokenised, stop words
    (module-level ``stop_words``) are dropped and the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # (pattern, replacement) pairs are applied in this exact order.
    substitutions = (
        (r'<[^>]+>', ''),
        (r'\d+', ''),
        (r'[^\w\s]', ' '),
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    stems = [
        stemmer.stem(token)
        for token in word_tokenize(normalized)
        if token not in stop_words
    ]
    return ' '.join(stems)

# Run every review body through text_preprocessing and keep the result alongside the raw text.
raw_reviews = merged_df['reviewText']
merged_df['CleanedText'] = raw_reviews.map(text_preprocessing)

#Exploratory Data Analysis (EDA)

In [None]:
# Distribution of product ratings
# The column is passed as `x=`: the first positional argument of countplot is
# `data` in current seaborn releases, not the variable to count.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=merged_df['overall'], ax=ax)
ax.set(title='Distribution of Product Ratings', xlabel='Ratings', ylabel='Count')
plt.show()

# Distribution of review lengths (characters of the cleaned text)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(merged_df['CleanedText'].str.len(), bins=30, ax=ax)
ax.set(title='Distribution of Review Lengths', xlabel='Review Length', ylabel='Count')
plt.show()

# Correlation heatmap
# Only numeric columns can be correlated; calling .corr() on the full frame
# raises on the text columns with pandas >= 2.0.
numeric_columns = merged_df.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


#Sentiment Analysis using RoBERTa Model

In [None]:
# Checkpoint name shared by the tokenizer and the classification head.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Tokenizer is loaded first, then the sequence-classification model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL),
    AutoModelForSequenceClassification.from_pretrained(MODEL),
)

# Function to get sentiment scores using the RoBERTa model
def get_sentiment_scores(text, max_length=512):
    """Score one text with the module-level RoBERTa sentiment model.

    Args:
        text: The text to classify.
        max_length: Maximum number of tokens fed to the model. It is passed
            explicitly because ``truncation=True`` alone only truncates to the
            limit declared by the tokenizer; if the tokenizer declares none,
            long reviews reach the model untruncated and raise a
            ``RuntimeError``.

    Returns:
        dict with the keys ``roberta_neg``, ``roberta_neu`` and
        ``roberta_pos`` holding the softmax of the first three logits.
    """
    encoded_text = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # A single text is encoded, so only the first row of logits is read.
    scores = softmax(output.logits[0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

# Compute sentiment scores for each review and store them in a DataFrame.
# Scores are keyed by the row label, not by reviewerID: a reviewer can have
# several reviews, and keying by reviewer would keep only the last score and
# then attach it to every one of that reviewer's rows.
SENTIMENT_COLUMNS = ['roberta_neg', 'roberta_neu', 'roberta_pos']

# Guarantee unique row labels, and drop scores left over from a previous run
# of this cell so the join below never sees overlapping column names.
merged_df = merged_df.reset_index(drop=True).drop(columns=SENTIMENT_COLUMNS, errors='ignore')

res = {}
for row_label, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
    # Read the id before the try block so the error message can always use it.
    myid = row['reviewerID']
    try:
        res[row_label] = get_sentiment_scores(row['CleanedText'])
    except RuntimeError:
        # Best effort: rows that fail keep NaN sentiment after the join.
        print(f'Broke for id {myid}')

# One row of scores per successfully processed review, aligned on the row label.
results_df = pd.DataFrame.from_dict(res, orient='index')
merged_df = merged_df.join(results_df)

#Collaborative Filtering using SVD

In [None]:
# Keep only relevant columns for collaborative filtering
collab_df = merged_df[['reviewerID', 'asin', 'overall']].copy()

# Surprise needs numeric ratings; coerce and drop anything that is not a number
# (for example values that were blanked out during cleaning).
collab_df['overall'] = pd.to_numeric(collab_df['overall'], errors='coerce')
collab_df = collab_df.dropna(subset=['overall'])

# Define the rating scale (useful for Surprise library)
reader = Reader(rating_scale=(1, 5))

# Create the Surprise dataset
data = Dataset.load_from_df(collab_df, reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build the collaborative filtering model using SVD (Singular Value Decomposition).
# The seed makes the factor initialisation reproducible, matching the seeded split.
collab_model = SVD(random_state=42)

# Train the collaborative filtering model on the training set
collab_model.fit(trainset)

# Make predictions on the test set
predictions = collab_model.test(testset)

#Content-Based Filtering using NLP-based Recommendations

In [None]:
nlp = spacy.load('en_core_web_lg')

# `product_data` has no definition anywhere in this notebook, so it is built
# here from the merged review / Q&A frame, which carries every column the
# content-based steps read (asin, summary, reviewText, question, answer).
# NOTE(review): confirm that merged_df is the intended source.
product_data = merged_df.copy()

# Preprocess product descriptions and question-answer data.
# Missing pieces are treated as empty strings so one absent field does not
# turn the whole concatenation into NaN.
text_parts = [
    product_data[column].fillna('').astype(str)
    for column in ('summary', 'reviewText', 'question', 'answer')
]
product_data['text'] = (
    text_parts[0] + ' ' + text_parts[1] + ' ' + text_parts[2] + ' ' + text_parts[3]
).str.strip()

# Rows with no text at all cannot be compared; drop them.
product_data = product_data[product_data['text'] != ''].reset_index(drop=True)

# spaCy Doc per row; get_similar_products_spacy reads this 'text_vector' column.
product_data['text_vector'] = list(nlp.pipe(product_data['text']))

# Function to get top N similar products for a given product based on spaCy word vectors
def get_similar_products_spacy(product_id, N=5):
    """Return the N rows of ``product_data`` most similar to ``product_id``.

    Similarity is computed by calling ``.similarity`` on the objects stored in
    the ``text_vector`` column, using the first row whose ``asin`` equals
    ``product_id`` as the reference.

    Args:
        product_id: ``asin`` of the reference product.
        N: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns of ``product_data`` plus
        ``similarity_score``, sorted by descending score. Rows of the
        reference product itself are excluded (they would always rank first).
        An empty frame is returned when ``product_id`` is unknown.
    """
    is_query = product_data['asin'] == product_id
    if not is_query.any():
        # Unknown product: empty result instead of an IndexError.
        return product_data.head(0).assign(similarity_score=pd.Series(dtype=float))

    query_vector = product_data.loc[is_query, 'text_vector'].iloc[0]
    scores = product_data['text_vector'].apply(lambda vec: vec.similarity(query_vector))

    # Work on a derived frame so repeated calls do not rewrite product_data.
    others = product_data.loc[~is_query].assign(
        similarity_score=scores[~is_query].to_numpy()
    )
    return others.sort_values(by='similarity_score', ascending=False).head(N)


#ChatGPT-based Recommendations using OpenAI API

In [None]:
def chatgpt_recommendations(prompt):
    """Ask the OpenAI completion endpoint for recommendation texts.

    The API key is read from the ``OPENAI_API_KEY`` environment variable when
    it is set; otherwise whatever key is already configured on the ``openai``
    module is used. No key is hard-coded in the notebook.

    NOTE(review): ``openai.Completion`` and ``text-davinci-002`` belong to the
    legacy OpenAI SDK / model line-up; confirm the installed SDK still
    provides them.

    Args:
        prompt: The prompt sent to the model.

    Returns:
        list[str]: The non-empty completion texts, stripped of surrounding
        whitespace, in the order returned by the API.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key

    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=150,
        temperature=0.7,
        n=10,  # Number of recommendations to generate
        stop=["\n"]
    )
    # Each element of response['choices'] already is one completion; its text
    # lives directly under 'text'.
    texts = (choice['text'].strip() for choice in response['choices'])
    return [text for text in texts if text]

#Advanced Recommender System combining collaborative filtering, NLP-based recommendations, and ChatGPT-based recommendations

In [None]:
def advanced_recommender(user_id, user_query):
    # Collaborative Filtering Recommendations
    user_products = merged_df[merged_df['reviewerID'] == user_id]['asin'].unique()
    all_products = merged_df['asin'].unique()
    products_to_predict = [product for product in all_products if product not in user_products]
    user_predictions = [(user_id, product, collab_model.predict(user_id, product).est) for product in products_to_predict]
    user_predictions.sort(key=lambda x: x[2], reverse=True)
    top_collab_recommendations = [pid for uid, pid, score in user_predictions[:10]]

    # NLP-based Recommendations
    nlp_recommendations = nlp_based_recommendations(user_query)

    # ChatGPT-based Recommendations
    chatgpt_recommendations = chatgpt_recommendations(user_query)

    # Weight the recommendations based on confidence level
    weighted_recommendations = [(rec, 0.7) for rec in top_collab_recommendations] + [(rec, 0.8) for rec in nlp_recommendations] + [(rec, 1.0) for rec in chatgpt_recommendations]

    # Sort the recommendations based on confidence level and get the top 10
    weighted_recommendations.sort(key=lambda x: x[1], reverse=True)
    all_recommendations = [rec for rec, _ in weighted_recommendations[:10]]

    return all_recommendations

#User Interaction with User Queries

In [None]:
def enhanced_user_interaction():
    """Prompt for a user id and a query, then print the recommendations.

    Recommendations whose id is found in ``product_data['asin']`` are printed
    with the first matching ``summary``. Anything else (for example free-text
    suggestions) is printed as-is instead of raising ``IndexError`` on the
    missing catalogue row.
    """
    print("Welcome to the Enhanced Product Recommender System!")
    user_id = input("Please enter your user ID: ").strip()
    user_query = input("Please enter your query: ").strip()

    recommended_products = advanced_recommender(user_id, user_query)

    print("\nRecommended Products:")
    for idx, pid in enumerate(recommended_products, 1):
        names = product_data.loc[product_data['asin'] == pid, 'summary']
        if names.empty:
            # Not a catalogue id: show the recommendation text itself.
            print(f"{idx}. Suggestion: {pid}")
        else:
            print(f"{idx}. Product ID: {pid}, Product Name: {names.iloc[0]}")

#Evaluation Metrics

In [None]:
    def evaluate_recommender(predictions, threshold=3.5):
        """Compute ranking / classification metrics for rating predictions.

        Args:
            predictions: Iterable of objects exposing ``r_ui`` (true rating)
                and ``est`` (predicted rating).
            threshold: Ratings strictly above this value count as relevant.
                The same threshold is used for the average-precision labels
                and for recall.

        Returns:
            tuple: ``(map_score, dcg_value, recall)`` where ``map_score`` is
            the average precision over the pooled predictions, ``dcg_value``
            the discounted cumulative gain of the predicted ordering and
            ``recall`` the recall of the thresholded predictions.
        """
        # Ground truth ratings for the test set
        true_ratings = np.array([pred.r_ui for pred in predictions])

        # Predicted ratings by the recommender system
        predicted_ratings = np.array([pred.est for pred in predictions])

        true_labels = (true_ratings > threshold).astype(int)
        predicted_labels = (predicted_ratings > threshold).astype(int)

        # Mean Average Precision (MAP)
        map_score = average_precision_score(true_labels, predicted_ratings)

        # Discounted Cumulative Gain (DCG)
        # Stored as `dcg_value`: assigning to `dcg_score` would make that name
        # local and the call to the metric function would raise
        # UnboundLocalError.
        dcg_value = dcg_score([true_ratings], [predicted_ratings])

        # Recall
        recall = recall_score(true_labels, predicted_labels)

        return map_score, dcg_value, recall

    # Evaluate the recommender system.
    # The DCG result is stored as `dcg_value`: unpacking into `dcg_score`
    # would overwrite the metric function of that name at module level and
    # break any later evaluation.
    map_score, dcg_value, recall = evaluate_recommender(predictions)
    print(f"Mean Average Precision (MAP): {map_score:.4f}")
    print(f"Discounted Cumulative Gain (DCG): {dcg_value:.4f}")
    print(f"Recall: {recall:.4f}")

