# AI Analyzer of Customer Reviews

This example illustrates the core functionality of analyzing customer reviews with AI and generating a response when one is needed.

This example illustrates the following:
- Determine the sentiment of a review (positive, neutral, negative).
- For negative reviews, extract information from the review to classify the cause (common themes and keywords).
- Identify whether a response is required back to the customer.
- Generate a response mentioning alternative products that may satisfy the customer.
- Control the accuracy of sentiment analysis. Target >90%.
- Visualize insights.



In [None]:
# Importing the libraries
!pip install pandas openai matplotlib wordcloud

In [None]:
# Import necessary libraries
import pandas as pd
import gzip
import json
import requests
from wordcloud import WordCloud
from openai import OpenAI
import matplotlib.pyplot as plt
import os

# Constants
SAMPLE_SIZE = 2 # Number of products (parent_asin) to sample
MIN_NUM_OF_REVIEWS = 10 # minimum number of reviews for a product to be considered

# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (os.environ.get returns None when the variable is not set)
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Remote URLs of the gzipped JSON Lines files (reviews and product metadata)
# and the local paths they are stored under
review_dataset_url = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/Amazon_Fashion.jsonl.gz"
metadata_dataset_url = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Amazon_Fashion.jsonl.gz"
review_local_file_path = "Amazon_Fashion.jsonl.gz"
metadata_local_file_path = "meta_Amazon_Fashion.jsonl.gz"

# Section banner for the download/load step
print('----------------------------------------------------')
print('Download and Load Data')
print('----------------------------------------------------')

def download_dataset(url, local_file):
    """Download ``url`` to ``local_file``, streaming the body to disk.

    The data is first written to ``<local_file>.part`` and only renamed to
    ``local_file`` once the transfer has finished, so an interrupted or
    failed download never leaves a truncated file behind that a later run
    would mistake for a complete one.

    Args:
        url: HTTP(S) address of the file to fetch.
        local_file: Destination path on the local file system.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    print("Downloading dataset...")
    partial_file = f"{local_file}.part"
    try:
        # The context manager releases the connection even if writing fails;
        # the timeout is (connect, read) seconds so a stalled server cannot
        # hang the notebook forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Without this check an HTML error page would be saved as the dataset.
            response.raise_for_status()
            with open(partial_file, 'wb') as f:
                # 1 MiB chunks: far fewer Python-level writes than 1 KiB chunks.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Publish the file only after the whole body has been written.
        os.replace(partial_file, local_file)
    finally:
        # After a successful rename the partial file no longer exists;
        # anything still present here is the residue of a failed download.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print("Download complete.")

def load_data(file_path):
    """Load a gzip-compressed JSON Lines file into a DataFrame.

    Args:
        file_path: Path to a gzip file holding one JSON document per line.

    Returns:
        A pandas DataFrame built from the decoded lines, one row per
        non-blank line.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("Loading the dataset...")
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. extra newlines at the end of
        # the file) carry no record and would make json.loads raise.
        data = [json.loads(line) for line in f if line.strip()]
    print("Loading complete.")
    return pd.DataFrame(data)

# Fetch the reviews archive unless a local copy is already present
if os.path.exists(review_local_file_path):
    print("Review file already exists. Skipping download.")
else:
    download_dataset(review_dataset_url, review_local_file_path)

# Parse the reviews archive into a DataFrame
df = load_data(review_local_file_path)
print(f"Loaded reviews dataset with {len(df)} rows.")


# Fetch the metadata archive unless a local copy is already present
if os.path.exists(metadata_local_file_path):
    print("Metadata file already exists. Skipping download.")
else:
    download_dataset(metadata_dataset_url, metadata_local_file_path)

# Parse the metadata archive into a DataFrame
df_metadata = load_data(metadata_local_file_path)
print(f"Loaded metadata dataset with {len(df_metadata)} rows.")


In [None]:
print('----------------------------------------------------')
print('Prepare sampled dataset')
print('----------------------------------------------------')

# Combine 'rating', 'title' and 'text' into a single 'review' string per row
# so the whole review can be sent to the model as one piece of text
df.loc[:, 'review'] = df.apply(lambda row: f"Rating: {row['rating']}; Title: {row['title']}; Text: {row['text']}", axis=1)

# Keep only the combined review text and the product key used for the join
# (the review's own 'title' column is dropped here)
df1 = df[['review', 'parent_asin']]

# Keep only the metadata columns we need; from here on 'title' is the one
# coming from the metadata dataset
df_metadata = df_metadata[['parent_asin', 'store', 'title']].drop_duplicates()

# Attach the metadata to every review; the inner join drops reviews whose
# parent_asin has no metadata row
df1 = df1.merge(df_metadata, on='parent_asin', how='inner')

# Count the reviews per product (parent_asin)
selected_asins = df1.groupby('parent_asin').size().reset_index(name='review_qty')

# Keep products with at least MIN_NUM_OF_REVIEWS reviews. The constant is a
# minimum, so the comparison is inclusive (">" silently excluded products
# with exactly MIN_NUM_OF_REVIEWS reviews).
eligible_asins = selected_asins[selected_asins['review_qty'] >= MIN_NUM_OF_REVIEWS]

# Sample the products; never ask for more than are eligible, which would
# make DataFrame.sample raise a ValueError
selected_asins = eligible_asins.sample(n=min(SAMPLE_SIZE, len(eligible_asins)), random_state=1)

# Keep only the reviews of the sampled products and the columns used later.
# .copy() makes df_sampled an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df_sampled = df1.loc[
    df1['parent_asin'].isin(selected_asins['parent_asin']),
    ['parent_asin', 'store', 'review', 'title'],
].copy()
display(df_sampled.head())


In [None]:
print('----------------------------------------------------')
print("Analyze sentiment using OpenAI API...")
print('----------------------------------------------------')

# Create the OpenAI client, authenticating with the key held in openai_api_key
client = OpenAI(api_key=openai_api_key)

def analyze_sentiment(text):
    """Ask the OpenAI chat completions API to analyze one customer review.

    The model is asked for the sentiment, a list of theme keywords and, for
    negative reviews, a reply to the customer, all in one JSON object.

    Args:
        text: The review text to analyze.

    Returns:
        The model's reply as a stripped string (requested to be a JSON
        object with the keys ``sentiment``, ``theme`` and ``response``), or
        an empty string when the API returns no message content.
    """
    # NOTE: this is a plain string, not a %-format template, so the percent
    # sign must not be doubled ("5%%" would be sent to the model literally).
    prompt = '''
    Analyze the customer review based on the following three criteria: 
    - sentiment: could be 'Positive', 'Neutral', or 'Negative'.
    - theme: a list of generalized key words from the review.
    - response: only for negative reviews write a response to the customer. Offer free shipping as needed. For extreme cases offer 5% discount coupon for the next purchase in the store.

    Write output as a JSON formatted string with the keys "sentiment", "theme" and "response".

    User Review: 
    '''
    messages = [
        {"role": "system", "content": "You are a helpful assistant in the fashion online store."},
        {"role": "user", "content": f"{prompt} \"{text}\""}
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # "gpt-3o-mini" is not an existing model id
        messages=messages,
        # Room for the classification plus a full customer response; a limit
        # of 100 tokens cuts the JSON off mid-string for negative reviews,
        # which then fails to parse.
        max_tokens=500,
        temperature=0,  # Make the response as deterministic as possible
        response_format={ "type": "json_object" }
    )
    # Progress indicator: one dot per analyzed review
    print('.', end='')
    content = response.choices[0].message.content
    # content can be None (e.g. when the model refuses to answer)
    return content.strip() if content else ""
    

# Run the sentiment analysis on every review in df_sampled
print(f"Analyzing sentiment for {SAMPLE_SIZE} items...")

# One analyze_sentiment call per review; the result holds the raw reply strings
ai_response = df_sampled['review'].apply(analyze_sentiment)

print("\nSentiment analysis complete.")


def safe_json_loads(x):
    """Parse a JSON object, falling back to an empty dict when that fails.

    Args:
        x: JSON text (``str``, ``bytes`` or ``bytearray``). Any other value,
            such as ``None``, is treated as unparsable.

    Returns:
        The decoded ``dict``, or ``{}`` when ``x`` is not valid JSON or the
        decoded value is not a JSON object, so callers can always use
        ``.get`` on the result.
    """
    try:
        parsed = json.loads(x)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (its subclass) and undecodable
        # bytes; TypeError covers non-text input such as None or NaN.
        return {}
    return parsed if isinstance(parsed, dict) else {}

# Decode every raw reply once, then copy each field into its own column;
# a field missing from the reply falls back to 'Unknown'
parsed_response = ai_response.apply(safe_json_loads)
for field in ('sentiment', 'theme', 'response'):
    df_sampled[field] = parsed_response.apply(lambda item, key=field: item.get(key, 'Unknown'))

# Show the first rows together with the new columns
df_sampled.head()


In [None]:
print('----------------------------------------------------')
print("Generating visualizations...")
print('----------------------------------------------------')


def _collect_theme_words(themes):
    """Flatten an iterable of themes into one list of keyword strings.

    A theme may be a list of keywords or a single string (including the
    'Unknown' placeholder). A string is kept whole instead of being split
    into characters; any other value (e.g. None) is skipped.
    """
    words = []
    for theme in themes:
        if isinstance(theme, str):
            words.append(theme)
        elif isinstance(theme, (list, tuple)):
            words.extend(str(keyword) for keyword in theme)
    return words


def _plot_theme_wordcloud(themes, title):
    """Draw a word cloud of ``themes`` under ``title``; skip it when there are no words."""
    text = " ".join(_collect_theme_words(themes))
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when the text yields no plottable word,
        # e.g. when no review has the requested sentiment.
        print(f"No themes to plot for: {title}")
        return
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()


# Sentiment distribution
plt.figure(figsize=(8, 5))
df_sampled['sentiment'].value_counts().plot(kind='pie', wedgeprops=dict(width=0.6))
plt.title("Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Count")
plt.show()

df_positive = df_sampled[df_sampled['sentiment'] == 'Positive']
df_negative = df_sampled[df_sampled['sentiment'] == 'Negative']

# Word cloud for common positive themes
_plot_theme_wordcloud(df_positive['theme'], "Common Themes in Positive Reviews")

# Word cloud for common negative themes
_plot_theme_wordcloud(df_negative['theme'], "Common Themes in Negative Reviews")


print('Negative Reviews:')
df_negative[['title', 'review', 'response']]

In [None]:
# Show the first negative review next to its generated response.
# iloc[0] raises IndexError on an empty frame, so handle the case where no
# review was classified as negative.
if df_negative.empty:
    print('No negative reviews found.')
else:
    first_negative = df_negative.iloc[0]
    print('Review: ', first_negative['review'])
    print('Response: ', first_negative['response'])