In [1]:
!pip install anthropic pandas -q
import os
import pandas as pd
from anthropic import Anthropic
import time
from google.colab import files
from getpass import getpass

# Interactive setup: upload the reviews CSV and build the API client.
# The mapping returned by files.upload() is read later by taking its first
# key as the CSV path.
print("Upload amazon_product_reviews.csv file:")
uploaded_reviews = files.upload()
print("\nGet your API key from: https://console.anthropic.com/settings/keys")
# getpass reads the key without echoing it into the notebook output.
api_key = getpass("Enter your Anthropic API key: ")
# Module-level client; clean_with_claude() sends its requests through it.
client = Anthropic(api_key=api_key)

def clean_with_claude(review_text):
    """Ask Claude to tidy up a single product review.

    The review is embedded in a fixed instruction prompt (spelling and
    grammar, capitalization, emoji/punctuation cleanup, abbreviations) and
    sent through the module-level ``client``.

    Args:
        review_text: Raw review text to clean.

    Returns:
        The model's reply with surrounding whitespace stripped. The original
        ``review_text`` is returned unchanged when the input is blank, when
        the request fails for any reason (the error is printed), or when the
        model replies with no text.
    """
    # Nothing to clean: skip an API call that could only come back with
    # commentary about the missing review.
    if isinstance(review_text, str) and not review_text.strip():
        return review_text

    prompt = f"""Clean this Amazon product review:
Amazon Product Review: {review_text}
Cleaning Tasks:
1. Fix all spelling and grammar errors
2. Standardize capitalization
3. Remove emojis and excessive punctuation
4. Fix common abbreviations
5. Keep the same meaning and sentiment
Return only the cleaned review text and nothing else."""
    try:
        message = client.messages.create(
            model="claude-sonnet-4-20250514", max_tokens=500,
            messages=[{"role": "user", "content": prompt}])
        cleaned = message.content[0].text.strip()
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort a long
        # batch run, so report it and keep the untouched review.
        print(f"Error: {e}")
        return review_text
    # An empty reply would otherwise silently replace the review with "".
    return cleaned or review_text

# Load the uploaded CSV, keep reviews of a workable length, and draw a
# reproducible sample to send to the model.
print("\nLoading dataset...")
# If nothing was uploaded (e.g. the picker was dismissed) there is no first
# key to read; fail with an actionable message instead of an IndexError.
if not uploaded_reviews:
    raise ValueError(
        "No file was uploaded; re-run the cell and select the reviews CSV.")
csv_name = next(iter(uploaded_reviews))
df = pd.read_csv(csv_name)
print(f"Dataset loaded: {len(df)} total reviews")
print(f"Columns: {df.columns.tolist()}\n")
review_column = 'reviews.text'
# Surface a wrong/renamed column here, next to the printed column list.
if review_column not in df.columns:
    raise KeyError(
        f"Column {review_column!r} not found in {csv_name}; "
        f"available columns: {df.columns.tolist()}")

df = df.dropna(subset=[review_column])
# Keep reviews strictly longer than 20 and strictly shorter than 1000
# characters; lengths are computed once and both bounds applied together.
review_lengths = df[review_column].str.len()
df = df[(review_lengths > 20) & (review_lengths < 1000)]
print(f"After filtering: {len(df)} reviews\n")

sample_size = 100
# Fixed random_state so re-runs pick the same reviews; capped at the number
# of rows that survived filtering.
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
# Clean every sampled review, collecting before/after text and lengths.
results = []
# Number of reviews actually being processed. This is smaller than the
# requested sample size when fewer rows survived filtering, so progress is
# reported against it rather than against the requested size.
total_reviews = len(df_sample)

for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
    original_review = str(row[review_column])
    cleaned_review = clean_with_claude(original_review)
    rating = row.get('reviews.rating', None)
    results.append({
        'original_review': original_review,
        'cleaned_review': cleaned_review,
        'rating': rating,
        'original_length': len(original_review),
        'cleaned_length': len(cleaned_review)})
    # Show the first few before/after pairs as a quick sanity check.
    if count <= 5:
        # Missing ratings come through as NaN/None; show them as N/A.
        rating_label = 'N/A' if pd.isna(rating) else rating
        print(f"\nExample {count} (Rating: {rating_label}):")
        print(f"BEFORE: {original_review[:200]}")
        print(f"AFTER:  {cleaned_review[:200]}")
        print("-"*70)
    if count % 20 == 0:
        print(f"\nProcessed {count}/{total_reviews} reviews...")
    # Pause between requests to stay gentle on the API; no need to wait
    # after the final one.
    if count < total_reviews:
        time.sleep(0.5)

# Summarize the run: length statistics and a rough API cost estimate.
df_results = pd.DataFrame(results)
print(f"\nTotal reviews cleaned: {len(df_results)}")

if df_results.empty:
    # With no rows the length columns do not exist, so the statistics below
    # would raise KeyError; report and move on instead.
    print("No reviews were processed; skipping length and cost statistics.")
else:
    mean_original = df_results['original_length'].mean()
    mean_cleaned = df_results['cleaned_length'].mean()
    print(f"Average original length: {mean_original:.0f} characters")
    print(f"Average cleaned length: {mean_cleaned:.0f} characters")

    avg_change = (mean_cleaned - mean_original) / mean_original * 100
    print(f"Average length change: {avg_change:+.1f}%")

    # Rough heuristic of ~4 characters per token. Every request also carries
    # the fixed instruction text wrapped around the review in
    # clean_with_claude (about 306 characters), which must be counted as
    # input or the estimate is far too low for short reviews.
    prompt_overhead_chars = 306
    input_chars = (df_results['original_length'].sum()
                   + prompt_overhead_chars * len(df_results))
    input_tokens = input_chars / 4
    output_tokens = df_results['cleaned_length'].sum() / 4
    # 3 and 15 are USD per million input/output tokens; presumably the
    # model's list price -- verify against current pricing.
    estimated_cost = (input_tokens / 1_000_000) * 3 + (output_tokens / 1_000_000) * 15
    print(f"\nEstimated cost: ${estimated_cost:.2f}")

# Print a few randomly chosen before/after pairs, then export everything.
print("\nRandom sample comparison:")
divider = "-"*70
samples = df_results.sample(n=min(3, len(df_results)))
for _, record in samples.iterrows():
    rating_label = record.get('rating', 'N/A')
    # One print call with newline separators emits the same four lines of
    # output as four consecutive print calls.
    print(
        f"\nRating: {rating_label}",
        f"BEFORE:\n{record['original_review']}\n",
        f"AFTER:\n{record['cleaned_review']}",
        divider,
        sep="\n",
    )

# Write the results next to the notebook and hand the file to the browser.
cleaned_reviews_file = 'amazon_reviews_cleaned.csv'
df_results.to_csv(cleaned_reviews_file, index=False)
print(f"\nResults saved to: {cleaned_reviews_file}")
files.download(cleaned_reviews_file)

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 337.3/337.3 kB 17.1 MB/s eta 0:00:00
Upload amazon_product_reviews.csv file:


Saving amazon_product_reviews.csv to amazon_product_reviews.csv

Get your API key from: https://console.anthropic.com/settings/keys
Enter your Anthropic API key: ··········

Loading dataset...
Dataset loaded: 1597 total reviews
Columns: ['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded', 'dateUpdated', 'dimension', 'ean', 'keys', 'manufacturer', 'manufacturerNumber', 'name', 'prices', 'reviews.date', 'reviews.doRecommend', 'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title', 'reviews.userCity', 'reviews.userProvince', 'reviews.username', 'sizes', 'upc', 'weight']

After filtering: 954 reviews


Example 1 (Rating: nan):
BEFORE: I like the device. I had the Apple TV. THE Amazon is better than apple and Roku. The only network Amazon needs is the WWE Network app. There is a lot of WWE fans out there with Amazon that would love 
AFTER:  I like the device. I had the Apple TV. The Amazon is better than Apple and Roku. The only network Amazon

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>