In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Let's scrape reviews for a book from 'books.toscrape.com'
# NOTE: Real e-commerce sites are much harder to scrape. This is for learning.
base_url = 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/reviews.html'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(base_url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all review blocks
reviews = soup.find_all('article', class_='product_pod') # This class is for products, but we'll use it as an example. 
# On a real review page, you'd inspect the HTML to find the correct tags and classes for reviews.

# For this example, let's create some dummy data as if we scraped it
review_data = [
    {'rating': 5, 'text': 'This is the best book I have ever read! Absolutely captivating.'},
    {'rating': 4, 'text': 'A really great read, I enjoyed it a lot and would recommend.'},
    {'rating': 3, 'text': 'It was an okay book, not bad but not memorable either.'},
    {'rating': 2, 'text': 'I was disappointed. The plot was weak and the characters were flat.'},
    {'rating': 1, 'text': 'Terrible. I could not finish it. A complete waste of time and money.'}
]

df = pd.DataFrame(review_data)
df.to_csv('data/scraped_reviews.csv', index=False)

print("Dummy data saved to data/scraped_reviews.csv")

Dummy data saved to data/scraped_reviews.csv


In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Load the data
df = pd.read_csv('data/scraped_reviews.csv')

# 1. Create sentiment labels from ratings
def get_sentiment(rating):
    if rating >= 4:
        return 'Positive'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Negative'

df['sentiment'] = df['rating'].apply(get_sentiment)

# 2. Clean the text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    tokens = word_tokenize(text)  # Tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] # Lemmatize and remove stopwords
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(clean_text)

print(df.head())

   rating                                               text sentiment  \
0       5  This is the best book I have ever read! Absolu...  Positive   
1       4  A really great read, I enjoyed it a lot and wo...  Positive   
2       3  It was an okay book, not bad but not memorable...   Neutral   
3       2  I was disappointed. The plot was weak and the ...  Negative   
4       1  Terrible. I could not finish it. A complete wa...  Negative   

                                      cleaned_text  
0       best book ever read absolutely captivating  
1    really great read enjoyed lot would recommend  
2                   okay book bad memorable either  
3            disappointed plot weak character flat  
4  terrible could finish complete waste time money  


In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Load the data
df = pd.read_csv('data/scraped_reviews.csv')

# 1. Create sentiment labels from ratings
def get_sentiment(rating):
    if rating >= 4:
        return 'Positive'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Negative'

df['sentiment'] = df['rating'].apply(get_sentiment)

# 2. Clean the text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    tokens = word_tokenize(text)  # Tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] # Lemmatize and remove stopwords
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(clean_text)

print(df.head())

   rating                                               text sentiment  \
0       5  This is the best book I have ever read! Absolu...  Positive   
1       4  A really great read, I enjoyed it a lot and wo...  Positive   
2       3  It was an okay book, not bad but not memorable...   Neutral   
3       2  I was disappointed. The plot was weak and the ...  Negative   
4       1  Terrible. I could not finish it. A complete wa...  Negative   

                                      cleaned_text  
0       best book ever read absolutely captivating  
1    really great read enjoyed lot would recommend  
2                   okay book bad memorable either  
3            disappointed plot weak character flat  
4  terrible could finish complete waste time money  


In [4]:
from transformers import pipeline

# Load the pre-trained sentiment analysis model
# This model is great for general-purpose sentiment analysis.
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Test it on a sample review
sample_review = df['cleaned_text'].iloc[0]
result = sentiment_pipeline(sample_review)

print(f"Review: '{sample_review}'")
print(f"Predicted Sentiment: {result}")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


Review: 'best book ever read absolutely captivating'
Predicted Sentiment: [{'label': 'POSITIVE', 'score': 0.9998810291290283}]
