In [1]:
# Attach Google Drive so the dataset archive and extracted files persist
# across Colab sessions.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


1. Download the Dataset

In [None]:
!kaggle datasets download -d bittlingmayer/amazonreviews


Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 98% 482M/493M [00:04<00:00, 105MB/s]
100% 493M/493M [00:04<00:00, 118MB/s]


In [None]:
# Unzip the dataset
!unzip /content/drive/MyDrive/amazonreviews-dataset/amazonreviews.zip -d /content/drive/MyDrive/amazonreviews-dataset

Archive:  /content/drive/MyDrive/amazonreviews-dataset/amazonreviews.zip
  inflating: /content/drive/MyDrive/amazonreviews-dataset/test.ft.txt.bz2  
  inflating: /content/drive/MyDrive/amazonreviews-dataset/train.ft.txt.bz2  


Set Up Your Environment

In [2]:
!pip install transformers pandas




Load and Decompress the Dataset

In [3]:
import bz2

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from transformers import pipeline

# Dataset location on the mounted Google Drive.
gdrive_dir = '/content/drive/MyDrive/amazonreviews-dataset/'
train_dir = gdrive_dir + 'train.ft.txt.bz2'
test_dir = gdrive_dir + 'test.ft.txt.bz2'


def load_labelled_reviews(path, preview_count=5):
    """Stream a bz2-compressed ``<label> <text>`` file into a DataFrame.

    Each line is stripped and split on its first space: the left part is the
    label (e.g. ``__label__2``), the right part is the review text. Lines
    that do not split into exactly two parts are skipped.

    The file is read line by line, so the raw lines are never all held in
    memory next to the parsed rows.

    Args:
        path: Path to the ``.bz2`` text file.
        preview_count: Number of leading (stripped) lines to return for
            inspection.

    Returns:
        A tuple ``(frame, total_lines, preview)`` where ``frame`` has the
        columns ``sentiment`` and ``reviewText``, ``total_lines`` is the
        number of lines read, and ``preview`` is the list of the first
        ``preview_count`` stripped lines.
    """
    rows = []
    preview = []
    total_lines = 0
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            total_lines += 1
            text = line.strip()
            if len(preview) < preview_count:
                preview.append(text)
            # Split only on the first space to separate label from review.
            parts = text.split(' ', 1)
            if len(parts) == 2:
                rows.append(parts)
    frame = pd.DataFrame(rows, columns=['sentiment', 'reviewText'])
    return frame, total_lines, preview


df, total_lines, preview_lines = load_labelled_reviews(train_dir)

# Check the total lines and the first few entries
print("Total lines in dataset:", total_lines)
for line in preview_lines:
    print(line)

# Report dropped lines instead of discarding them silently.
skipped_lines = total_lines - len(df)
if skipped_lines:
    print("Skipped malformed lines:", skipped_lines)

# Check the resulting DataFrame
print("DataFrame shape:", df.shape)
print("DataFrame head:")
print(df.head())

Total lines in dataset: 3600000
__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.
__label__

Preprocess the Data: Map the raw labels to readable sentiment names. The first column of each line holds the sentiment label (`__label__1` = negative, `__label__2` = positive).

In [4]:
# Map the raw dataset labels to readable class names.
sentiment_mapping = {
    '__label__1': 'negative',
    '__label__2': 'positive'
}

df['mapped_sentiment'] = df['sentiment'].map(sentiment_mapping)

# Series.map leaves NaN for any label missing from the mapping. Fail loudly
# here rather than letting NaN targets reach the evaluation step.
unmapped_labels = df.loc[df['mapped_sentiment'].isna(), 'sentiment'].unique()
if len(unmapped_labels):
    raise ValueError(f"Unmapped sentiment labels: {list(unmapped_labels)}")

print(df['mapped_sentiment'])


0          positive
1          positive
2          positive
3          positive
4          positive
             ...   
3599995    negative
3599996    negative
3599997    negative
3599998    negative
3599999    positive
Name: mapped_sentiment, Length: 3600000, dtype: object


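Load the Pre-trained Sentiment Analysis Model: Use the nlptown/bert-base-multilingual-uncased-sentiment model for sentiment analysis. Note that this model predicts a star rating ("1 star" to "5 stars") rather than a negative/positive label.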

In [5]:
from transformers import pipeline

# Hugging Face Hub identifier of the pre-trained review classifier.
sentiment_model_id = "nlptown/bert-base-multilingual-uncased-sentiment"

# Build the sentiment-analysis pipeline around that checkpoint.
sentiment_pipeline = pipeline(task="sentiment-analysis", model=sentiment_model_id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Analyze Sentiment: Use the model to classify the sentiment of the reviews.

In [None]:
# --- Configuration ---------------------------------------------------------
# Number of reviews to score. None scores every row of df (very slow on the
# full dataset); set e.g. 10_000 for a quick, reproducible sample.
EVAL_SAMPLE_SIZE = None
EVAL_RANDOM_STATE = 42
BATCH_SIZE = 32
MAX_TOKENS = 512  # BERT cannot encode sequences longer than this


def stars_to_sentiment(star_label):
    """Collapse a star-rating label into a coarse sentiment class.

    The model emits labels of the form ``'1 star'`` .. ``'5 stars'``, while
    the ground truth is ``'negative'`` / ``'positive'``. The dataset has no
    neutral class, so a 3-star prediction is kept as ``'neutral'`` and
    therefore always counts as a miss in the metrics below.

    Args:
        star_label: Label string whose first token is the number of stars.

    Returns:
        ``'negative'`` for 1-2 stars, ``'neutral'`` for 3, ``'positive'``
        for 4-5.

    Raises:
        ValueError: If the label does not start with an integer.
    """
    stars = int(star_label.split()[0])
    if stars <= 2:
        return 'negative'
    if stars >= 4:
        return 'positive'
    return 'neutral'


# Choose which rows to score.
if EVAL_SAMPLE_SIZE is None or EVAL_SAMPLE_SIZE >= len(df):
    eval_index = df.index
else:
    eval_index = df.sample(n=EVAL_SAMPLE_SIZE, random_state=EVAL_RANDOM_STATE).index

# Prepare the reviews for sentiment analysis
review_texts = df.loc[eval_index, 'reviewText'].tolist()
print("Reviews to score:", len(review_texts))
print(review_texts[:3])  # preview only; never dump the whole list

# Analyze sentiment. truncation/max_length stop long reviews from exceeding
# the model's input limit; batch_size lets the pipeline process in batches.
predictions = sentiment_pipeline(
    review_texts,
    batch_size=BATCH_SIZE,
    truncation=True,
    max_length=MAX_TOKENS,
)

# Extract labels and scores. Building an indexed frame first lets the column
# assignment align on the index, so unscored rows (when sampling) stay NaN.
scored = pd.DataFrame(
    {
        'predicted_stars': [pred['label'] for pred in predictions],
        'predicted_sentiment': [stars_to_sentiment(pred['label']) for pred in predictions],
        'predicted_score': [pred['score'] for pred in predictions],
    },
    index=eval_index,
)
for column in scored.columns:
    df[column] = scored[column]

# Classes that exist in the ground truth.
target_names = ['negative', 'positive']

# Evaluate the model on the scored rows only.
evaluated = df.loc[eval_index]
y_true = evaluated['mapped_sentiment']
y_pred = evaluated['predicted_sentiment']

print("Mapped Sentiment:", y_true.unique())
print("Predicted Sentiment:", y_pred.unique())
# classification_report: precision, recall, F1-score, and support per class.
# labels= restricts the report to the ground-truth classes; 'neutral'
# predictions still count against recall.
print(classification_report(
    y_true,
    y_pred,
    labels=target_names,
    target_names=target_names,
    zero_division=0,
))

# Display the first few results
print(evaluated[['reviewText', 'mapped_sentiment', 'predicted_sentiment', 'predicted_score']].head())

# Calculate and print overall accuracy
# accuracy_score: fraction of scored reviews whose prediction equals the label.
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Persist the reviews together with their labels and predictions as CSV
# (row index omitted).
results_csv_path = 'amazon_reviews_with_sentiment.csv'
df.to_csv(results_csv_path, index=False)
