In [35]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For converting text into vecors
from sklearn.feature_extraction.text import TfidfVectorizer

# Getting a simple regression model to train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [39]:
# Make sure the NLTK stop-word corpus is present locally, then keep the
# English list as a set so the membership checks done while cleaning are fast.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hughes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Load first 50,000 samples to save memory
def load_data(filename, num_samples=50000):
    """Read labelled reviews from a text file into a DataFrame.

    Every usable line has the form ``<label> <text>``, split at the first
    space. The label ``__label__2`` becomes sentiment 1; any other
    ``__label__...`` tag becomes sentiment 0. Lines that are blank, have no
    text after the label, or do not start with a ``__label__`` tag are
    skipped instead of being recorded as negative reviews.

    Args:
        filename: Path of the UTF-8 encoded file to read.
        num_samples: Maximum number of samples to return. ``None`` reads the
            whole file. Skipped lines do not count towards the limit.

    Returns:
        pandas.DataFrame with a "reviews" column (review text) and a
        "sentiment" column (1 or 0), one row per accepted line.
    """
    label_prefix = "__label__"
    reviews = []
    sentiments = []

    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Stop as soon as enough valid samples have been collected.
            if num_samples is not None and len(reviews) >= num_samples:
                break

            # Split at the first space: label on the left, review on the right.
            label, _, text = line.strip().partition(" ")
            if not text or not label.startswith(label_prefix):
                continue  # blank or malformed line

            reviews.append(text)
            sentiments.append(1 if label == "__label__2" else 0)

    return pd.DataFrame({"reviews": reviews, "sentiment": sentiments})

# Directory that holds train.ft.txt and test.ft.txt. Set the AMAZON_DATA_DIR
# environment variable to point somewhere else instead of editing this cell;
# the fallback is the location this notebook was originally run from.
DATA_DIR = os.environ.get("AMAZON_DATA_DIR", "/Users/hughes/Downloads/amazon_data")

# Load both train and test data (each capped at load_data's default num_samples)
df_train = load_data(os.path.join(DATA_DIR, "train.ft.txt"))
df_test = load_data(os.path.join(DATA_DIR, "test.ft.txt"))

# Show the first few rows
print(df_train.head())

                                             reviews  sentiment
0  Stuning even for the non-gamer: This sound tra...          1
1  The best soundtrack ever to anything.: I'm rea...          1
2  Amazing!: This soundtrack is my favorite music...          1
3  Excellent Soundtrack: I truly like this soundt...          1
4  Remember, Pull Your Jaw Off The Floor After He...          1


In [42]:
def clean_text(text):
    """Lower-case a review, drop non-letter characters and remove stop words.

    Characters other than ASCII letters and whitespace are deleted (not
    replaced by a space), so "non-gamer" becomes "nongamer" and "I'm"
    becomes "im". The remaining whitespace-separated tokens are filtered
    against the module-level ``stop_words`` set and re-joined with single
    spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Add a cleaned copy of the review text to the train and test frames alike
for frame in (df_train, df_test):
    frame["clean_reviews"] = frame["reviews"].apply(clean_text)

# Target labels: y_train for fitting, y_test for evaluation
y_train, y_test = df_train["sentiment"], df_test["sentiment"]

# Compare raw and cleaned text for the first few training rows
comparison_columns = ["reviews", "clean_reviews"]
print(df_train[comparison_columns].head())

                                             reviews  \
0  Stuning even for the non-gamer: This sound tra...   
1  The best soundtrack ever to anything.: I'm rea...   
2  Amazing!: This soundtrack is my favorite music...   
3  Excellent Soundtrack: I truly like this soundt...   
4  Remember, Pull Your Jaw Off The Floor After He...   

                                       clean_reviews  
0  stuning even nongamer sound track beautiful pa...  
1  best soundtrack ever anything im reading lot r...  
2  amazing soundtrack favorite music time hands i...  
3  excellent soundtrack truly like soundtrack enj...  
4  remember pull jaw floor hearing youve played g...  


In [43]:
# Turn the cleaned text into numbers
# ML models only work on numeric input, so each review becomes a TF-IDF
# vector over a vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted vocabulary to encode the test reviews.
train_corpus, test_corpus = df_train["clean_reviews"], df_test["clean_reviews"]
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)


In [44]:
# Fit a logistic-regression classifier (default settings) on the TF-IDF features
model = LogisticRegression().fit(X_train, y_train)

# Predict a sentiment label for every test review
y_pred = model.predict(X_test)

# Accuracy = share of test reviews whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8744


In [45]:
# Share of each sentiment class among the test-set labels.
# The labels are binary (1 = positive, 0 = negative); "neutral" is whatever
# is left over, which is zero for this data.
total_reviews = len(df_test)
positive_reviews = df_test[df_test['sentiment'] == 1]
negative_reviews = df_test[df_test['sentiment'] == 0]
neutral_count = total_reviews - len(positive_reviews) - len(negative_reviews)

# Each percentage is derived from an integer count. Computing the neutral
# share as 100 - positive - negative accumulates floating-point error and can
# print as "-0.00%".
if total_reviews:
    positive_percentage = len(positive_reviews) / total_reviews * 100
    negative_percentage = len(negative_reviews) / total_reviews * 100
    neutral_percentage = neutral_count / total_reviews * 100
else:
    # An empty test frame has no distribution; report zeros instead of dividing by zero.
    positive_percentage = negative_percentage = neutral_percentage = 0.0


print(f"Positive Sentiment: {positive_percentage:.2f}%")
print(f"Negative Sentiment: {negative_percentage:.2f}%")
print(f"Neutral Sentiment: {neutral_percentage:.2f}%")

Positive Sentiment: 50.75%
Negative Sentiment: 49.25%
Neutral Sentiment: -0.00%


In [46]:
# Extraction of key themes
# Fit a separate TF-IDF model whose vocabulary is limited to 10 terms
vectorized = TfidfVectorizer(max_features=10)
X = vectorized.fit_transform(df_train["clean_reviews"])

# Vocabulary terms, in the same order as the columns of X
words = np.array(vectorized.get_feature_names_out())

# Total TF-IDF weight of each term, summed over all training reviews
tfidf_scores = np.array(X.sum(axis=0)).flatten()

# Column indices ordered from highest to lowest total weight (at most 10)
ranking = tfidf_scores.argsort()[::-1][:10]
top_words = [words[idx] for idx in ranking]
top_scores = [tfidf_scores[idx] for idx in ranking]

print(f"Top 10 Key Themes/Words in Reviews: {top_words}")

Top 10 Key Themes/Words in Reviews: ['book', 'one', 'great', 'good', 'like', 'movie', 'would', 'read', 'time', 'get']


In [47]:
def sentiment_analysis_report(df_train, df_test, top_n=10):
    """Print a sentiment-distribution and key-theme report.

    Args:
        df_train: DataFrame with a "clean_reviews" text column; used to
            extract the key themes.
        df_test: DataFrame with a "sentiment" column (1 = positive,
            0 = negative); used for the sentiment distribution. Rows with
            any other value are reported as neutral.
        top_n: Number of key themes to list (default 10).

    Returns:
        None. The report is written to standard output.
    """
    # --- Sentiment distribution -------------------------------------------
    total_reviews = len(df_test)
    positive_count = int((df_test['sentiment'] == 1).sum())
    negative_count = int((df_test['sentiment'] == 0).sum())
    neutral_count = total_reviews - positive_count - negative_count

    def as_percentage(count):
        # Work from integer counts so an empty class is exactly 0.0 (never
        # "-0.00"), and an empty frame yields zeros instead of dividing by zero.
        return count / total_reviews * 100 if total_reviews else 0.0

    positive_percentage = as_percentage(positive_count)
    negative_percentage = as_percentage(negative_count)
    neutral_percentage = as_percentage(neutral_count)

    # --- Key themes -------------------------------------------------------
    # NOTE(review): max_features keeps only the top_n most frequent terms
    # before scoring, so this ranks the most frequent words by their summed
    # TF-IDF weight rather than searching the whole vocabulary.
    vectorizer = TfidfVectorizer(max_features=top_n)
    X = vectorizer.fit_transform(df_train["clean_reviews"])
    words = vectorizer.get_feature_names_out()
    tfidf_scores = np.asarray(X.sum(axis=0)).ravel()

    # One ranking drives both word and score, so each word is always printed
    # next to its own total.
    ranking = tfidf_scores.argsort()[::-1][:top_n]
    top_themes = [(words[idx], tfidf_scores[idx]) for idx in ranking]

    # --- Report output ----------------------------------------------------
    print("="*50)
    # Print the blank line separately: a "\n" inside the centred string
    # counts as a character and pushes the padding onto the wrong line.
    print()
    print("📊 SENTIMENT ANALYSIS REPORT 📊".center(50))
    print("="*50 + "\n")

    print("📈 SENTIMENT DISTRIBUTION")
    print("-"*30)
    print(f"📗 Positive Reviews: {positive_percentage:>6.2f}%")
    print(f"📕 Negative Reviews: {negative_percentage:>6.2f}%")
    print(f"📘 Neutral Reviews:  {neutral_percentage:>6.2f}%")

    print(f"\n🔑 TOP {top_n} KEY THEMES")
    print("-"*30)
    for rank, (word, score) in enumerate(top_themes, 1):
        print(f"{rank:2d}. {word:<15} | Score: {score:.4f}")

    print("\n" + "="*50)

# Print the report: key themes come from the training reviews, the sentiment
# distribution from the test labels
sentiment_analysis_report(df_train=df_train, df_test=df_test)

          
📊 SENTIMENT ANALYSIS REPORT 📊          

📈 SENTIMENT DISTRIBUTION
------------------------------
📗 Positive Reviews:  50.75%
📕 Negative Reviews:  49.25%
📘 Neutral Reviews:    0.00%

🔑 TOP 10 KEY THEMES
------------------------------
 1. book            | Score: 10209.0684
 2. one             | Score: 7793.6119
 3. great           | Score: 6704.3770
 4. good            | Score: 6239.9403
 5. like            | Score: 6222.4218
 6. movie           | Score: 5559.9327
 7. would           | Score: 5261.8505
 8. read            | Score: 5098.5915
 9. time            | Score: 4576.2034
10. get             | Score: 4486.4289

