In [27]:
# scrape real text from a webpage and analyze sentiment
import requests
from bs4 import BeautifulSoup
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, BatchNormalization, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import pipeline

In [4]:
def scrape_text(url):
    try: 
        response = requests.get(url, timeout = 10)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove unwanted script and style tags 
        for script in soup(["script", "style"]):
            script.decompose()
        # Extract visible text and clean whitespace
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        #return first 500 characters for demo 
        return text[:500]
    except Exception as e:
        return f"Error scraping : {str(e)}"
    

In [36]:
scraped_text= scrape_text("https://status.net/articles/examples-performance-reviews-good-satisfactory-poor/")


In [11]:
texts = [
    "This product is amazing! Highly recommend",
    "Worst purchase ever. Complete waste of money",
    "It's okay, nothing special but works fine",
    "Absolutely love it! Best quality",
    "Terrible experience. Very disappointed"
]

In [12]:
sentiments = [1,0,1,1,0]
# Labels : 1 = positive, 0 = negative

In [13]:
# TEXT PREPROCESSING FOR LSTM

In [16]:
max_words = 500 # Only keep 500 most frequent words
max_len = 20 # Limit each review to 20 words

In [19]:
# Tokenize text convert words - integer
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)


In [20]:
# Convert sentences to padded sequences 
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen = max_len)
y = np.array(sentiments)

In [28]:
model1 = Sequential([
    Embedding(max_words, 128, input_length=max_len),    # Higher embedding dimension
    SpatialDropout1D(0.3),                              # Drops entire word embeddings to improve robustness
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    BatchNormalization(),                               # Normalizes activations to speed up learning
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),       # Second LSTM layer for deeper temporal learning
    Dense(64, activation='relu', kernel_regularizer='l2'),
    Dropout(0.4),
    Dense(32, activation='relu', kernel_regularizer='l2'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')                      # Binary classification output
])



In [30]:
model1.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [32]:
# TRAIN THE MODEL 
history1 = model1.fit(X, y, epochs = 20, 
         batch_size = 2, # number of samples processed before the model updates, 
         validation_split = 0.2, # use 20% of training for validation
         verbose =1 # show progress during training)
         )

Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 1.0000 - loss: 1.6487 - val_accuracy: 0.0000e+00 - val_loss: 1.7228
Epoch 2/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.5000 - loss: 1.5771 - val_accuracy: 0.0000e+00 - val_loss: 1.7158
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 1.0000 - loss: 1.5193 - val_accuracy: 0.0000e+00 - val_loss: 1.7071
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.5000 - loss: 1.7790 - val_accuracy: 0.0000e+00 - val_loss: 1.7016
Epoch 5/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8333 - loss: 1.5386 - val_accuracy: 0.0000e+00 - val_loss: 1.6995
Epoch 6/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.5000 - loss: 1.8779 - val_accuracy: 0.0000e+00 - val_loss: 1.6940
Epoch 7/20
[1m2/2[0m 

In [34]:
# TRANSFORMERS (PRETRAINED )
transformer_sentiment = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")




Device set to use cpu


In [38]:
lstm_model = Sequential([
    Embedding(max_words, 16, input_length=max_len),
    LSTM(32),
    Dense(1, activation='sigmoid')
])



In [41]:
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [44]:
# LSTM PREDICTION 
test_seq = tokenizer.texts_to_sequences([scraped_text])
test_pad = pad_sequences(test_seq, maxlen = max_len)
lstm_pred = lstm_model.predict(test_pad, verbose = 0
                          )
lstm_sentiment = "Positive" if lstm_pred[0] > 0.5 else "Negative"
lstm_sentiment

'Negative'

In [51]:
transformer_result = transformer_sentiment(scraped_text[:512])[0]
print(f" LSTM Sentiment -> {lstm_sentiment} ({lstm_pred[0][0]:.2f})")

 LSTM Sentiment -> Negative (0.50)


In [52]:
print(f"  Transformer Sentiment → {transformer_result['label']} ({transformer_result['score']:.2f})")


  Transformer Sentiment → POSITIVE (0.98)


In [60]:
sample_reviews = [
    "Great product, works perfectly!",
    "Overall no complaints, decent for the price.",
    "Absolutely fantastic! I loved the design and performance.",
    "Disappointed with the quality, expected much better.",
    "Terrible customer service, won’t buy again.",
    "It's okay, does the job but nothing special.",
    "Very easy to use and setup, highly recommend!",
    "Not worth the money, broke after a week.",
    "Quality is average, but delivery was fast.",
    "Excellent value for money. Will purchase again!",
    "Horrible experience. The item arrived damaged.",
    "Amazing sound quality and comfortable fit.",
    "Mediocre at best, I’ve used better alternatives.",
    "Loved it! Exceeded my expectations.",
    "It stopped working after a few days — waste of money."
]

In [61]:
for review in sample_reviews: 
    seq= tokenizer.texts_to_sequences([review])
    pad = pad_sequences(seq, maxlen = max_len)
    lstm_p = lstm_model.predict(pad, verbose = 1)[0][0]
    trans_p = transformer_sentiment(review)[0]
    print(f"Review: {review}")
    print(f"LSTM {'Positive' if lstm_p > 0.5 else 'Negative'} ({lstm_p:.2f})")
    print(f"Transformer {trans_p['label']} ({trans_p['score']: .2f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Review: Great product, works perfectly!
LSTM Negative (0.50)
Transformer POSITIVE ( 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Review: Overall no complaints, decent for the price.
LSTM Positive (0.50)
Transformer POSITIVE ( 0.98)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Review: Absolutely fantastic! I loved the design and performance.
LSTM Positive (0.50)
Transformer POSITIVE ( 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Review: Disappointed with the quality, expected much better.
LSTM Negative (0.50)
Transformer NEGATIVE ( 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Review: Terrible customer service, won’t buy again.
LSTM Positive (0.50)
Transformer NEGATIVE ( 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Review: It's okay, does the job but nothi