In [88]:
# Import required libraries
import pandas as pd
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

In [89]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [90]:
# Load dataset
df = pd.read_excel('/content/Book1.xlsx')

In [91]:
# Data cleaning
df.dropna(subset=['review', 'sentiment'], inplace=True)
df.drop_duplicates(subset=['review'], inplace=True)

In [92]:
# Balance dataset
min_count = df['sentiment'].value_counts().min()
df = df.groupby('sentiment').sample(n=min_count, random_state=42)
df = shuffle(df, random_state=42).reset_index(drop=True)


In [93]:
# Enhanced sarcasm detection function
def detect_sarcasm(text):
    sarcasm_keywords = [
        # Common phrases
        "just what i needed", "truly revolutionary", "absolutely nothing",
        "how wonderful", "i love how it", "what a surprise", "exactly what i hoped for",
        "nailed it", "couldn't be worse", "brilliant work", "flawless fail", "amazing crash",
        # Ironic phrases
        "great job crashing", "awesome bug", "smooth experience (not)", "fantastic... not",
        "because why not", "works like a charm", "perfectly useless", "so helpful", "thanks a lot",
        # Contradictions
        "beautiful design but", "sleek and fast but", "fast but useless", "love it when it fails",
        "excellent bug collection", "fails beautifully", "didn't expect much and still disappointed",
        # Tone indicators
        "as if", "yeah right", "sure thing", "whatever", "big surprise"
    ]

    # Check for exaggerated positive words in negative context
    positive_words = ["amazing", "awesome", "fantastic", "perfect", "brilliant", "wonderful"]
    negative_context = ["crash", "fail", "bug", "slow", "broken", "useless", "terrible"]

    text_lower = text.lower()

    # Check for sarcasm keywords
    if any(phrase in text_lower for phrase in sarcasm_keywords):
        return True

    # Check for exaggerated positive words in negative context
    if any(word in text_lower for word in positive_words) and any(word in text_lower for word in negative_context):
        return True

    # Check for contrastive conjunctions (but, however, although)
    if re.search(r'\b(but|however|although|yet)\b', text_lower, re.IGNORECASE):
        return True

    return False

In [94]:
# Apply sarcasm detection
df['is_sarcastic'] = df['review'].apply(detect_sarcasm)


In [95]:
# Text preprocessing
def preprocess_text(text):
    # Lowercase and remove punctuation (keeping emoticons)
    text = text.lower()
    text = re.sub(r'[^\w\s\'!?]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Custom stopwords removal (preserving negation words)
    stop_words = set(stopwords.words('english'))
    exclusion_words = ['not','no', 'never', 'none', 'nobody', 'nothing', 'neither',
                      'nowhere', 'hardly', 'scarcely', 'barely', 'doesnt', 'dont',
                      'cant', 'couldnt', 'wont', 'wouldnt', 'shouldnt', 'isnt',
                      'wasnt', 'werent', 'hasnt', 'hadnt']

    for word in exclusion_words:
        stop_words.discard(word)

    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization with POS tagging
    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(tokens)

    lemmatized_tokens = []
    for word, tag in pos_tags:
        pos = get_wordnet_pos(tag)
        lemmatized_tokens.append(lemmatizer.lemmatize(word, pos))

    return ' '.join(lemmatized_tokens)

In [96]:
def get_wordnet_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

In [97]:
# Apply preprocessing
df['processed_review'] = df['review'].apply(preprocess_text)

In [98]:
# Map sentiments to numerical labels
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
inv_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
df['label'] = df['sentiment'].map(label_map)

In [99]:
# Feature extraction with TF-IDF (including bigrams)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df['processed_review'])
y = df['label']

In [100]:
# Train-test split
X_train, X_test, y_train, y_test, sarcasm_train, sarcasm_test = train_test_split(
    X, y, df['is_sarcastic'], test_size=0.2, random_state=42)

In [101]:
# Train model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

In [102]:
# Predict with sarcasm adjustment
raw_preds = model.predict(X_test)

In [103]:
# Adjust predictions for sarcastic reviews
final_preds = []
for pred, is_sarcastic in zip(raw_preds, sarcasm_test.tolist()):
    if is_sarcastic and pred != 0:  # If sarcastic and not already negative
        final_preds.append(0)  # Force to negative
    else:
        final_preds.append(pred)

In [104]:
# Evaluation
print("\nClassification Report (After Sarcasm Correction):\n")
print(classification_report(y_test, final_preds, target_names=['negative', 'neutral', 'positive']))
print("Accuracy:", accuracy_score(y_test, final_preds))



Classification Report (After Sarcasm Correction):

              precision    recall  f1-score   support

    negative       0.76      0.97      0.85        39
     neutral       0.90      0.78      0.83        45
    positive       0.98      0.88      0.93        51

    accuracy                           0.87       135
   macro avg       0.88      0.88      0.87       135
weighted avg       0.89      0.87      0.87       135

Accuracy: 0.8740740740740741


In [105]:
# Interactive prediction function
def predict_sentiment():
    print("\nEnter your review (type 'exit' to stop):")
    while True:
        user_review = input("Your Review: ")
        if user_review.lower() == 'exit':
            break

        # Detect sarcasm
        is_sarcastic = detect_sarcasm(user_review)

        # Preprocess text
        processed_text = preprocess_text(user_review)
        features = vectorizer.transform([processed_text])

        # Predict
        pred = model.predict(features)[0]

        # Adjust for sarcasm
        if is_sarcastic and pred != 0:
            pred = 0

        # Get confidence scores
        confidence_scores = model.predict_proba(features)[0]
        confidence = round(confidence_scores[pred] * 100, 2)

        # Get sentiment label
        label = inv_label_map[pred]

        # Print results
        print(f"→ Prediction: {label} (confidence: {confidence}%)")
        print(f"→ Sarcasm detected: {is_sarcastic}")

        # If sarcastic, show original prediction
        if is_sarcastic:
            original_pred = model.predict(features)[0]
            print(f"→ Original prediction (before sarcasm adjustment): {inv_label_map[original_pred]}")

        print()

In [106]:
# Run interactive prediction
predict_sentiment()


Enter your review (type 'exit' to stop):
Your Review: fuck you all
→ Prediction: negative (confidence: 37.63%)
→ Sarcasm detected: False

Your Review: bad
→ Prediction: neutral (confidence: 38.75%)
→ Sarcasm detected: False

Your Review: good
→ Prediction: neutral (confidence: 90.16%)
→ Sarcasm detected: False

Your Review: very good
→ Prediction: neutral (confidence: 90.16%)
→ Sarcasm detected: False

Your Review: love it
→ Prediction: positive (confidence: 77.4%)
→ Sarcasm detected: False

Your Review: hate it
→ Prediction: negative (confidence: 37.63%)
→ Sarcasm detected: False

Your Review: This app is perfect!
→ Prediction: positive (confidence: 84.65%)
→ Sarcasm detected: False

Your Review: It keeps crashing
→ Prediction: negative (confidence: 59.45%)
→ Sarcasm detected: False

Your Review: Does what it needs to do
→ Prediction: neutral (confidence: 46.55%)
→ Sarcasm detected: False

Your Review: Oh great, another update that made it slower. Just what I needed
→ Prediction: neg