In [38]:
# Import libraries
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

In [39]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [40]:
# Load your dataset
df = pd.read_excel('/content/Book1.xlsx')

In [41]:
# Drop rows with missing data
df.dropna(subset=['review', 'sentiment'], inplace=True)

In [42]:
# Remove duplicates
df.drop_duplicates(subset=['review'], inplace=True)

In [43]:
# Balance dataset
min_count = df['sentiment'].value_counts().min()
df = df.groupby('sentiment').sample(n=min_count, random_state=42)
df = shuffle(df, random_state=42).reset_index(drop=True)


In [44]:
# Lowercase and remove punctuation
df['review'] = df['review'].str.lower().apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [46]:
# Stopwords without removing 'not'
stop_words = set(stopwords.words('english'))
exclusion_words = ['not','no', 'never', 'none', 'nobody', 'nothing', 'neither',
                      'nowhere', 'hardly', 'scarcely', 'barely', 'doesnt', 'dont',
                      'cant', 'couldnt', 'wont', 'wouldnt', 'shouldnt', 'isnt',
                      'wasnt', 'werent', 'hasnt', 'hadnt']

for word in exclusion_words:
    stop_words.discard(word)

In [47]:
# Tokenization and stopwords removal
df['review'] = df['review'].apply(word_tokenize)
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x if word not in stop_words]))

In [48]:
# Lemmatization with POS tagging
lemmatizer = WordNetLemmatizer()

In [49]:
def get_wordnet_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

In [50]:
def lemmatize_text(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    return ' '.join([lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags])

df['review'] = df['review'].apply(lemmatize_text)

In [51]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=3000)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [52]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
# Use Logistic Regression for better precision
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [54]:
# Predictions and evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9333333333333333
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.97      0.94        39
     neutral       0.91      0.96      0.93        45
    positive       0.98      0.88      0.93        51

    accuracy                           0.93       135
   macro avg       0.93      0.94      0.93       135
weighted avg       0.94      0.93      0.93       135



In [55]:
# Function to clean and predict new input
def preprocess_and_predict(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    text = lemmatize_text(text)
    features = vectorizer.transform([text])
    return model.predict(features)[0]

In [63]:
# Input for real-time prediction
new_review = input("Enter your review: ")
predicted_sentiment = preprocess_and_predict(new_review)
print("Predicted Sentiment:", predicted_sentiment)

Enter your review: bad
Predicted Sentiment: negative
