In [1]:
!pip install nltk



In [2]:
# Import required libraries
import nltk
from nltk.corpus import movie_reviews
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
import random


In [11]:
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

print("Total Documents:", len(documents))
print("Sample Document Words:", documents[0][0][:20])
print("Category:", documents[0][1])


Total Documents: 2000
Sample Document Words: ['of', 'circumcision', ',', 'psychic', 'wounds', 'and', 'the', 'family', 'sitcom', 'the', 'opening', 'segment', 'is', 'something', 'of', 'a', 'foretaste', 'of', 'this', 'film']
Category: pos


In [13]:
all_words = FreqDist(w.lower() for w in movie_reviews.words())

word_features = list(all_words)[:2000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
featuresets = [(find_features(rev), category) for (rev, category) in documents]

print("Total Feature Sets:", len(featuresets))


Total Feature Sets: 2000


In [14]:
train_size = int(len(featuresets) * 0.8)
train_set = featuresets[:train_size]
test_set = featuresets[train_size:]

print("Training samples:", len(train_set))
print("Testing samples:", len(test_set))


Training samples: 1600
Testing samples: 400


In [15]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy = nltk_accuracy(classifier, test_set)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 82.25%


In [16]:
classifier.show_most_informative_features(10)


Most Informative Features
             outstanding = True              pos : neg    =     14.4 : 1.0
                  seagal = True              neg : pos    =     13.0 : 1.0
                   mulan = True              pos : neg    =      8.3 : 1.0
             wonderfully = True              pos : neg    =      6.4 : 1.0
               fantastic = True              pos : neg    =      6.2 : 1.0
                  poorly = True              neg : pos    =      5.3 : 1.0
                     era = True              pos : neg    =      5.1 : 1.0
                   awful = True              neg : pos    =      5.1 : 1.0
              ridiculous = True              neg : pos    =      5.0 : 1.0
                    lame = True              neg : pos    =      5.0 : 1.0


In [17]:
from nltk.tokenize import word_tokenize

def predict_sentiment(text):
    words = word_tokenize(text)
    features = find_features(words)
    return classifier.classify(features)

# Test examples
print("\nCustom Predictions:")
print("1. 'This movie was amazing and full of joy.' →", predict_sentiment("This movie was amazing and full of joy."))
print("2. 'The plot was boring and the acting was terrible.' →", predict_sentiment("The plot was boring and the acting was terrible."))
print("3. 'It was okay, not too bad but not great either.' →", predict_sentiment("It was okay, not too bad but not great either."))



Custom Predictions:
1. 'This movie was amazing and full of joy.' → neg
2. 'The plot was boring and the acting was terrible.' → neg
3. 'It was okay, not too bad but not great either.' → neg
