In [1]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import random

In [2]:
# Fetch the movie_reviews corpus through NLTK's downloader; the call's return
# value is the cell output.
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [5]:
# One (token_list, label) pair per corpus file, grouped label by label in the
# order the corpus reader reports its categories and file ids.
reviews = [
    (list(movie_reviews.words(file_id)), label)
    for label in movie_reviews.categories()
    for file_id in movie_reviews.fileids(label)
]

In [7]:
# Shuffle with a dedicated, seeded generator so the review order -- and with it
# every downstream split and metric -- is reproducible on Restart & Run All.
# An unseeded random.shuffle gave a different order on every kernel start.
# NOTE: the shuffle is in place; re-run the cell that builds `reviews` before
# re-running this one, otherwise the list is shuffled a second time.
random.Random(42).shuffle(reviews)

In [8]:
# Flatten each token list back into a single space-separated string (the input
# format the vectorizer expects) and keep the labels in the same order.
texts = [" ".join(tokens) for tokens, _ in reviews]
labels = [label for _, label in reviews]

In [9]:
vectorizer = CountVectorizer(max_features=2000)
features = vectorizer.fit_transform(texts)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [11]:
# Multinomial naive Bayes with default hyperparameters, trained on the
# bag-of-words counts of the training split.
classifier = MultinomialNB()
classifier.fit(X=X_train, y=y_train)

In [12]:
# Predicted labels for the held-out split; reused by all metric cells below.
y_pred = classifier.predict(X=X_test)

In [13]:
# Fraction of held-out reviews whose predicted label matches the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 79.80%


In [14]:
# Per-class precision / recall / F1 and support on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         neg       0.80      0.81      0.80       254
         pos       0.80      0.79      0.79       246

    accuracy                           0.80       500
   macro avg       0.80      0.80      0.80       500
weighted avg       0.80      0.80      0.80       500



In [15]:
# Raw error counts: rows are the true labels and columns the predicted labels,
# in sorted label order (neg, pos) -- the row sums match the supports printed
# by the classification report.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[205  49]
 [ 52 194]]
