In [174]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [175]:
import pandas as pd

In [176]:
dataframe = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [177]:
dataframe.iloc[6]

Unnamed: 0,6
review,I sure would like to see a resurrection of a u...
sentiment,positive


In [178]:
positive_df = dataframe[dataframe['sentiment'] == 'positive'].sample(50, random_state=42)
negative_df = dataframe[dataframe['sentiment'] == 'negative'].sample(50, random_state=42)

df = pd.concat([positive_df, negative_df]).sample(frac=1, random_state=42).reset_index(drop=True)
df.columns = ['Review', 'Sentiment']
print(df.head())



                                              Review Sentiment
0  I viewed the first two nights before coming to...  negative
1  It's been said several times - not least by me...  negative
2  Without a doubt this is the WORSE comicbook mo...  negative
3  Firstly, I would like to point out that people...  positive
4  a compact crime drama with a good amount of ac...  positive


In [179]:
df.shape

(100, 2)

In [180]:
df.duplicated().sum()

np.int64(0)

In [181]:
x = df['Review']
x

Unnamed: 0,Review
0,I viewed the first two nights before coming to...
1,It's been said several times - not least by me...
2,Without a doubt this is the WORSE comicbook mo...
3,"Firstly, I would like to point out that people..."
4,a compact crime drama with a good amount of ac...
...,...
95,It wasn't until I saw Sidney Pollack in the pi...
96,"For Daniel Auteuil, `Queen Margot' was much be..."
97,I got this movie from eBay mainly because I'm ...
98,Preposterous sequel stretches credibility to a...


In [182]:
df['has_html'] = df['Review'].str.contains(r'<.*?>')
print(df['has_html'].sum())  # Number of reviews with HTML


57


In [183]:
import re
df['Review'] = df['Review'].apply(lambda x: re.sub(r'<.*?>', '', x))


In [184]:
df['has_html'] = df['Review'].str.contains(r'<.*?>')
print(df['has_html'].sum())  # Number of reviews with HTML

0


In [185]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=500)
cv

In [186]:
y = df['Sentiment']

In [187]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

In [188]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y ,random_state=1,test_size=0.2)
x_train.shape

(80,)

In [189]:
x_train_bow = cv.fit_transform(x_train).toarray()
x_train_bow.shape


(80, 500)

In [190]:
x_test_bow = cv.transform(x_test).toarray()

In [191]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [192]:
gnb.fit(x_train_bow,y_train)

In [193]:
y_pred = gnb.predict(x_test_bow)

from sklearn.metrics import accuracy_score , confusion_matrix
accuracy_score(y_test,y_pred)

0.65

In [194]:
x_train_pred = gnb.predict(x_train_bow)
accuracy_score(y_train,x_train_pred)

0.9625

In [195]:
confusion_matrix(y_test,y_pred)

array([[7, 3],
       [4, 6]])

In [196]:
def predict_review_sentiment(model = gnb, vectorizer = cv, review=""):
    review_clean = review.lower()
    review_vector = vectorizer.transform([review_clean]).toarray()
    prediction = model.predict(review_vector)[0]

    return "positive" if prediction == 1 else "negative"

In [205]:
review1 = "I was waiting for this day for the longest time. I was a kid back in 2009 when this movie released. So never got to watch it back then. But now when it rereleased I booked the first day show to a IMAX show and oh boy was I blown away! This is nothing short of a masterpiece! It's beyond belief how a film like this could've been made. Every scene, every shot is perfection. You are transferred to a different world and become so engrossed in the film. Never have I attended a movie where at the end of it people got up from their seats and started clapping! Last time this happened was after Infinity War. That movie too was a damn brilliant one. I'm from India and this is legit rare, where people go crazy, so crazy after any English film. This was one experience that I will never forget. I'm definitely going for it again next weekend cuz just once was not enough.Every human on the planet needs to experience this magnificent work of art!"
predict_review_sentiment(gnb,cv,review1)

'positive'

In [208]:
review2 = "The only thing this movie did for me was make me soooo bored I actually started remembering the plot of FernGully. Which I hadn't seen in like 18 years. I couldn't even remember its name. When I arrived home I furiously searched the internet for this mystery movie that I had forgotten and that Avatar somehow reminded me of.Then I found it and realized that it's exactly the same plot, except that it's a movie for children - which incidentally AVATAR also is, it's just that it cost 500$ billion, or whatever, to make so they had to pretend like it was for adults, but it really isn't. It's a kid's movie, and a 2 hours too long one at that.Frankly, what surprised me most about it was that it gets such good reviews... from almost everyone. I'm going to sit and wonder why for a while."
predict_review_sentiment(gnb,cv,review2)

'negative'

In [207]:
review3 = "I saw this epic last night at the Empire Leicester Sq in London, which is a superb venue in which to view this film. Huge screen, excellent sound and an extraordinary Dolby, 3 dimensional image. The whole effect is mind blowing.This is a 'Must see' movie, innovative, and extraordinary. I think it will be regarded by most cinema goers as another milestone in the history of the art. The level of realism achieved is remarkable, and although the film is relatively long in real time, it retains it's excitement and holds the audience's attention to the end.Performances are good, but this is not the sort of film that dwells on big star value for the actors, although Sigorney Weaver does shine and delivers a very convincing performance, as do the rest of the cast. But as there is so much entertainment and action value on screen the human element does not dominate in the usual way.As Writer/Director, James Cameron deserves high praise for this creation and in my opinion it will break box office records. I thoroughly enjoyed this film."
predict_review_sentiment(gnb,cv,review3)

'positive'