In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import zipfile

In [2]:
with zipfile.ZipFile('/content/IMDB Dataset.csv.zip') as zip_ref:
    zip_ref.extractall()

train_data = pd.read_csv('/content/IMDB Dataset.csv.zip')
test_data = pd.read_csv('/content/IMDB Dataset.csv.zip')

In [3]:
df = pd.concat([train_data, test_data])

In [4]:
print(df.head(20))

                                               review sentiment
0   One of the other reviewers has mentioned that ...  positive
1   A wonderful little production. <br /><br />The...  positive
2   I thought this was a wonderful way to spend ti...  positive
3   Basically there's a family where a little boy ...  negative
4   Petter Mattei's "Love in the Time of Money" is...  positive
5   Probably my all-time favorite movie, a story o...  positive
6   I sure would like to see a resurrection of a u...  positive
7   This show was an amazing, fresh & innovative i...  negative
8   Encouraged by the positive comments about this...  negative
9   If you like original gut wrenching laughter yo...  positive
10  Phil the Alien is one of those quirky films wh...  negative
11  I saw this movie when I was about 12 when it c...  negative
12  So im not a big fan of Boll's work but then ag...  negative
13  The cast played Shakespeare.<br /><br />Shakes...  negative
14  This a fantastic movie of three pris

In [6]:
df.tail(20)

Unnamed: 0,review,sentiment
49980,A stunning film of high quality.<br /><br />Ap...,positive
49981,"And I repeat, please do not see this movie! Th...",negative
49982,"To be hones, I used to like this show and watc...",negative
49983,"I loved it, having been a fan of the original ...",positive
49984,Hello it is I Derrick Cannon and I welcome you...,negative
49985,Imaginary Heroes is clearly the best film of t...,positive
49986,This movie is a disgrace to the Major League F...,negative
49987,A remake of Alejandro Amenabar's Abre los Ojos...,negative
49988,"When I first tuned in on this morning news, I ...",negative
49989,I got this one a few weeks ago and love it! It...,positive


In [7]:
df["sentiment"].value_counts()

sentiment
positive    50000
negative    50000
Name: count, dtype: int64

In [8]:
df.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
df["sentiment"].value_counts()

sentiment
1    50000
0    50000
Name: count, dtype: int64

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

df['review'] = df['review'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [12]:
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
vectorizer = CountVectorizer(max_features=5000)
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [14]:
print(X_train)

25220    saw `` fever pitch '' sort accident ; playing ...
48955    good thing persepolis shadow created german ex...
44966    classic author c.s . lewis wrote essay stating...
13568    year madonna tried prove , public eye , act . ...
42727    watched movie last night bit disappointed . lo...
                               ...                        
6265     earth five u keep repeating one ? title actual...
4886     asterix viking first animated asterix movie 12...
26820    happens someone much social anxiety cease func...
860      production quite surprise . absolutely love ob...
15795    decent movie . although little bit short time ...
Name: review, Length: 80000, dtype: object


In [15]:
print(X_test)

25721    united state still fighting world war ii ( mov...
30184    gregory peck give brilliant performance film ....
19864    watched last night seen several year . really ...
26699    caught recently noticing james earl jones name...
42991    ensemble piece adult return formulative summer...
                               ...                        
32595    give film 's prop well made reasonably well ac...
29313    mean really , really , really high movie shot ...
37862    picture bride excellent look hawaii 's past pe...
3421     ben ( fine charles bateman ) , young daughter ...
42410    late eighty early ninety decline death indepen...
Name: review, Length: 20000, dtype: object


In [16]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

In [17]:
print(Y_train)

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


In [None]:
classifier = LogisticRegression()
classifier.fit(X_train_count, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred = classifier.predict(X_test_count)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9086
Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.90      0.91      9960
    positive       0.91      0.91      0.91     10040

    accuracy                           0.91     20000
   macro avg       0.91      0.91      0.91     20000
weighted avg       0.91      0.91      0.91     20000

Confusion Matrix:
[[9005  955]
 [ 873 9167]]


In [None]:
new_review = "I loved this movie! The acting was superb and the storyline was engaging."
new_review = preprocess_text(new_review)
new_review_count = vectorizer.transform([new_review])
print("Predicted Sentiment:", classifier.predict(new_review_count)[0])

Predicted Sentiment: positive
