In [1]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Fetch the NLTK resources needed for stop-word removal and tokenization.
nltk.download("stopwords")
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer from 'punkt_tab';
# without it word_tokenize raises LookupError even when 'punkt' is installed.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Srikanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Srikanth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Read the review dataset from a CSV file in the working directory and preview it.
DATASET_PATH = 'IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
#Lemmatization
# Downloads the WordNet corpus through NLTK; the call's return value is the cell output.
# NOTE(review): no lemmatizer is applied in the cells visible here - confirm this
# download is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Srikanth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
#Preprocessing Function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop-word list as a frozenset, loading it from NLTK only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize a raw review into a space-separated string of content words.

    Steps: lowercase, strip HTML tags, drop every character that is not a
    letter or whitespace, tokenize with NLTK, and remove English stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = text.lower()                 #Converting the review in lowercase
    # Replace tags with a space (not ''), otherwise the words on either side of
    # a tag such as "<br />" are glued together into a single bogus token.
    text = re.sub(r'<.*?>', ' ', text)  #Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  #Remove special Characters
    tokens = word_tokenize(text)
    # Look the stop words up in a cached set instead of rebuilding the list for every token.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Pass each raw review through preprocess_text and store the result in a new column.
cleaned_reviews = df['review'].apply(preprocess_text)
df['cleaned_review'] = cleaned_reviews

# Show the first review next to its cleaned version as a sanity check.
first_pair = df[['review', 'cleaned_review']].head(1).values[0]
print(first_pair)

In [None]:
# Binary bag-of-words features: each column records whether a vocabulary word
# occurs in the review (binary=True), not a TF-IDF weight.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['cleaned_review'])
# Encode the label as 1 for 'positive' and 0 for anything else.
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
print("Bag-of-Words Matrix Shape:", X.shape)
# Index row and columns together: X[0][:10] slices the *rows* of a one-row
# matrix and therefore returns the whole row instead of its first 10 columns.
print("Sample Bag-of-Words Row: ", X[0, :10])

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Report the split sizes rather than dumping the sparse test matrix.
print("Training Set Size:", X_train.shape)
print("Testing Set Size:", X_test.shape)

In [None]:
# Raise max_iter above the default of 100 so the solver has room to converge on
# this high-dimensional sparse input instead of stopping early with a ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("Model trained successfully!!")

In [None]:
# Predict on the held-out split and report accuracy scaled to a percentage.
y_pred = model.predict(X_test)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy Score: ", accuracy_pct)

In [None]:
def predict_sentiment(review_text):
    """Classify a single raw review with the fitted vectorizer and model.

    The text is cleaned with preprocess_text, turned into a feature row by the
    notebook-level `vectorizer`, and scored by the notebook-level `model`.

    Parameters
    ----------
    review_text : str
        Raw review text.

    Returns
    -------
    str
        "Positive ☺️" when the model predicts 1, otherwise "Negative 😔".
    """
    features = vectorizer.transform([preprocess_text(review_text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive ☺️"
    return "Negative 😔"



In [None]:
# Demo: run two hand-written reviews through predict_sentiment.
sample_reviews = [
    "The movie was fantastic!! The performances were top-notch",   #Example 1
    "That was a terrible movie. Boring and repeating plot",        #Example 2
]

for position, input_review in enumerate(sample_reviews):
    # Every example after the first is separated from the previous one by a blank line.
    header = "Review: " if position == 0 else "\nReview: "
    print(header, input_review)
    print("Predicted Sentiment: ", predict_sentiment(input_review))

In [None]:
import joblib

# joblib.dump requires both the object and a target filename; calling it with no
# arguments raises TypeError. Persist the classifier together with the fitted
# vectorizer, since new text must be transformed with the same vocabulary before prediction.
joblib.dump(model, "sentiment_model.joblib")
joblib.dump(vectorizer, "count_vectorizer.joblib")