In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the IMDB review dataset from the Colab working directory into a DataFrame.
imdbreviews = pd.read_csv(filepath_or_buffer="/content/IMDB Dataset.csv")

In [None]:
# Print the frame's column names, non-null counts, dtypes and memory footprint
# to confirm both columns loaded without missing values.
imdbreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Class balance check: count how many reviews carry each sentiment label.
imdbreviews.loc[:, 'sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Dependent Variable(y) is sentiment. It is non numeric and binary.
# Classification models to be built. Binary Logistic regression is classification
# algorithm widely used

# Binary Logistic regression explains relationship between a binary dependent
# variable(y) and multiple independent variables(X's).
# Binary Logistic Regression outcome is an equation
# With z = B0 + B1*X1 + B2*X2 + B3*X3 + ... + Bn*Xn :
#
#            e^z                                   1
#   p = -------------     or     Sigmoid = ----------------
#          1 + e^z                            1 + e^(-z)
# p is probability.  e is exponential or 2.718 . B0 is intercept
# B1,B2, B3,...Bn - Coefficients. X1,X2,X3,...Xn - Independent Variables

# Classification based on predicted probability
# probability greater than or equal to 0.50 ---- 1 or Yes or True
# probability less than 0.50 ------ 0 or No or False

# Assumptions are:
# a) dependent variable(y) must be binary
# b) Independent Variables(X) can be both numerical and non numerical
# c) There must be logical relationship between dependent variable and independent
# variables
# d) No Multicollinearity
# e) Sample size required is minimum 50 observations per variable

# Interpretation of output based on Confusion matrix
# Confusion Matrix is a 2 X 2 matrix or cross tabulation of actual and predicted
# class
# Primary metric is Accuracy of model which must be between 0.70 - 0.95
#                         True Positive + True Negative
# Accuracy = ------------------------------------------------------------------
#             True Positive + False Negative + False Positive + True Negative

# Other metrics that are calculated from confusion matrix are:-
# Sensitivity or Recall = TP/(TP+FN). Fraction of actual positives predicted positive
# Specificity = TN/(TN+FP). Fraction of actual negatives predicted negative
# Precision = TP/(TP+FP). Fraction of predicted positives that are actually positive
# F1 Score = 2 X ((Precision X Recall)/(Precision + Recall))

In [None]:
# Peek at the first three raw reviews; this free text has to be turned into a
# numeric matrix before a model can use it.
imdbreviews['review'].iloc[:3]

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...


In [None]:
# Normalise case so that e.g. "Good" and "good" are treated as the same word.
# This overwrites the 'review' column in place; the original casing is not kept.
imdbreviews['review']=imdbreviews['review'].str.lower()

In [None]:
# Strip HTML line-break tags and punctuation from the review text.
#
# The previous pattern r'a-zA-Z0-9\w\s.' was missing the [^...] character-class
# brackets and was not passed regex=True, so it was matched as a literal
# character sequence that essentially never occurs and the step did nothing.
#
# The break tags are removed first: stripping '<', '/' and '>' on their own
# would leave a stray "br" token behind in every review that contains a tag.
# Both substitutions insert a space so the words on either side are not fused.
imdbreviews['review'] = (
    imdbreviews['review']
    .str.replace(r'<br\s*/?>', ' ', regex=True)
    .str.replace(r'[^\w\s]', ' ', regex=True)
)

In [None]:
# Remove the literal HTML line-break tags embedded in the reviews.
# Replace each tag with a space rather than an empty string: a tag that sits
# directly between two words (e.g. "great<br />acting") would otherwise fuse
# them into a single token. regex=False makes the literal match explicit
# regardless of the pandas version's default.
imdbreviews['review'] = imdbreviews['review'].str.replace('<br />', ' ', regex=False)

In [None]:
# Drop every run of digits from the review text.
imdbreviews['review'] = imdbreviews['review'].str.replace(
    pat=r'\d+',
    repl='',
    regex=True,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectoriser configured for two-word phrases only, with English stop
# words removed and the vocabulary capped at 300 features.
tfidf = TfidfVectorizer(
    ngram_range=(2, 2),
    stop_words="english",
    max_features=300,
)

In [None]:
# Learn the bigram vocabulary from the cleaned reviews and encode every review
# as a TF-IDF row in one pass.
X_tfidf = tfidf.fit_transform(raw_documents=imdbreviews['review'])

In [None]:
# Densify the TF-IDF matrix into a DataFrame whose columns are the bigram names.
# Note: this rebinds X_tfidf, so the cell cannot be re-run on its own (a
# DataFrame has no .toarray()); re-run the fit_transform cell first.
X_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# Show the first two encoded reviews to sanity-check the feature columns.
X_tfidf.iloc[:2]

Unnamed: 0,acting bad,acting good,action movie,action scenes,action sequences,bad acting,bad bad,bad film,bad good,bad guy,...,worth seeing,worth watching,writer director,year old,years ago,years later,years old,young girl,young man,young woman
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.preprocessing import LabelEncoder

The LabelEncoder is used to convert categorical text labels into numerical format. In this notebook, the 'sentiment' column contains string values ('positive' and 'negative'). Machine learning models, such as the Logistic Regression model being built here, generally require numerical input. LabelEncoder assigns integers to the classes in sorted (alphabetical) order, so here 'negative' becomes 0 and 'positive' becomes 1, which the model can then process.

In [None]:
# Encode the string sentiment labels as integers to use as the target vector.
y = LabelEncoder().fit_transform(y=imdbreviews['sentiment'])

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a default-parameter logistic regression on the full TF-IDF feature frame.
logistic_model = LogisticRegression().fit(X=X_tfidf, y=y)

In [None]:
# Mean accuracy on the same rows the model was fitted on, i.e. training
# accuracy; no held-out data is involved in this figure.
logistic_model.score(X=X_tfidf, y=y)

0.6961

In [None]:
# Hard class predictions (0/1) for every review the model was trained on.
logistic_predict = logistic_model.predict(X=X_tfidf)

In [None]:
# Confusion matrix: rows are the actual labels, columns the predicted labels.
# Label coding: 0 = Negative, 1 = Positive.
#   True Negative  = 15508   False Positive = 9492
#   False Negative = 5703    True Positive  = 19297
pd.crosstab(index=y, columns=logistic_predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,15508,9492
1,5703,19297


In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 for the in-sample predictions.
print(classification_report(y_true=y, y_pred=logistic_predict))

              precision    recall  f1-score   support

           0       0.73      0.62      0.67     25000
           1       0.67      0.77      0.72     25000

    accuracy                           0.70     50000
   macro avg       0.70      0.70      0.69     50000
weighted avg       0.70      0.70      0.69     50000



In [None]:
!pip install streamlit
import streamlit as st
import numpy as np
import joblib

# --------------------------------------------------
# Load your trained model and vectorizer
# (Adjust these paths/names to match your file)
# --------------------------------------------------

# Streamlit UI
st.title("üé¨ IMDB Review Sentiment Analyzer")
st.write("Enter a movie review below and the model will predict its sentiment.")

# Text input box
user_review = st.text_area("Write a review:", height=200)

# Predict button
if st.button("Analyze Sentiment"):
    if user_review.strip() == "":
        st.warning("Please enter a review.")
    else:
        # Transform the text
        X_new = tfidf.transform([user_review])

        # Predict sentiment
        prediction = logistic_model.predict(X_new)[0]

        # Predict probability
        prob = logistic_model.predict_proba(X_new)[0]
        pos_prob = prob[1]
        neg_prob = prob[0]

        # Display results
        st.subheader("Sentiment Result")
        sentiment_text = "üëç Positive" if prediction == 1 else "üëé Negative"
        st.write(f"**Prediction:** {sentiment_text}")

        st.subheader("Probability Scores")
        st.write(f"**Positive Probability:** {pos_prob:.4f}")
        st.write(f"**Negative Probability:** {neg_prob:.4f}")


Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.52.1


2025-12-07 06:03:21.687 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-12-07 06:03:21.702 Session state does not function when running a script without `streamlit run`
