In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl


In [1]:
import spacy 
import pandas as pd
import re

In [3]:
nlp = spacy.load("en_core_web_lg")
df = pd.read_csv(r"/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [2]:
def preprocessing(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.strip().lower()  # Normalize whitespace and lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text =re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    doc = nlp(text)
    preprocessed_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        preprocessed_tokens.append(token.lemma_)
    return " ".join(preprocessed_tokens)


In [5]:
from tqdm import tqdm
tqdm.pandas()
df['preprocessed_review'] = df['review'].progress_apply(lambda text: preprocessing(text))
df.head(10)

100%|██████████| 50000/50000 [25:32<00:00, 32.63it/s]


Unnamed: 0,review,sentiment,preprocessed_review
0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch oz episode ll hook righ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically s family little boy jake think s zom...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love time money visually stunn...
5,"Probably my all-time favorite movie, a story o...",positive,probably time favorite movie story selflessnes...
6,I sure would like to see a resurrection of a u...,positive,sure like resurrection date seahunt series tec...
7,"This show was an amazing, fresh & innovative i...",negative,amazing fresh innovative idea s air year brill...
8,Encouraged by the positive comments about this...,negative,encourage positive comment film look forward w...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrench laughter like movie y...


In [6]:
df['sentiment'] = df['sentiment'].progress_apply(lambda x: 1 if x=="positive" else 0)
df.head(10)

100%|██████████| 50000/50000 [00:00<00:00, 1227395.21it/s]


Unnamed: 0,review,sentiment,preprocessed_review
0,One of the other reviewers has mentioned that ...,1,reviewer mention watch oz episode ll hook righ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,0,basically s family little boy jake think s zom...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei s love time money visually stunn...
5,"Probably my all-time favorite movie, a story o...",1,probably time favorite movie story selflessnes...
6,I sure would like to see a resurrection of a u...,1,sure like resurrection date seahunt series tec...
7,"This show was an amazing, fresh & innovative i...",0,amazing fresh innovative idea s air year brill...
8,Encouraged by the positive comments about this...,0,encourage positive comment film look forward w...
9,If you like original gut wrenching laughter yo...,1,like original gut wrench laughter like movie y...


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
clf=Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('gbc', GradientBoostingClassifier())
])
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_review'],
    df['sentiment'], 
    test_size=0.2, 
    random_state=42)

In [9]:
clf.fit(X_train,y_train)

In [10]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.75      0.80      4961
           1       0.78      0.86      0.82      5039

    accuracy                           0.81     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.81      0.81      0.81     10000



In [13]:
from joblib import dump, load
dump(clf, r'/kaggle/working/sentiment_model.joblib')

['/kaggle/working/sentiment_model.joblib']

In [3]:
import streamlit as st
st.title("Sentiment Analysis of Movie Reviews")
st.write("This is a simple web app to analyze the sentiment of movie reviews.")

user_input = st.text_area("Enter a movie review:")
if st.button("Analyze"):
    # Load the model
    clf = load('sentiment_model.joblib')
    # Preprocess the input
    processed_input = preprocessing(user_input)
    # Make a prediction
    prediction = clf.predict([processed_input])
    st.write("Sentiment:", "Positive" if prediction == 1 else "Negative")

2025-10-29 23:49:08.067 
  command:

    streamlit run C:\Users\gidge\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-10-29 23:49:08.073 Session state does not function when running a script without `streamlit run`
