In [4]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from google.colab import files
import io

# Name the uploader is expected to report for the dataset.
EXPECTED_CSV = 'IMDB Dataset.csv'
# How many leading reviews to run through the preprocessing demo.
NUM_SAMPLE_REVIEWS = 5

# Prompt for a file in the Colab UI; the result is indexed by file name and
# holds the uploaded bytes.
uploaded = files.upload()

# Colab reports the name it saved the file under ("Saving X to Y"), which is
# not guaranteed to equal EXPECTED_CSV (e.g. on a repeated upload), so fall
# back to whatever was uploaded instead of failing on the hard-coded key.
if EXPECTED_CSV in uploaded:
    csv_bytes = uploaded[EXPECTED_CSV]
elif uploaded:
    csv_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(f"No file was uploaded; expected {EXPECTED_CSV!r}.")

df = pd.read_csv(io.BytesIO(csv_bytes))

# Fetch the NLTK resources used by the stop-word list and the lemmatizer.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")


print("Dataset sample:\n", df.head())

# Only the first few reviews are preprocessed and printed below.
reviews = df["review"].head(NUM_SAMPLE_REVIEWS).tolist()

# Shared helpers read by preprocess_text; the stop words are kept in a set so
# the per-token membership test is cheap.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Clean one review and return its filtered, stemmed and lemmatized tokens.

    The text is lowercased, HTML tags are replaced by a space, every remaining
    character that is neither a letter nor whitespace is replaced by a space,
    and the result is split on whitespace. Tokens found in the module-level
    ``stop_words`` set are dropped; the survivors are then stemmed and
    lemmatized independently of each other (lemmatization is NOT applied on
    top of the stems).

    Args:
        text: Raw review string; may contain HTML markup such as ``<br />``.

    Returns:
        A dict with four keys:
            "original": the lowercased, cleaned text. Despite the key name
                this is not the raw input; the name is kept so existing
                callers keep working.
            "no_stopwords": list of tokens left after stop-word removal.
            "stemmed": ``stemmer.stem`` applied to each of those tokens.
            "lemmatized": ``lemmatizer.lemmatize`` applied to each of those
                tokens.
    """
    text = text.lower()

    # Remove markup before touching punctuation and leave a space behind:
    # otherwise "me.<br /><br />the" collapses into the bogus tokens "mebr"
    # and "br". Requiring a letter right after "<" keeps comparisons such as
    # "a < b" from being mistaken for a tag.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)

    # Replace digits/punctuation with a space instead of deleting them, so
    # words separated only by punctuation ("drugs,sex") stay separate and
    # contractions split ("you'll" -> "you ll") rather than fuse into "youll".
    # NOTE(review): the split tails ("ll", "t", "s", ...) are presumably in
    # the NLTK English stop-word list -- confirm against the loaded corpus.
    text = re.sub(r"[^a-z\s]", " ", text)

    tokens = [word for word in text.split() if word not in stop_words]

    stemmed = [stemmer.stem(word) for word in tokens]
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

    return {
        "original": text,
        "no_stopwords": tokens,
        "stemmed": stemmed,
        "lemmatized": lemmatized,
    }

# Run every sampled review through the preprocessing pipeline.
processed_reviews = list(map(preprocess_text, reviews))

# Show each raw review next to its token lists, numbered from 1.
for number, (raw_review, result) in enumerate(
    zip(reviews, processed_reviews), start=1
):
    print(f"\nReview {number}")
    print("Raw:", raw_review)
    print("Cleaned (no stopwords):", result["no_stopwords"])
    print("Stemmed:", result["stemmed"])
    print("Lemmatized:", result["lemmatized"])


Saving IMDB Dataset.csv to IMDB Dataset.csv


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Dataset sample:
                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Review 1
Raw: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuse