# **Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

#Importing Libraries



In [2]:
!pip install nltk pandas



In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on. The recorded output
# shows each call either downloading the package or reporting it as already
# up-to-date, so re-running the cell is safe.
# punkt_tab: tokenizer data (presumably what word_tokenize loads — confirm for your NLTK version).
nltk.download('punkt_tab')
# stopwords: the corpus read by stopwords.words("english").
nltk.download('stopwords')
# wordnet: the corpus behind WordNetLemmatizer. This is the last expression,
# so its return value is what the cell displays.
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

#Loading the Dataset

In [5]:
# Read only the review text and its verification flag, drop rows with missing
# values, then keep the leading rows so the later steps stay fast.
csv_path = "/content/airlines_reviews.csv"
wanted_columns = ['Reviews', 'Verified']
max_rows = 3000   # take 3000 rows for speed

df = pd.read_csv(csv_path, usecols=wanted_columns).dropna().head(max_rows)
df.head()


Unnamed: 0,Verified,Reviews
0,True,Flight was amazing. The crew onboard this fl...
1,True,Booking an emergency exit seat still meant h...
2,True,Excellent performance on all fronts. I would...
3,True,Pretty comfortable flight considering I was f...
4,True,The service was consistently good from start ...


#Data Preprocessing

In [7]:
# Shared objects built once and reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Run one piece of text through the four preprocessing stages.

    Args:
        text: The raw string to process (must support ``.lower()``).

    Returns:
        A 4-tuple of lists, in this order:
        ``tokens``     - every token of the lowercased text;
        ``filtered``   - the purely alphabetic tokens that are not stopwords;
        ``stemmed``    - the Porter stem of each filtered token;
        ``lemmatized`` - the WordNet lemma of each filtered token.

    Stemming and lemmatization are both applied to the filtered tokens,
    independently of each other (the lemmas are not derived from the stems).
    """
    # Stage 1: tokenize the lowercased text.
    lowered = text.lower()
    tokens = word_tokenize(lowered)

    # Stage 2: discard non-alphabetic tokens (punctuation, numbers) and stopwords.
    filtered = []
    for word in tokens:
        if not word.isalpha() or word in stop_words:
            continue
        filtered.append(word)

    # Stage 3: reduce each kept word to its Porter stem.
    stemmed = list(map(stemmer.stem, filtered))

    # Stage 4: lemmatize each kept word using the lemmatizer's default settings.
    lemmatized = list(map(lemmatizer.lemmatize, filtered))

    return tokens, filtered, stemmed, lemmatized


In [12]:
# NOTE(review): this import/download repeats the setup cell; the recorded
# output shows it is a no-op once punkt_tab is already present.
import nltk
nltk.download('punkt_tab')

# Run preprocess() once per review and unpack its four stage outputs,
# instead of re-running the whole pipeline separately for every column.
stage_columns = ["tokens", "no_stopwords", "stemmed", "lemmatized"]
processed = df["Reviews"].apply(preprocess)   # Series of 4-tuples
for position, column in enumerate(stage_columns):
    # Bind `position` as a default argument so each lambda keeps its own index.
    df[column] = processed.map(lambda stages, position=position: stages[position])

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Verified,Reviews,tokens,no_stopwords,stemmed,lemmatized
0,True,Flight was amazing. The crew onboard this fl...,"[flight, was, amazing, ., the, crew, onboard, ...","[flight, amazing, crew, onboard, flight, welco...","[flight, amaz, crew, onboard, flight, welcom, ...","[flight, amazing, crew, onboard, flight, welco..."
1,True,Booking an emergency exit seat still meant h...,"[booking, an, emergency, exit, seat, still, me...","[booking, emergency, exit, seat, still, meant,...","[book, emerg, exit, seat, still, meant, huge, ...","[booking, emergency, exit, seat, still, meant,..."
2,True,Excellent performance on all fronts. I would...,"[excellent, performance, on, all, fronts, ., i...","[excellent, performance, fronts, would, defini...","[excel, perform, front, would, definit, choos,...","[excellent, performance, front, would, definit..."
3,True,Pretty comfortable flight considering I was f...,"[pretty, comfortable, flight, considering, i, ...","[pretty, comfortable, flight, considering, fly...","[pretti, comfort, flight, consid, fli, economi...","[pretty, comfortable, flight, considering, fly..."
4,True,The service was consistently good from start ...,"[the, service, was, consistently, good, from, ...","[service, consistently, good, start, finish, c...","[servic, consist, good, start, finish, cabin, ...","[service, consistently, good, start, finish, c..."


#Comparing Representation Quality

In [13]:
def get_vocab_size(list_of_docs):
    """Count the distinct tokens occurring anywhere in a collection of documents.

    Args:
        list_of_docs: An iterable of documents, each itself an iterable of
            hashable tokens (e.g. a list or Series of token lists).

    Returns:
        The number of unique tokens across all documents (0 if there are none).
    """
    distinct_tokens = {token for doc in list_of_docs for token in doc}
    return len(distinct_tokens)

# Ensure the preprocessing columns are present before calculating vocabulary
# size (covers the case where the earlier cell was not run or state was lost).
# When all four columns already exist nothing is recomputed; otherwise
# preprocess() runs once per review and its four stage outputs are unpacked,
# rather than re-running the whole pipeline separately for every column.
stage_columns = ["tokens", "no_stopwords", "stemmed", "lemmatized"]
if not set(stage_columns).issubset(df.columns):
    processed = df["Reviews"].apply(preprocess)   # Series of 4-tuples
    for position, column in enumerate(stage_columns):
        # Bind `position` as a default argument so each lambda keeps its own index.
        df[column] = processed.map(lambda stages, position=position: stages[position])

# Map each report label to the DataFrame column holding that stage's tokens,
# then measure the vocabulary size of every stage in the same order.
stage_by_label = {
    "Original Tokens": "tokens",
    "After Stopword Removal": "no_stopwords",
    "After Stemming": "stemmed",
    "After Lemmatization": "lemmatized",
}
results = {
    label: get_vocab_size(df[column])
    for label, column in stage_by_label.items()
}

# One-row summary table: a column per stage, a single "Vocabulary Size" row.
pd.DataFrame(results, index=["Vocabulary Size"])

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,13765,10870,7287,9761
