**Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

In [1]:
!pip install nltk pandas



In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages used by the rest of the notebook.
nltk.download('punkt_tab')  # tokenizer data (unzipped under tokenizers/) needed by word_tokenize
nltk.download('stopwords')  # stopword corpus read later via stopwords.words("english")
nltk.download('wordnet')  # WordNet corpus — presumably what WordNetLemmatizer consults; this last call's return value is the cell output


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Load the full CSV and drop every row that has a missing value in any column.
# Note: no row sampling happens here — all complete rows are kept.
df = pd.read_csv("coffee dataset.csv").dropna()
df.head()


Unnamed: 0,name,rating,desc_1
0,“Sweety” Espresso Blend,95,"Evaluated as espresso. Sweet-toned, deeply ric..."
1,Flora Blend Espresso,94,"Evaluated as espresso. Sweetly tart, floral-to..."
2,Ethiopia Shakiso Mormora,92,"Crisply sweet, cocoa-toned. Lemon blossom, roa..."
3,Ethiopia Suke Quto,92,"Delicate, sweetly spice-toned. Pink peppercorn..."
4,Ethiopia Gedeb Halo Beriti,94,"Deeply sweet, subtly pungent. Honey, pear, tan..."


In [5]:
# Reload only the two columns used downstream. The python parsing engine with
# on_bad_lines='skip' drops malformed CSV lines instead of raising on them.
# dropna() then removes rows missing either a rating or a description;
# no row sampling is performed, so every complete row is kept.
df = pd.read_csv(
    "coffee dataset.csv",
    usecols=['rating', 'desc_1'],
    engine='python',
    on_bad_lines='skip',
).dropna()
df.head()

Unnamed: 0,rating,desc_1
0,95,"Evaluated as espresso. Sweet-toned, deeply ric..."
1,94,"Evaluated as espresso. Sweetly tart, floral-to..."
2,92,"Crisply sweet, cocoa-toned. Lemon blossom, roa..."
3,92,"Delicate, sweetly spice-toned. Pink peppercorn..."
4,94,"Deeply sweet, subtly pungent. Honey, pear, tan..."


Data Preprocessing

In [6]:
# Shared, reusable resources: build them once instead of on every preprocess() call.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Run one document through every preprocessing stage and return all of them.

    Parameters
    ----------
    text : str
        Raw document text; it is lowercased before tokenization.

    Returns
    -------
    tuple of four lists, in this order:
        tokens      -- output of word_tokenize on the lowercased text
                       (punctuation and stopwords still included)
        filtered    -- tokens that are purely alphabetic and not English stopwords
        stemmed     -- Porter stem of each filtered token
        lemmatized  -- WordNet lemma of each filtered token

    Notes
    -----
    * Stemming and lemmatization are both applied to ``filtered`` (in parallel,
      not chained) so the two normalizations can be compared side by side.
    * The ``isalpha`` check discards any token containing a non-letter, which
      includes hyphenated tokens such as "sweet-toned" as well as punctuation.
    * ``lemmatize`` is called without a part-of-speech argument, so NLTK's
      default part of speech applies to every word.
    """
    # Stage 1: tokenization
    tokens = word_tokenize(text.lower())

    # Stage 2: drop stopwords and anything that is not purely alphabetic
    filtered = []
    for token in tokens:
        if token in stop_words or not token.isalpha():
            continue
        filtered.append(token)

    # Stage 3: stemming
    stemmed = list(map(stemmer.stem, filtered))

    # Stage 4: lemmatization
    lemmatized = list(map(lemmatizer.lemmatize, filtered))

    return tokens, filtered, stemmed, lemmatized


In [9]:
# Run the pipeline once per description (instead of once per output column)
# and fan the four returned stages out into their own columns.
# nltk is already imported and punkt_tab already downloaded in the setup cell,
# so neither is repeated here.
STAGE_COLUMNS = ["tokens", "no_stopwords", "stemmed", "lemmatized"]

# One (tokens, filtered, stemmed, lemmatized) tuple per row, aligned on df's index.
processed = df["desc_1"].apply(preprocess)
for position, column in enumerate(STAGE_COLUMNS):
    # Bind the position as a default argument so each lambda picks its own stage.
    df[column] = processed.apply(lambda stages, i=position: stages[i])

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,rating,desc_1,tokens,no_stopwords,stemmed,lemmatized
0,95,"Evaluated as espresso. Sweet-toned, deeply ric...","[evaluated, as, espresso, ., sweet-toned, ,, d...","[evaluated, espresso, deeply, rich, chocolaty,...","[evalu, espresso, deepli, rich, chocolati, van...","[evaluated, espresso, deeply, rich, chocolaty,..."
1,94,"Evaluated as espresso. Sweetly tart, floral-to...","[evaluated, as, espresso, ., sweetly, tart, ,,...","[evaluated, espresso, sweetly, tart, honeysuck...","[evalu, espresso, sweetli, tart, honeysuckl, o...","[evaluated, espresso, sweetly, tart, honeysuck..."
2,92,"Crisply sweet, cocoa-toned. Lemon blossom, roa...","[crisply, sweet, ,, cocoa-toned, ., lemon, blo...","[crisply, sweet, lemon, blossom, roasted, caca...","[crispli, sweet, lemon, blossom, roast, cacao,...","[crisply, sweet, lemon, blossom, roasted, caca..."
3,92,"Delicate, sweetly spice-toned. Pink peppercorn...","[delicate, ,, sweetly, spice-toned, ., pink, p...","[delicate, sweetly, pink, peppercorn, date, my...","[delic, sweetli, pink, peppercorn, date, myrrh...","[delicate, sweetly, pink, peppercorn, date, my..."
4,94,"Deeply sweet, subtly pungent. Honey, pear, tan...","[deeply, sweet, ,, subtly, pungent, ., honey, ...","[deeply, sweet, subtly, pungent, honey, pear, ...","[deepli, sweet, subtli, pungent, honey, pear, ...","[deeply, sweet, subtly, pungent, honey, pear, ..."


Comparing Representation Quality

In [10]:
def get_vocab_size(list_of_docs):
    """Return the number of distinct tokens across a collection of documents.

    Parameters
    ----------
    list_of_docs : iterable of iterables
        Each element is one document, given as an iterable of hashable tokens.

    Returns
    -------
    int
        Size of the union of all documents' tokens (0 when there are no documents).
    """
    distinct_tokens = {token for doc in list_of_docs for token in doc}
    return len(distinct_tokens)

# Table label -> DataFrame column holding that preprocessing stage.
# The column order matches the order of the tuple returned by preprocess().
stage_labels = {
    "Original Tokens": "tokens",
    "After Stopword Removal": "no_stopwords",
    "After Stemming": "stemmed",
    "After Lemmatization": "lemmatized",
}

# Safety net for lost kernel state: if any stage column is missing, rebuild all
# of them with a single pass of preprocess() per row. When the columns already
# exist, nothing is recomputed.
if not set(stage_labels.values()).issubset(df.columns):
    processed = df["desc_1"].apply(preprocess)
    for position, column in enumerate(stage_labels.values()):
        df[column] = processed.apply(lambda stages, i=position: stages[i])

# Vocabulary size (number of distinct tokens) after each stage.
results = {
    label: get_vocab_size(df[column])
    for label, column in stage_labels.items()
}

pd.DataFrame(results, index=["Vocabulary Size"])

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,1709,1328,1085,1274
