In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize


In [4]:
# Download NLTK dependencies
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to C:\Users\aarya
[nltk_data]     admane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\aarya
[nltk_data]     admane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aarya admane/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\aarya
[nltk_data]     admane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Sample dataset
data = {
    'Text': [
        "Natural Language Processing is a growing field of AI.",
        "Preprocessing includes tokenization, stopwords removal and stemming.",
        "TF-IDF gives weight to important words in text.",
        np.nan,  # Introduce a null value intentionally
        "     "  # Outlier - whitespace only
    ]
}

df = pd.DataFrame(data)


In [6]:

# Remove nulls or empty strings
df['Text'] = df['Text'].replace(r'^\s*$', np.nan, regex=True)
df.dropna(inplace=True)

In [7]:

### 2. Basic Cleaning - Remove whitespace entries ###
df['Text'] = df['Text'].str.strip()
df = df[df['Text'] != '']

In [8]:

### 3. Tokenization ###
df['Tokens'] = df['Text'].apply(word_tokenize)

In [9]:

### 4. Stop Word Removal ###
stop_words = set(stopwords.words('english'))
df['Tokens_NoStop'] = df['Tokens'].apply(lambda x: [word.lower() for word in x if word.lower() not in stop_words and word.isalpha()])

In [10]:

### 5. POS Tagging ###
df['POS'] = df['Tokens_NoStop'].apply(pos_tag)

In [11]:

### 6. Stemming and Lemmatization ###
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [12]:

df['Stemmed'] = df['Tokens_NoStop'].apply(lambda x: [stemmer.stem(word) for word in x])
df['Lemmatized'] = df['Tokens_NoStop'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [13]:
### 7. TF-IDF + Normalization ###
# Rejoin lemmatized tokens for vectorizer
df['Clean_Text'] = df['Lemmatized'].apply(lambda x: ' '.join(x))


In [14]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Clean_Text'])

In [15]:
# Normalizing the TF-IDF matrix
normalized_tfidf = normalize(tfidf_matrix, norm='l2')

In [16]:

# Convert to DataFrame
tfidf_df = pd.DataFrame(normalized_tfidf.toarray(), columns=tfidf.get_feature_names_out())

In [17]:

### 🧾 Final Output ###
print("\n Preprocessed Text DataFrame:")
print(df[['Text', 'Tokens_NoStop', 'Lemmatized']])



 Preprocessed Text DataFrame:
                                                Text  \
0  Natural Language Processing is a growing field...   
1  Preprocessing includes tokenization, stopwords...   
2    TF-IDF gives weight to important words in text.   

                                       Tokens_NoStop  \
0  [natural, language, processing, growing, field...   
1  [preprocessing, includes, tokenization, stopwo...   
2            [gives, weight, important, words, text]   

                                          Lemmatized  
0  [natural, language, processing, growing, field...  
1  [preprocessing, includes, tokenization, stopwo...  
2              [give, weight, important, word, text]  


In [18]:

print("\n Normalized TF-IDF Matrix:")
print(tfidf_df.round(2))



 Normalized TF-IDF Matrix:
     ai  field  give  growing  important  includes  language  natural  \
0  0.41   0.41  0.00     0.41       0.00      0.00      0.41     0.41   
1  0.00   0.00  0.00     0.00       0.00      0.41      0.00     0.00   
2  0.00   0.00  0.45     0.00       0.45      0.00      0.00     0.00   

   preprocessing  processing  removal  stemming  stopwords  text  \
0           0.00        0.41     0.00      0.00       0.00  0.00   
1           0.41        0.00     0.41      0.41       0.41  0.00   
2           0.00        0.00     0.00      0.00       0.00  0.45   

   tokenization  weight  word  
0          0.00    0.00  0.00  
1          0.41    0.00  0.00  
2          0.00    0.45  0.45  


In [19]:
### 🧾 Final Output ###
print("\n Preprocessed Text DataFrame:")
print(df[['Text', 'Tokens_NoStop', 'Lemmatized', 'POS']])

print("\n Normalized TF-IDF Matrix:")
print(tfidf_df.round(2))



 Preprocessed Text DataFrame:
                                                Text  \
0  Natural Language Processing is a growing field...   
1  Preprocessing includes tokenization, stopwords...   
2    TF-IDF gives weight to important words in text.   

                                       Tokens_NoStop  \
0  [natural, language, processing, growing, field...   
1  [preprocessing, includes, tokenization, stopwo...   
2            [gives, weight, important, words, text]   

                                          Lemmatized  \
0  [natural, language, processing, growing, field...   
1  [preprocessing, includes, tokenization, stopwo...   
2              [give, weight, important, word, text]   

                                                 POS  
0  [(natural, JJ), (language, NN), (processing, V...  
1  [(preprocessing, VBG), (includes, VBZ), (token...  
2  [(gives, VBZ), (weight, RBS), (important, JJ),...  

 Normalized TF-IDF Matrix:
     ai  field  give  growing  important  incl