# Cleaning and Preprocessing Text Data in Pandas for NLP Tasks

### Load the Data into a Pandas DataFrame


In [1]:
import pandas as pd

# Tiny demo corpus: three short sentences plus one None entry, which
# pandas treats as a missing value.
data = {
    'text': [
        "I love cooking!",
        "Baking is fun",
        None,
        "Japanese cuisine is great!",
    ],
}
df = pd.DataFrame(data)
print(df)

                         text
0             I love cooking!
1               Baking is fun
2                        None
3  Japanese cuisine is great!


### Handle Missing Values


In [2]:
# Remove, in place, every row whose 'text' value is missing.
# Surviving rows keep their original index labels, so the index is no
# longer contiguous after this step (the printed frame shows 0, 1, 3).
df.dropna(axis='index', how='any', subset=['text'], inplace=True)
print(df)

                         text
0             I love cooking!
1               Baking is fun
3  Japanese cuisine is great!


In [3]:
### Normalize the Text to Make it Consistent

In [4]:
# Lower-case every document so that words differing only by case
# (e.g. "Baking" / "baking") end up identical.
lowercased = df['text'].str.lower()
df['text'] = lowercased
print(df)

                         text
0             i love cooking!
1               baking is fun
3  japanese cuisine is great!


### Remove Noise

In [5]:
import re

# Matches any single character that is neither a word character
# (letter, digit, underscore) nor whitespace -- i.e. punctuation/symbols.
non_word_pattern = re.compile(r'[^\w\s]')

# Use the vectorized string accessor rather than a per-row lambda around
# re.sub: it applies the same compiled pattern to each string, but passes
# missing values through as NaN instead of raising TypeError.
df['text'] = df['text'].str.replace(non_word_pattern, '', regex=True)
print(df)

                        text
0             i love cooking
1              baking is fun
3  japanese cuisine is great


### Tokenize the Text

In [6]:
# Split each document on runs of whitespace (the default separator) and
# store the resulting list of words in a new 'tokens' column.
token_lists = df['text'].str.split()
df['tokens'] = token_lists
print(df)

                        text                          tokens
0             i love cooking              [i, love, cooking]
1              baking is fun               [baking, is, fun]
3  japanese cuisine is great  [japanese, cuisine, is, great]


### Remove Stop Words

In [7]:
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# A set keeps the membership test in the filter below cheap.
stop_words = set(stopwords.words('english'))


def _without_stop_words(words):
    """Return the items of *words* that are not in ``stop_words``, in order."""
    return [w for w in words if w not in stop_words]


df['tokens'] = df['tokens'].apply(_without_stop_words)
print(df['tokens'])

0               [love, cooking]
1                 [baking, fun]
3    [japanese, cuisine, great]
Name: tokens, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming and Lemmatization

In [8]:
from nltk.stem import PorterStemmer

# NOTE(review): the wordnet corpus is downloaded here, but nothing in this
# cell reads it -- only Porter stemming is performed. Presumably it was
# meant for a lemmatization step; confirm before removing.
nltk.download('wordnet')
stemmer = PorterStemmer()


def _stem_all(words):
    """Return a list with the Porter stem of every token in *words*."""
    return list(map(stemmer.stem, words))


# Keep the unstemmed tokens and add the stems side by side for comparison.
df['stemmed'] = df['tokens'].apply(_stem_all)
print(df[['tokens', 'stemmed']])

                       tokens                   stemmed
0             [love, cooking]              [love, cook]
1               [baking, fun]               [bake, fun]
3  [japanese, cuisine, great]  [japanes, cuisin, great]


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Convert Text into Numerical Representations

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each token list into one space-separated string. This overwrites
# the 'text' column with the stop-word-filtered text, which is what the
# vectorizer is fitted on.
df['text'] = df['tokens'].apply(' '.join)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

# toarray() converts the result to a dense array for display: one row per
# document, one column per vocabulary term.
print(X.toarray())

[[0.         0.70710678 0.         0.         0.         0.
  0.70710678]
 [0.70710678 0.         0.         0.70710678 0.         0.
  0.        ]
 [0.         0.         0.57735027 0.         0.57735027 0.57735027
  0.        ]]
