In [66]:
import warnings
warnings.filterwarnings("ignore")
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Sample text
text = "Text preprocessing is an important step in natural language processing. It involves cleaning and transforming raw text data into a format suitable for analysis."

# Convert text to lowercase
text = text.lower()

# Remove punctuation and special characters
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Tokenize the text (split it into words)
tokens = word_tokenize(text)

# Remove stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Stemming (using Porter stemmer)
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization (using WordNet lemmatizer)
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

# Join the preprocessed tokens back into a clean text
clean_text = ' '.join(lemmatized_tokens)

# Print the cleaned text
print("Original Text:")
print(text)
print("\nCleaned Text:")
print(clean_text)


Original Text:
text preprocessing is an important step in natural language processing it involves cleaning and transforming raw text data into a format suitable for analysis

Cleaned Text:
text preprocessing important step natural language processing involves cleaning transforming raw text data format suitable analysis


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\UJ415AV\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1.

In [67]:
import nltk
import re

In [68]:
from nltk.corpus import stopwords

In [69]:
from nltk.tokenize import word_tokenize

In [70]:
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [71]:
text = "Text preprocessing is an important step in natural language processing. It involves cleaning and transforming raw text data into a format suitable for analysis."
print(text)

Text preprocessing is an important step in natural language processing. It involves cleaning and transforming raw text data into a format suitable for analysis.


In [72]:
text = text.lower()
print(text)

text preprocessing is an important step in natural language processing. it involves cleaning and transforming raw text data into a format suitable for analysis.


In [73]:
text = re.sub(r"[^a-zA-Z0-9\s]","",text)
print(text)

text preprocessing is an important step in natural language processing it involves cleaning and transforming raw text data into a format suitable for analysis


In [74]:
token = word_tokenize(text)
print(token)

['text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', 'it', 'involves', 'cleaning', 'and', 'transforming', 'raw', 'text', 'data', 'into', 'a', 'format', 'suitable', 'for', 'analysis']


In [75]:
token = [i for i in token if i not in set(stopwords.words("english"))]
print(token)

['text', 'preprocessing', 'important', 'step', 'natural', 'language', 'processing', 'involves', 'cleaning', 'transforming', 'raw', 'text', 'data', 'format', 'suitable', 'analysis']


In [76]:
ps = PorterStemmer()
ps_token = [ps.stem(i) for i in token]
print(ps_token)

['text', 'preprocess', 'import', 'step', 'natur', 'languag', 'process', 'involv', 'clean', 'transform', 'raw', 'text', 'data', 'format', 'suitabl', 'analysi']


In [77]:
lmm = WordNetLemmatizer()
lmm_token = [lmm.lemmatize(i) for i in token]
print(lmm_token)

['text', 'preprocessing', 'important', 'step', 'natural', 'language', 'processing', 'involves', 'cleaning', 'transforming', 'raw', 'text', 'data', 'format', 'suitable', 'analysis']


In [78]:
ps_join = " ".join(ps_token)
print(ps_join)

text preprocess import step natur languag process involv clean transform raw text data format suitabl analysi


In [79]:
lmm_join = " ".join(lmm_token)
print(lmm_join)

text preprocessing important step natural language processing involves cleaning transforming raw text data format suitable analysis


In this code:

1. We start with a sample text that we want to preprocess.

2. We convert the text to lowercase to ensure consistency.

3. We remove punctuation and special characters using regular expressions (re.sub).

4. We tokenize the text into individual words using NLTK's word_tokenize function.

5. We remove common stopwords (e.g., "the," "and," "is") using NLTK's list of English stop words.

6. We perform stemming on the remaining words using the Porter stemmer to reduce them to their base form.

7. We also demonstrate lemmatization using the WordNet lemmatizer, which reduces words to their base or dictionary form.

Finally, we join the preprocessed tokens back together into a clean text.

# 2.

In [89]:
ph = "In a world full of challenges, we often encounter unexpected twists and turns in our journey. Life's path is not always smooth, and it's essential to navigate through the highs and lows with resilience and determination! Embracing each obstacle as an opportunity for growth and learning is the key to achieving success. So, don't be afraid to take risks, dream big, and make your mark on this exciting adventure called life."

In [90]:
ph = re.sub(r"[^a-zA-Z0-9\s]","",ph)
ph

'In a world full of challenges we often encounter unexpected twists and turns in our journey Lifes path is not always smooth and its essential to navigate through the highs and lows with resilience and determination Embracing each obstacle as an opportunity for growth and learning is the key to achieving success So dont be afraid to take risks dream big and make your mark on this exciting adventure called life'

In [91]:
ph = ph.lower()
ph

'in a world full of challenges we often encounter unexpected twists and turns in our journey lifes path is not always smooth and its essential to navigate through the highs and lows with resilience and determination embracing each obstacle as an opportunity for growth and learning is the key to achieving success so dont be afraid to take risks dream big and make your mark on this exciting adventure called life'

In [92]:
ph = word_tokenize(ph)
ph

['in',
 'a',
 'world',
 'full',
 'of',
 'challenges',
 'we',
 'often',
 'encounter',
 'unexpected',
 'twists',
 'and',
 'turns',
 'in',
 'our',
 'journey',
 'lifes',
 'path',
 'is',
 'not',
 'always',
 'smooth',
 'and',
 'its',
 'essential',
 'to',
 'navigate',
 'through',
 'the',
 'highs',
 'and',
 'lows',
 'with',
 'resilience',
 'and',
 'determination',
 'embracing',
 'each',
 'obstacle',
 'as',
 'an',
 'opportunity',
 'for',
 'growth',
 'and',
 'learning',
 'is',
 'the',
 'key',
 'to',
 'achieving',
 'success',
 'so',
 'dont',
 'be',
 'afraid',
 'to',
 'take',
 'risks',
 'dream',
 'big',
 'and',
 'make',
 'your',
 'mark',
 'on',
 'this',
 'exciting',
 'adventure',
 'called',
 'life']

In [97]:
ph = [i for i in ph if i not in stopwords.words('english')]
ph

['world',
 'full',
 'challenges',
 'often',
 'encounter',
 'unexpected',
 'twists',
 'turns',
 'journey',
 'lifes',
 'path',
 'always',
 'smooth',
 'essential',
 'navigate',
 'highs',
 'lows',
 'resilience',
 'determination',
 'embracing',
 'obstacle',
 'opportunity',
 'growth',
 'learning',
 'key',
 'achieving',
 'success',
 'dont',
 'afraid',
 'take',
 'risks',
 'dream',
 'big',
 'make',
 'mark',
 'exciting',
 'adventure',
 'called',
 'life']

In [102]:
ptt = PorterStemmer()
ph_pt = [ptt.stem(i) for i in ph]
ph_pt = " ".join(ph_pt)
print(ph_pt)

world full challeng often encount unexpect twist turn journey life path alway smooth essenti navig high low resili determin embrac obstacl opportun growth learn key achiev success dont afraid take risk dream big make mark excit adventur call life


In [103]:
w_lem = WordNetLemmatizer()
w_lem = [w_lem.lemmatize(i) for i in ph]
w_lem = " ".join(w_lem)
print(w_lem)

world full challenge often encounter unexpected twist turn journey life path always smooth essential navigate high low resilience determination embracing obstacle opportunity growth learning key achieving success dont afraid take risk dream big make mark exciting adventure called life
