# 1. What is the purpose of text preprocessing in NLP, and why is it essential before analysis?

In [8]:
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Return the lemmas of every non-stop-word token in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    flagged ``is_stop`` are skipped; every other token (punctuation included,
    since only the stop-word flag is checked) contributes its ``lemma_`` to
    the returned list, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Demo: preprocess one sample sentence and show it before and after.
raw_text = "Text mining is the process of extracting valuable information from unstructured text data."
processed_text = preprocess_text(raw_text)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Text:", raw_text, sep="\n")
print("\nProcessed Text:", processed_text, sep="\n")


Original Text:
Text mining is the process of extracting valuable information from unstructured text data.

Processed Text:
['text', 'mining', 'process', 'extract', 'valuable', 'information', 'unstructured', 'text', 'datum', '.']


# 2. Describe tokenization in NLP and explain its significance in text processing.

In [9]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(text)

# Demo: tokenize one sample sentence and show it before and after.
raw_text = "Tokenization is crucial for natural language processing tasks."
tokenized_text = tokenize_text(raw_text)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Text:", raw_text, sep="\n")
print("\nTokenized Text:", tokenized_text, sep="\n")


Original Text:
Tokenization is crucial for natural language processing tasks.

Tokenized Text:
['Tokenization', 'is', 'crucial', 'for', 'natural', 'language', 'processing', 'tasks', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 3. What are the differences between stemming and lemmatization in NLP? When would you choose one over the other?

In [11]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

def stem_text(text):
    """Tokenize ``text`` and return the Porter stem of each token, in order."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, word_tokenize(text)))

def lemmatize_text(text):
    """Tokenize ``text`` and return the WordNet lemma of each token, in order.

    ``lemmatize`` is called with the token alone, so the lemmatizer's own
    default part of speech is applied to every word.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wnl.lemmatize(token))
    return lemmas

# Demo: run the same sentence through the stemmer and the lemmatizer
# so the two outputs can be compared side by side.
raw_text = "Stemming and lemmatization are important for NLP tasks."
stemmed_text = stem_text(raw_text)
lemmatized_text = lemmatize_text(raw_text)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Text:", raw_text, sep="\n")
print("\nStemmed Text:", stemmed_text, sep="\n")
print("\nLemmatized Text:", lemmatized_text, sep="\n")


Original Text:
Stemming and lemmatization are important for NLP tasks.

Stemmed Text:
['stem', 'and', 'lemmat', 'are', 'import', 'for', 'nlp', 'task', '.']

Lemmatized Text:
['Stemming', 'and', 'lemmatization', 'are', 'important', 'for', 'NLP', 'task', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 4. Explain the concept of stop words and their role in text preprocessing. How do they impact NLP tasks?

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def remove_stop_words(text):
    """Tokenize ``text`` and drop every token that is an English stop word.

    Each token is lower-cased before being looked up in NLTK's English
    stop-word list; tokens that survive keep their original casing and order.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return kept

# Demo: filter stop words out of one sample sentence and show before/after.
raw_text = "Stop words like 'the' and 'is' should be removed for better analysis."
filtered_text = remove_stop_words(raw_text)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Text:", raw_text, sep="\n")
print("\nText after Removing Stop Words:", filtered_text, sep="\n")


Original Text:
Stop words like 'the' and 'is' should be removed for better analysis.

Text after Removing Stop Words:
['Stop', 'words', 'like', "'the", "'", "'is", "'", 'removed', 'better', 'analysis', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 5. How does the process of removing punctuation contribute to text preprocessing in NLP? What are its benefits?

In [14]:
import string

def remove_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    Two groups of characters are stripped:

    * everything in ``string.punctuation`` -- the ASCII set, which also
      contains symbols such as ``$``, ``+`` and ``~``;
    * every character whose Unicode general category starts with ``P``
      (curly quotes, en/em dashes, the ellipsis character, ``¿``, ...),
      which ``string.punctuation`` alone does not cover.

    Nothing is inserted in place of a removed character, so the whitespace
    that surrounded it is left exactly as it was.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the remaining characters in their original
        order.
    """
    # Imported locally so the function does not rely on another cell's imports.
    import unicodedata

    ascii_punct = set(string.punctuation)
    return ''.join(
        ch for ch in text
        # The ASCII set is checked explicitly because several of its members
        # (e.g. '$', '+', '=') are Unicode *symbols*, not category-P punctuation.
        if ch not in ascii_punct and not unicodedata.category(ch).startswith('P')
    )

# Demo: strip punctuation from one sample sentence and show before/after.
raw_text = "Text with punctuation, like commas, can affect NLP tasks."
text_without_punct = remove_punctuation(raw_text)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Text:", raw_text, sep="\n")
print("\nText after Removing Punctuation:", text_without_punct, sep="\n")


Original Text:
Text with punctuation, like commas, can affect NLP tasks.

Text after Removing Punctuation:
Text with punctuation like commas can affect NLP tasks


# 6. Discuss the importance of lowercase conversion in text preprocessing. Why is it a common step in NLP tasks?

In [15]:
def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text`` (the result of ``text.lower()``)."""
    lowered = text.lower()
    return lowered

# Demo: lower-case one mixed-case sentence and show before/after.
raw_text = "Text with MiXeD CaSe for NLP analysis."
lowercased_text = convert_to_lowercase(raw_text)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Text:", raw_text, sep="\n")
print("\nText after Lowercase Conversion:", lowercased_text, sep="\n")


Original Text:
Text with MiXeD CaSe for NLP analysis.

Text after Lowercase Conversion:
text with mixed case for nlp analysis.


# 7. Explain the term "vectorization" concerning text data. How do techniques like CountVectorizer contribute to text preprocessing in NLP?

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_text(texts):
    """Fit a fresh ``CountVectorizer`` on ``texts`` and return its results.

    Returns a 2-tuple: the output of ``fit_transform(texts)`` followed by the
    fitted vectorizer's ``get_feature_names_out()``.
    """
    count_vec = CountVectorizer()
    doc_term_matrix = count_vec.fit_transform(texts)
    vocabulary = count_vec.get_feature_names_out()
    return doc_term_matrix, vocabulary

# Demo: vectorize a two-document corpus and show the count matrix
# together with the vocabulary its columns correspond to.
corpus = ["Text for NLP analysis.", "Another text for vectorization."]
vectorized_matrix, feature_names = vectorize_text(corpus)

# sep="\n" prints each heading on its own line, directly above its value.
print("Original Texts:", corpus, sep="\n")
print("\nVectorized Matrix:", vectorized_matrix.toarray(), sep="\n")
print("\nFeature Names:", feature_names, sep="\n")


Original Texts:
['Text for NLP analysis.', 'Another text for vectorization.']

Vectorized Matrix:
[[1 0 1 1 1 0]
 [0 1 1 0 1 1]]

Feature Names:
['analysis' 'another' 'for' 'nlp' 'text' 'vectorization']


# 8. Describe the concept of normalization in NLP. Provide examples of normalization techniques used in text preprocessing.

In [18]:
def lowercase(text):
    """Case normalization: return ``text`` converted with ``str.lower``."""
    normalized = text.lower()
    return normalized


In [19]:
# Stemming:
from nltk.stem import PorterStemmer
# word_tokenize is used below; import it here so this cell does not depend on
# an earlier cell having been run first.
from nltk.tokenize import word_tokenize


def stemming(text):
    """Tokenize ``text`` and return the Porter stem of each token.

    Args:
        text: The string to normalize.

    Returns:
        A list with one stem per token produced by ``word_tokenize``, in the
        order the tokens appear in ``text``.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_words = [porter.stem(word) for word in tokens]
    return stemmed_words

    

In [20]:
# Lemmatization:
from nltk.stem import WordNetLemmatizer
# word_tokenize is used below; import it here so this cell does not depend on
# an earlier cell having been run first.
from nltk.tokenize import word_tokenize


def lemmatization(text):
    """Tokenize ``text`` and return the WordNet lemma of each token.

    ``lemmatize`` is called with the token alone, so the lemmatizer's own
    default part of speech is applied to every word.

    Args:
        text: The string to normalize.

    Returns:
        A list with one lemma per token produced by ``word_tokenize``, in the
        order the tokens appear in ``text``.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_words
