In [13]:
#part1
sample_doc = "Natural Language Processing (NLP) is a sub-field of artificial intelligence concerned with understanding and processing human language."


In [21]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [22]:
#tokenise
tokens = word_tokenize(sample_doc)
print("Tokens:", tokens)


Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'sub-field', 'of', 'artificial', 'intelligence', 'concerned', 'with', 'understanding', 'and', 'processing', 'human', 'language', '.']


In [23]:
#pos tagging
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('sub-field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('concerned', 'VBN'), ('with', 'IN'), ('understanding', 'JJ'), ('and', 'CC'), ('processing', 'JJ'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


In [24]:
#stop-words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
print("Filtered Tokens:", filtered_tokens)


Filtered Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'artificial', 'intelligence', 'concerned', 'understanding', 'processing', 'human', 'language']


In [25]:
#stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'artifici', 'intellig', 'concern', 'understand', 'process', 'human', 'languag']


In [26]:
#lemmatizing
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'artificial', 'intelligence', 'concerned', 'understanding', 'processing', 'human', 'language']


In [27]:
#part2
#TF-IDF Representation
documents = [
    "Natural Language Processing helps computers understand human language.",
    "Artificial Intelligence includes machine learning and NLP.",
    "Text preprocessing includes tokenization, stemming, and lemmatization."
]

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert to DataFrame for readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df)


        and  artificial  computers     helps     human  includes  \
0  0.000000    0.000000   0.316228  0.316228  0.316228  0.000000   
1  0.306504    0.403016   0.000000  0.000000  0.000000  0.306504   
2  0.306504    0.000000   0.000000  0.000000  0.000000  0.306504   

   intelligence  language  learning  lemmatization   machine   natural  \
0      0.000000  0.632456  0.000000       0.000000  0.000000  0.316228   
1      0.403016  0.000000  0.403016       0.000000  0.403016  0.000000   
2      0.000000  0.000000  0.000000       0.403016  0.000000  0.000000   

        nlp  preprocessing  processing  stemming      text  tokenization  \
0  0.000000       0.000000    0.316228  0.000000  0.000000      0.000000   
1  0.403016       0.000000    0.000000  0.000000  0.000000      0.000000   
2  0.000000       0.403016    0.000000  0.403016  0.403016      0.403016   

   understand  
0    0.316228  
1    0.000000  
2    0.000000  
