In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer 
import re


In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text) 
    return text

In [4]:
text = "i am a student.hello!! there is a session going onn."
preprocessed_document = preprocess_text(text)
text

'i am a student.hello!! there is a session going onn.'

In [5]:
nltk.download('punkt') 
def tokenize_text(text):
    tokens = word_tokenize(text) 
    return tokens

[nltk_data] Downloading package punkt to /Users/dhruvpai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
tokens = tokenize_text(preprocessed_document)
tokens

['i',
 'am',
 'a',
 'student',
 'hello',
 'there',
 'is',
 'a',
 'session',
 'going',
 'onn']

In [7]:
def pos_tag_tokens(tokens): 
    pos_tags = pos_tag(tokens) 
    return pos_tags

In [8]:
nltk.download('averaged_perceptron_tagger')
pos_tags = pos_tag_tokens(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dhruvpai/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
pos_tags

[('i', 'NN'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('student', 'NN'),
 ('hello', 'NN'),
 ('there', 'EX'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('session', 'NN'),
 ('going', 'VBG'),
 ('onn', 'NN')]

In [10]:
def remove_stop_words(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

In [11]:
nltk.download('stopwords')
filtered_tokens = remove_stop_words(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dhruvpai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
filtered_tokens

['student', 'hello', 'session', 'going', 'onn']

In [13]:
def stem_tokens(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens] 
    return stemmed_tokens

In [14]:
stemmed_tokens = stem_tokens(filtered_tokens)
stemmed_tokens

['student', 'hello', 'session', 'go', 'onn']

In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dhruvpai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
def lemmatize_tokens(tokens): 
    lemmatizer = WordNetLemmatizer()
    def get_wordnet_pos(treebank_tag): 
        if treebank_tag.startswith('J'):
            return wn.ADJ
        elif treebank_tag.startswith('V'):
            return wn.VERB
        elif treebank_tag.startswith('N'):
            return wn.NOUN
        elif treebank_tag.startswith('R'):
            return wn.ADV 
        else:
            return None
    pos_tags = pos_tag(tokens) 
    lemmatized_tokens = []
    for word, pos in pos_tags:
        wordnet_pos = get_wordnet_pos(pos) or wn.NOUN
        lemmatized_tokens.append(lemmatizer.lemmatize(word, pos=wordnet_pos)) 
        return lemmatized_tokens

In [17]:
lemmatized_tokens = lemmatize_tokens(tokens)
lemmatized_tokens

['i']

In [18]:
def get_tfidf_representation(documents):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents) 
    return tfidf_matrix

In [19]:
tfidf_matrix = get_tfidf_representation([text])
tfidf_matrix

<1x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [20]:
print("Original Tokens:")
print(tokens)
print("\nPOS Tagging:")
print(pos_tags)
print("\nFiltered Tokens after Stop Words Removal:") 
print(filtered_tokens)
print("\nStemmed Tokens:") 
print(stemmed_tokens) 
print("\nLemmatized Tokens:") 
print(lemmatized_tokens) 
print("\nTF-IDF Representation:") 
print(tfidf_matrix)

Original Tokens:
['i', 'am', 'a', 'student', 'hello', 'there', 'is', 'a', 'session', 'going', 'onn']

POS Tagging:
[('i', 'NN'), ('am', 'VBP'), ('a', 'DT'), ('student', 'NN'), ('hello', 'NN'), ('there', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ('session', 'NN'), ('going', 'VBG'), ('onn', 'NN')]

Filtered Tokens after Stop Words Removal:
['student', 'hello', 'session', 'going', 'onn']

Stemmed Tokens:
['student', 'hello', 'session', 'go', 'onn']

Lemmatized Tokens:
['i']

TF-IDF Representation:
  (0, 4)	0.35355339059327373
  (0, 1)	0.35355339059327373
  (0, 5)	0.35355339059327373
  (0, 3)	0.35355339059327373
  (0, 7)	0.35355339059327373
  (0, 2)	0.35355339059327373
  (0, 6)	0.35355339059327373
  (0, 0)	0.35355339059327373


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
def get_bow_representation(documents):
    bow_vectorizer = CountVectorizer()
    bow_matrix = bow_vectorizer.fit_transform(documents)
    return bow_matrix

bow_matrix = get_bow_representation([text])
bow_matrix

print("\nBoW Representation:")
print(bow_matrix)


BoW Representation:
  (0, 0)	1
  (0, 6)	1
  (0, 2)	1
  (0, 7)	1
  (0, 3)	1
  (0, 5)	1
  (0, 1)	1
  (0, 4)	1
