In [None]:
!pip install spacy


In [1]:
 # Step 1: Importing Libraries
import spacy
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer ,CountVectorizer



In [2]:
# Load the "en_core_web_sm" spaCy pipeline once; later cells call it as nlp(text).
# NOTE: the pipeline package must already be installed in this environment.
nlp = spacy.load(name="en_core_web_sm")

In [3]:
# Sample sentence that every later cell of the notebook analyses.
doc = "Hi I am Eating a Mango which is very tasty"

# Heading and sentence on separate lines, emitted by a single print call.
print("Original Document:", doc, sep="\n")

Original Document:
Hi I am Eating a Mango which is very tasty


In [4]:

# Run the pipeline once; the parsed `doc_spacy` is reused by the cells below.
doc_spacy = nlp(doc)

# Surface text of every token, in document order.
tokens = [
    tok.text
    for tok in doc_spacy
]
print("\nTokenized Words:", tokens, sep="\n")


Tokenized Words:
['Hi', 'I', 'am', 'Eating', 'a', 'Mango', 'which', 'is', 'very', 'tasty']


In [5]:

# Pair each token's text with its coarse part-of-speech label (`pos_`).
pos_tags = [
    (tok.text, tok.pos_)
    for tok in doc_spacy
]
print("\nPOS Tagging:", pos_tags, sep="\n")


POS Tagging:
[('Hi', 'INTJ'), ('I', 'PRON'), ('am', 'AUX'), ('Eating', 'VERB'), ('a', 'DET'), ('Mango', 'PROPN'), ('which', 'PRON'), ('is', 'AUX'), ('very', 'ADV'), ('tasty', 'ADJ')]


In [6]:

# Keep only content-bearing tokens: drop stop words and punctuation.
#
# `token.text not in string.punctuation` is a *substring* test against the
# punctuation string, so multi-character punctuation tokens such as "..." and
# any non-ASCII punctuation were never removed. A token is now treated as
# punctuation when spaCy flags it (`is_punct`) or when every one of its
# characters belongs to `string.punctuation`; this still removes everything
# the old check removed.
punct_chars = frozenset(string.punctuation)
filtered_tokens = [
    token.text
    for token in doc_spacy
    if not token.is_stop
    and not token.is_punct
    and not punct_chars.issuperset(token.text)
]
print("\nAfter Stop Words Removal:")
print(filtered_tokens)


After Stop Words Removal:
['Hi', 'Eating', 'Mango', 'tasty']


In [7]:
# No stemmer is used here: each token's lemma stands in for its stem.
# Nothing is filtered out in this cell, so stop words are lemmatized as well.
stemmed_tokens = [
    tok.lemma_
    for tok in doc_spacy
]
print("\nAfter Stemming (Using Lemmatization as SpaCy doesn’t have direct stemming):", stemmed_tokens, sep="\n")


After Stemming (Using Lemmatization as SpaCy doesn’t have direct stemming):
['hi', 'I', 'be', 'eat', 'a', 'Mango', 'which', 'be', 'very', 'tasty']


In [8]:
# Lemmas of the content-bearing tokens only (stop words and punctuation removed).
#
# `token.text not in string.punctuation` is a *substring* test against the
# punctuation string, so multi-character punctuation tokens such as "..." and
# any non-ASCII punctuation were never removed. A token is now treated as
# punctuation when spaCy flags it (`is_punct`) or when every one of its
# characters belongs to `string.punctuation`; this still removes everything
# the old check removed.
punct_chars = frozenset(string.punctuation)
lemmatized_tokens = [
    token.lemma_
    for token in doc_spacy
    if not token.is_stop
    and not token.is_punct
    and not punct_chars.issuperset(token.text)
]
print("\nAfter Lemmatization:")
print(lemmatized_tokens)


After Lemmatization:
['hi', 'eat', 'Mango', 'tasty']


In [9]:
# One-document corpus built from the sample sentence.
corpus = [doc]

# Fit the vocabulary/IDF weights first, then vectorize the same corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(corpus)
tfidf_matrix = tfidf_vectorizer.transform(corpus)

In [10]:
# Densify the TF-IDF matrix and label each column with its vocabulary term.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)
print("\nTF-IDF Representation:", tfidf_df, sep="\n")


TF-IDF Representation:
         am    eating        hi        is     mango     tasty      very  \
0  0.353553  0.353553  0.353553  0.353553  0.353553  0.353553  0.353553   

      which  
0  0.353553  


In [11]:
# One-document corpus built from the sample sentence.
corpus = [doc]

# Learn the vocabulary first, then count term occurrences in the same corpus.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(corpus)
count_matrix = count_vectorizer.transform(corpus)

In [12]:
# Densify the count matrix and label each column with its vocabulary term.
count_terms = count_vectorizer.get_feature_names_out()
count_df = pd.DataFrame(data=count_matrix.toarray(), columns=count_terms)
print("\nCountVectorizer Representation:", count_df, sep="\n")


CountVectorizer Representation:
   am  eating  hi  is  mango  tasty  very  which
0   1       1   1   1      1      1     1      1
