<a href="https://colab.research.google.com/github/deoprakash/NLP_Tutorial/blob/main/BagOfWords_TF_IDF_POS_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Toy corpus of three short sentences; both vectorizers below are fitted on it.
texts = [
    "I love natural language processing",
    "Language processing is a part of AI",
    "I love machine learning and NLP"
]

In [3]:
# Bag-of-Words: learn the vocabulary of `texts` and count how often each term occurs per sentence.
# With the default settings used here, text is lowercased and single-character
# tokens ("I", "a") are dropped, so they do not appear in the vocabulary.
bow = CountVectorizer()
# Fit and transform in one step: one row per sentence, one column per vocabulary term.
bow_matrix = bow.fit_transform(texts)

In [4]:
# Vocabulary learned by the vectorizer; entries line up with the matrix columns.
bow_vocabulary = bow.get_feature_names_out()
# Convert to a dense array purely for display; fine for a corpus this small.
bow_counts = bow_matrix.toarray()

print("Features Names: \n", bow_vocabulary)
print("BoW Matrix: \n", bow_counts)

Features Names: 
 ['ai' 'and' 'is' 'language' 'learning' 'love' 'machine' 'natural' 'nlp'
 'of' 'part' 'processing']
BoW Matrix: 
 [[0 0 0 1 0 1 0 1 0 0 0 1]
 [1 0 1 1 0 0 0 0 0 1 1 1]
 [0 1 0 0 1 1 1 0 1 0 0 0]]


In [5]:
# TF-IDF on the same corpus, with default settings: same vocabulary as the
# Bag-of-Words step, but each count is re-weighted so that terms shared by
# several sentences (e.g. "language") score lower than terms unique to one
# sentence (e.g. "natural").
tfidf = TfidfVectorizer()
# Fit and transform in one step: one row per sentence, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(texts)

In [6]:
# Vocabulary learned by the TF-IDF vectorizer; entries line up with the matrix columns.
tfidf_vocabulary = tfidf.get_feature_names_out()
# Convert to a dense array purely for display; fine for a corpus this small.
tfidf_weights = tfidf_matrix.toarray()

print("Features Names: \n", tfidf_vocabulary)
print("tfidf Matrix: \n", tfidf_weights)

Features Names: 
 ['ai' 'and' 'is' 'language' 'learning' 'love' 'machine' 'natural' 'nlp'
 'of' 'part' 'processing']
tfidf Matrix: 
 [[0.         0.         0.         0.45985353 0.         0.45985353
  0.         0.60465213 0.         0.         0.         0.45985353]
 [0.44036207 0.         0.44036207 0.3349067  0.         0.
  0.         0.         0.         0.44036207 0.44036207 0.3349067 ]
 [0.         0.46735098 0.         0.         0.46735098 0.35543247
  0.46735098 0.         0.46735098 0.         0.         0.        ]]


## **POS Tagging & Named Entity Recognition (NER)**

In [7]:
import spacy

# Load spaCy's small English pipeline. The model package must already be
# installed in the environment (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Two-sentence sample mentioning a person, two companies and a location;
# the POS-tagging and NER cells below both read from `doc`.
sample_text = "Elon Musk founded SpaceX in California. He now leads Tesla Inc."
doc = nlp(sample_text)

In [8]:
# Part-of-speech tagging: print every token of `doc` next to the tag spaCy
# assigned to it. Punctuation is a token of its own, so it is listed as well.
print("\nPart-of-Speech Tagging:")
for token in doc:
    print(f"{token.text} -> {token.pos_}")


Part-of-Speech Tagging:
Elon -> PROPN
Musk -> PROPN
founded -> VERB
SpaceX -> PROPN
in -> ADP
California -> PROPN
. -> PUNCT
He -> PRON
now -> ADV
leads -> VERB
Tesla -> PROPN
Inc. -> PROPN


In [9]:
# Named Entity Recognition: print each entity span found in `doc` with its label.
# NOTE: in the saved output "SpaceX" is not reported as an entity, so the list
# can be shorter than a human reader would expect.
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


Named Entities:
Elon Musk -> PERSON
California -> GPE
Tesla Inc. -> ORG


## **Text Classification (Sentiment Analysis): Toy Example**

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline

In [11]:

# Expanded toy dataset with extra negated phrases ("not happy", "not good", ...)
# so that negative sentiment expressed through negation is better represented.
# Each sentence sits next to its label so the two cannot drift out of alignment.
labelled_reviews = [
    ("I love this product", "positive"),
    ("This is the best phone I've used", "positive"),
    ("Absolutely terrible, waste of money", "negative"),
    ("Not good, very disappointed", "negative"),
    ("I am happy with my purchase", "positive"),
    ("Worst experience ever", "negative"),
    ("I am not happy", "negative"),
    ("I do not like this phone", "negative"),
    ("I hate this", "negative"),
    ("Awful quality, not recommended", "negative"),
    ("Not worth the money", "negative"),
    ("Amazing experience, highly recommended", "positive"),
    ("Very satisfied with this product", "positive"),
    ("This is not what I expected", "negative"),
    ("Completely dissatisfied, not good", "negative"),
]

# Split into the parallel lists consumed by the model: X = sentences, y = labels.
X = [sentence for sentence, _ in labelled_reviews]
y = [label for _, label in labelled_reviews]

In [12]:
# Raw n-gram counts (unigrams, bigrams and trigrams via ngram_range=(1, 3))
# fed into a multinomial Naive Bayes classifier. Note this uses CountVectorizer,
# not TF-IDF weighting. The pipeline lets fit/predict take raw strings directly;
# multi-word n-grams let phrases such as "not happy" become features of their own.
model = make_pipeline(CountVectorizer(ngram_range=(1, 3)), MultinomialNB())
model.fit(X, y)

In [13]:
# Smoke-test inputs: the first matches a training sentence apart from the
# trailing period; the second is unseen.
test_1 = "I am not happy."
test_2 = "This phone is good for buying"

# Classify both sentences in one call; predictions come back in input order.
pred_1, pred_2 = model.predict([test_1, test_2])

print("Sentence:", test_1)
print("Prediction:", pred_1)  # should be negative

print("\nSentence:", test_2)
# Should be positive. NOTE(review): in the saved run this came back "negative".
# In the training data "good" only occurs in negative sentences ("Not good",
# "not good") and negatives outnumber positives 10 to 5 -- presumably that is
# what drives the miss; confirm before relying on this model.
print("Prediction:", pred_2)

Sentence: I am not happy.
Prediction: negative

Sentence: This phone is good for buying
Prediction: negative
