<a href="https://colab.research.google.com/github/ghassenov/NLP_Basics/blob/main/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TF-IDF
* TF-IDF (Term Frequency–Inverse Document Frequency) is a statistical measure used in natural language processing (NLP) and information retrieval to evaluate how important a word is to a document within a collection of documents (a corpus). Unlike raw word counts (as in Bag-of-Words), TF-IDF scales each word's frequency within a document by how rare the word is across the corpus, so words that appear in many documents receive lower weights than words that are distinctive to a few.



In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

In [2]:
# Toy corpus: nine short sentences, each treated as one document by the vectorizer below.
corpus = [
    "Artificial intelligence is transforming modern industries.",
    "Pizza with extra cheese is my favorite comfort food.",
    "The stock market reached an all-time high yesterday.",
    "She enjoys hiking in the mountains during summer.",
    "Python and R are popular languages for data science.",
    "The concert was canceled due to heavy rain.",
    "Electric vehicles are becoming more affordable.",
    "His latest novel became an instant bestseller.",
    "The team celebrated their victory with a grand party."
]

In [3]:
# Create a TF-IDF vectorizer with scikit-learn's default settings.
v = TfidfVectorizer()

In [4]:
# Learn the vocabulary and per-term IDF weights from the toy corpus.
v.fit(corpus)

In [5]:
# Convert each sentence into its TF-IDF vector (one row per document).
transform_output = v.transform(corpus)

In [6]:
# Show the learned vocabulary: each term mapped to its column index in the TF-IDF matrix.
print(v.vocabulary_)

{'artificial': 5, 'intelligence': 31, 'is': 32, 'transforming': 56, 'modern': 36, 'industries': 29, 'pizza': 42, 'with': 60, 'extra': 19, 'cheese': 11, 'my': 39, 'favorite': 20, 'comfort': 12, 'food': 21, 'the': 52, 'stock': 49, 'market': 35, 'reached': 46, 'an': 2, 'all': 1, 'time': 54, 'high': 25, 'yesterday': 61, 'she': 48, 'enjoys': 18, 'hiking': 26, 'in': 28, 'mountains': 38, 'during': 16, 'summer': 50, 'python': 44, 'and': 3, 'are': 4, 'popular': 43, 'languages': 33, 'for': 22, 'data': 14, 'science': 47, 'concert': 13, 'was': 59, 'canceled': 9, 'due': 15, 'to': 55, 'heavy': 24, 'rain': 45, 'electric': 17, 'vehicles': 57, 'becoming': 7, 'more': 37, 'affordable': 0, 'his': 27, 'latest': 34, 'novel': 40, 'became': 6, 'instant': 30, 'bestseller': 8, 'team': 51, 'celebrated': 10, 'their': 53, 'victory': 58, 'grand': 23, 'party': 41}


In [7]:
# (documents, vocabulary terms) dimensions of the TF-IDF matrix.
transform_output.shape

(9, 62)

In [14]:
# Collect the vocabulary terms so the IDF of each word can be printed in the next cell.
all_features_names = v.get_feature_names_out()

In [19]:

# Print the IDF weight learned for every vocabulary term.
#
# `all_features_names` (from `v.get_feature_names_out()`) is ordered by column
# index, which is the same order as `v.idf_`, so the two can be paired directly
# instead of looking each word up in `v.vocabulary_` first.
# The length check catches the case where `v` has been re-fitted on other data
# since `all_features_names` was computed (e.g. cells run out of order).
assert len(all_features_names) == len(v.idf_), "feature names and v.idf_ come from different fits"
for word, idf_score in zip(all_features_names, v.idf_):
  print(f"{word}: {idf_score}")

affordable: 2.6094379124341005
all: 2.6094379124341005
an: 2.203972804325936
and: 2.6094379124341005
are: 2.203972804325936
artificial: 2.6094379124341005
became: 2.6094379124341005
becoming: 2.6094379124341005
bestseller: 2.6094379124341005
canceled: 2.6094379124341005
celebrated: 2.6094379124341005
cheese: 2.6094379124341005
comfort: 2.6094379124341005
concert: 2.6094379124341005
data: 2.6094379124341005
due: 2.6094379124341005
during: 2.6094379124341005
electric: 2.6094379124341005
enjoys: 2.6094379124341005
extra: 2.6094379124341005
favorite: 2.6094379124341005
food: 2.6094379124341005
for: 2.6094379124341005
grand: 2.6094379124341005
heavy: 2.6094379124341005
high: 2.6094379124341005
hiking: 2.6094379124341005
his: 2.6094379124341005
in: 2.6094379124341005
industries: 2.6094379124341005
instant: 2.6094379124341005
intelligence: 2.6094379124341005
is: 2.203972804325936
languages: 2.6094379124341005
latest: 2.6094379124341005
market: 2.6094379124341005
modern: 2.6094379124341005
mor

In [20]:
# Print the TF-IDF matrix as a dense array: one row per sentence, one column per vocabulary term.
print(transform_output.toarray())

[[0.         0.         0.         0.         0.         0.41836331
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.41836331
  0.         0.41836331 0.35335632 0.         0.         0.
  0.41836331 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.41836331 0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.34448466
  0.34448466 0.         0.         0.         0.         0.
  0.         0.34448466 0.34448466 0.34448466 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.29095723 0.         0.   

In [23]:
# Load the e-commerce product dataset. The absolute `/content/` path presumably points at a
# file uploaded to a Colab session — adjust it when running elsewhere.
df = pd.read_csv('/content/Ecommerce_data.csv')

In [24]:
# Preview the first rows to see the available columns.
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [25]:
# Count rows per category to check how balanced the classes are.
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing & Accessories,6000
Books,6000


In [26]:
# Number of rows and columns in the dataset.
df.shape

(24000, 2)

In [29]:
# Encode each product category as an integer class id for the classifier.
label_to_num = {
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 2,
    'Books': 3,
}

# `Series.map` silently yields NaN for any value missing from the dict, which
# would only surface later as an error when fitting the model. Fail here instead,
# naming the offending labels.
unmapped = set(df['label']) - set(label_to_num)
assert not unmapped, f"labels missing from label_to_num: {unmapped}"

df['label_num'] = df['label'].map(label_to_num)

In [30]:
# Preview again to confirm the new `label_num` column was added.
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


### Train Test Split

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
)

In [33]:
# Report the size of each split: training rows on the first line, test rows on the second.
print(len(X_train), len(X_test), sep="\n")

19200
4800


In [34]:
# Fresh vectorizer for the e-commerce text. NOTE: this rebinds `v`, so the vectorizer
# fitted on the toy corpus earlier in the notebook is no longer reachable by that name.
v = TfidfVectorizer()

In [36]:
# Fit the vocabulary and IDF weights on the training text only, then vectorize it.
X_train_tf = v.fit_transform(X_train)

In [37]:
# Vectorize the test text with the vocabulary/IDF learned from the training set (no refitting).
X_test_tf = v.transform(X_test)

### Training a classification model

In [38]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree (and the metrics computed from it) are
# reproducible: scikit-learn randomly permutes the candidate features at each
# split, even with the default "best" splitter. 42 matches the seed used for
# the train/test split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_tf,y_train)

In [39]:
# Predict a class id for every held-out document.
y_pred = clf.predict(X_test_tf)

In [40]:
from sklearn.metrics import classification_report

In [41]:
# Per-class precision/recall/F1 on the test set. Class ids follow the `label_num` mapping:
# 0 = Household, 1 = Electronics, 2 = Clothing & Accessories, 3 = Books.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.92      1221
           1       0.96      0.95      0.95      1145
           2       0.97      0.96      0.97      1210
           3       0.95      0.97      0.96      1224

    accuracy                           0.95      4800
   macro avg       0.95      0.95      0.95      4800
weighted avg       0.95      0.95      0.95      4800

