<a href="https://colab.research.google.com/github/deepak1195/NaturalLanguageProcessing/blob/main/010_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')
from numpy.ma.core import shape
path="/content/drive/MyDrive/myWork/Data/"

Mounted at /content/drive


In [3]:
df=pd.read_csv(f'{path}Ecommerce_data.csv')
df

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories
...,...,...
23995,Marvel Physics MCQ's for MHT - CET,Books
23996,Internet Download Manager | Lifetime License |...,Books
23997,Sadhubela's Handcrafted Iron Degchi Handi Pot ...,Household
23998,Audio-Technica AT-LP60 Automatic Belt Driven D...,Electronics


In [4]:
df.label.value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [5]:
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp=spacy.load('en_core_web_sm')
def preprocessText(txt):
  doc=nlp(re.sub(r'[^\w\s]', '',txt))
  clean=[tk.lemma_ for tk in doc if not tk.is_stop]
  return ' '.join(clean).replace(' ,','')

df['Text']=df.Text.apply(preprocessText)
df

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low StudyOffice Computer C...,Household
1,contrast live Wooden Decorative BoxPainted Box...,Household
2,IO Crest SYPCI40010 pci raid Host Controller C...,Electronics
3,ISAKAA Baby Socks bear 8 year Pack 4 6 8 12 IS...,Clothing & Accessories
4,Indira Designer Womens Art Mysore Silk Saree B...,Clothing & Accessories
...,...,...
23995,Marvel Physics mcq MHT CET,Books
23996,internet Download Manager Lifetime License ...,Books
23997,Sadhubelas Handcrafted Iron Degchi Handi Pot ...,Household
23998,AudioTechnica atlp60 Automatic Belt Driven dj ...,Electronics


In [6]:
df['label']=df.label.map({'Household':0,'Electronics':1,'Clothing & Accessories':2,'Books':3})
df

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low StudyOffice Computer C...,0
1,contrast live Wooden Decorative BoxPainted Box...,0
2,IO Crest SYPCI40010 pci raid Host Controller C...,1
3,ISAKAA Baby Socks bear 8 year Pack 4 6 8 12 IS...,2
4,Indira Designer Womens Art Mysore Silk Saree B...,2
...,...,...
23995,Marvel Physics mcq MHT CET,3
23996,internet Download Manager Lifetime License ...,3
23997,Sadhubelas Handcrafted Iron Degchi Handi Pot ...,0
23998,AudioTechnica atlp60 Automatic Belt Driven dj ...,1


In [7]:
from sklearn.model_selection import train_test_split
xTrain,xTest,yTrain,yTest=train_test_split(df.Text, df.label, test_size=0.25, random_state=2023, stratify=df.label)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

model=Pipeline([
    ('TF-IDF',TfidfVectorizer()),
    ('KNN',KNeighborsClassifier())
])

model.fit(xTrain,yTrain)

In [9]:
yPred=model.predict(xTest)
yPred

array([3, 0, 0, ..., 0, 1, 1])

In [10]:
from inspect import classify_class_attrs
from sklearn.metrics import classification_report
print(classification_report(yTest,yPred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1500
           1       0.96      0.97      0.97      1500
           2       0.97      0.98      0.98      1500
           3       0.97      0.93      0.95      1500

    accuracy                           0.96      6000
   macro avg       0.96      0.96      0.96      6000
weighted avg       0.96      0.96      0.96      6000



In [11]:
v=model['TF-IDF']
print(v.vocabulary_)



In [12]:
feature_names=v.get_feature_names_out()[10000:10050]
feature_names

array(['belarus', 'belazo', 'belegal', 'belem', 'belgian', 'belgium',
       'belha', 'belief', 'beliefs', 'believable', 'believably',
       'believe', 'believer', 'believers', 'belittling', 'belkin', 'bell',
       'bella', 'bellaigue', 'belle', 'bellevue', 'bellingham', 'bellows',
       'bells', 'bellucci', 'belly', 'belo', 'belomoda', 'belong',
       'belonging', 'belongings', 'belove', 'beloved', 'belowrelaxation',
       'belsius', 'belt', 'belt1', 'belt100', 'belt2', 'beltadvent',
       'beltblack', 'beltcolourblackbrownitalian', 'beltdenim',
       'beltdrive', 'beltdriven', 'beltgift', 'belton', 'beltreversible',
       'belts', 'beltscarf'], dtype=object)

In [13]:
for word in feature_names:
  idx=v.vocabulary_.get(word)
  print(f"{word} --> {idx}")

belarus --> 10000
belazo --> 10001
belegal --> 10002
belem --> 10003
belgian --> 10004
belgium --> 10005
belha --> 10006
belief --> 10007
beliefs --> 10008
believable --> 10009
believably --> 10010
believe --> 10011
believer --> 10012
believers --> 10013
belittling --> 10014
belkin --> 10015
bell --> 10016
bella --> 10017
bellaigue --> 10018
belle --> 10019
bellevue --> 10020
bellingham --> 10021
bellows --> 10022
bells --> 10023
bellucci --> 10024
belly --> 10025
belo --> 10026
belomoda --> 10027
belong --> 10028
belonging --> 10029
belongings --> 10030
belove --> 10031
beloved --> 10032
belowrelaxation --> 10033
belsius --> 10034
belt --> 10035
belt1 --> 10036
belt100 --> 10037
belt2 --> 10038
beltadvent --> 10039
beltblack --> 10040
beltcolourblackbrownitalian --> 10041
beltdenim --> 10042
beltdrive --> 10043
beltdriven --> 10044
beltgift --> 10045
belton --> 10046
beltreversible --> 10047
belts --> 10048
beltscarf --> 10049


In [14]:
df.Text[0:2]

0    Urban Ladder Eisner Low StudyOffice Computer C...
1    contrast live Wooden Decorative BoxPainted Box...
Name: Text, dtype: object

In [15]:
v.fit_transform(df.Text[0:2]).toarray()

array([[0.        , 0.        , 0.12450853, 0.12450853, 0.        ,
        0.12450853, 0.12450853, 0.        , 0.        , 0.        ,
        0.12450853, 0.12450853, 0.12450853, 0.12450853, 0.        ,
        0.        , 0.24901707, 0.12450853, 0.        , 0.        ,
        0.12450853, 0.12450853, 0.        , 0.        , 0.        ,
        0.12450853, 0.08858885, 0.12450853, 0.12450853, 0.12450853,
        0.12450853, 0.24901707, 0.        , 0.12450853, 0.12450853,
        0.        , 0.12450853, 0.12450853, 0.12450853, 0.12450853,
        0.        , 0.12450853, 0.        , 0.        , 0.12450853,
        0.        , 0.12450853, 0.        , 0.12450853, 0.12450853,
        0.        , 0.12450853, 0.12450853, 0.12450853, 0.12450853,
        0.        , 0.        , 0.12450853, 0.12450853, 0.12450853,
        0.        , 0.12450853, 0.12450853, 0.        , 0.12450853,
        0.12450853, 0.12450853, 0.        , 0.12450853, 0.        ,
        0.12450853, 0.        , 0.12450853, 0.  