<a href="https://colab.research.google.com/github/gulabpatel/NLP_Basics/blob/main/TF_IDF_and_Count_Vectorizer_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [2]:
# Toy corpus: three short banking sentences that every vectorizer below is fitted on.
sentences = [
    'I have a credit card account',
    'My account card, debit card is lost',
    'My credit card stopped working',
]
sentences

['I have a credit card account',
 'My account card, debit card is lost',
 'My credit card stopped working']

In [5]:
# Bag-of-words with default settings: learn the vocabulary from the corpus
# and return the (sparse) document-term count matrix in one step.
vectorizer = CountVectorizer()
countvec = vectorizer.fit_transform(sentences)

In [None]:
# Show the sparse count matrix as a dense array (rows = sentences, columns = terms).
# `.toarray()` is used instead of the `.A` shorthand, which is deprecated in
# recent SciPy releases and not available on all sparse containers.
countvec.toarray()

array([[1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 2, 0, 1, 0, 1, 1, 1, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 1, 1]])

In [6]:
# Vocabulary learned by the vectorizer, in matrix column order.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# is its replacement and returns an ndarray, so `.tolist()` keeps the list display.
vectorizer.get_feature_names_out().tolist()

['account',
 'card',
 'credit',
 'debit',
 'have',
 'is',
 'lost',
 'my',
 'stopped',
 'working']

In [7]:
# Same bag-of-words model, but with the vocabulary capped at 4 terms.
vectorizer = CountVectorizer(
    max_features=4,
)
countvec = vectorizer.fit_transform(sentences)

In [8]:
# Dense count matrix, then the 4 retained terms (one per matrix column).
# Uses `.toarray()` / `get_feature_names_out()` because the `.A` shorthand is
# deprecated in SciPy and `get_feature_names()` was removed in scikit-learn 1.2.
print(countvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 0]
 [1 2 0 1]
 [0 1 1 1]]
['account', 'card', 'credit', 'my']


In [9]:
# Cap the vocabulary at 4 terms again, this time dropping English stop words
# (e.g. "my") before the terms are selected.
vectorizer = CountVectorizer(
    max_features=4,
    stop_words='english',
)
countvec = vectorizer.fit_transform(sentences)

In [10]:
# Dense count matrix, then the retained terms once stop words are filtered out.
# Uses `.toarray()` / `get_feature_names_out()` because the `.A` shorthand is
# deprecated in SciPy and `get_feature_names()` was removed in scikit-learn 1.2.
print(countvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 0]
 [1 2 0 1]
 [0 1 1 0]]
['account', 'card', 'credit', 'debit']


In [11]:
# Count unigrams and bigrams (ngram_range=(1, 2)), keeping at most 6 features.
vectorizer = CountVectorizer(
    max_features=6,
    ngram_range=(1, 2),
)
countvec = vectorizer.fit_transform(sentences)

In [12]:
# Dense count matrix, then the retained unigram/bigram features.
# Uses `.toarray()` / `get_feature_names_out()` because the `.A` shorthand is
# deprecated in SciPy and `get_feature_names()` was removed in scikit-learn 1.2.
print(countvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 1 0 0]
 [1 2 0 0 1 0]
 [0 1 1 1 1 1]]
['account', 'card', 'credit', 'credit card', 'my', 'stopped']


## TF = (# occurrences of term t in document) / (# of words in document)


In [13]:
# Pure term frequency: IDF weighting is switched off, and the L1 norm divides
# each row by its total count so every document's weights sum to 1.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm='l1',
)
tfvec = vectorizer.fit_transform(sentences)

In [14]:
# Dense L1-normalised term-frequency matrix, then the vocabulary (column order).
# Uses `.toarray()` / `get_feature_names_out()` because the `.A` shorthand is
# deprecated in SciPy and `get_feature_names()` was removed in scikit-learn 1.2.
print(tfvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[0.25       0.25       0.25       0.         0.25       0.
  0.         0.         0.         0.        ]
 [0.14285714 0.28571429 0.         0.14285714 0.         0.14285714
  0.14285714 0.14285714 0.         0.        ]
 [0.         0.2        0.2        0.         0.         0.
  0.         0.2        0.2        0.2       ]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [32]:
# Hand check of the L1-normalised TF values: 1 / (total term count of each sentence).
# One print per statement; joining the calls with commas builds a tuple of
# their return values and makes the cell display a stray (None, None, None).
print(1/4)  # sentence 1: 4 counted terms
print(1/7)  # sentence 2: 7 counted terms ("card" appears twice)
print(1/5)  # sentence 3: 5 counted terms

0.25
0.14285714285714285
0.2


(None, None, None)

sentences=['I have a credit card account','My account card, debit card is lost','My credit card stopped working']

In [19]:
# Term frequency only (no IDF), with each row scaled to unit Euclidean (L2) length.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm='l2',
)
tfvec = vectorizer.fit_transform(sentences)

In [20]:
# Dense L2-normalised term-frequency matrix, then the vocabulary (column order).
# Uses `.toarray()` / `get_feature_names_out()` because the `.A` shorthand is
# deprecated in SciPy and `get_feature_names()` was removed in scikit-learn 1.2.
print(tfvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[0.5        0.5        0.5        0.         0.5        0.
  0.         0.         0.         0.        ]
 [0.33333333 0.66666667 0.         0.33333333 0.         0.33333333
  0.33333333 0.33333333 0.         0.        ]
 [0.         0.4472136  0.4472136  0.         0.         0.
  0.         0.4472136  0.4472136  0.4472136 ]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [26]:
# Hand check of the L2-normalised TF values: count / sqrt(sum of squared counts).
# One print per statement; joining the calls with commas builds a tuple of
# their return values and makes the cell display a stray (None, None, None).
print(1/np.sqrt(4))  # sentence 1: four terms with count 1 -> norm sqrt(4)
print(1/np.sqrt(9))  # sentence 2: a count-1 term, norm sqrt(1+4+1+1+1+1) = sqrt(9)
print(2/np.sqrt(9))  # sentence 2: "card", which has count 2

0.5
0.3333333333333333
0.6666666666666666


(None, None, None)

In [21]:
# With IDF disabled and no normalisation the output is the raw term counts
# (as floats), i.e. the same numbers CountVectorizer produces.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm=None,
)
tfvec = vectorizer.fit_transform(sentences)

In [22]:
# Dense un-normalised term-frequency matrix, then the vocabulary (column order).
# Uses `.toarray()` / `get_feature_names_out()` because the `.A` shorthand is
# deprecated in SciPy and `get_feature_names()` was removed in scikit-learn 1.2.
print(tfvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1. 1. 1. 0. 1. 0. 0. 0. 0. 0.]
 [1. 2. 0. 1. 0. 1. 1. 1. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 1. 1.]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


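IDF = log(# of documents / # of documents containing term t). Note that scikit-learn adds 1 to this value (and, with the default `smooth_idf=True`, also adds 1 to both counts inside the log), so a term that appears in every document gets an IDF of 1 rather than 0.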

In [33]:
# Full TF-IDF with smoothing disabled: idf(t) = ln(n_docs / df(t)) + 1.
# The "+ 1" outside the log is always applied; smooth_idf=True would additionally
# add 1 to both n_docs and df(t) inside the log.
vectorizer_idf = TfidfVectorizer(
    smooth_idf=False,
)
tfidfvec = vectorizer_idf.fit_transform(sentences)

In [34]:
# Learned IDF weight of every term, then the terms in the same order.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# replaces it, and `.tolist()` keeps the plain-list print format.
print(vectorizer_idf.idf_)
print(vectorizer_idf.get_feature_names_out().tolist())

[1.40546511 1.         1.40546511 2.09861229 2.09861229 2.09861229
 2.09861229 1.40546511 2.09861229 2.09861229]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


sentences=['I have a credit card account','My account card, debit card is lost','My credit card stopped working']

In [35]:
# Hand check of the unsmoothed IDF for a term found in 2 of the 3 sentences:
# ln(3 / 2) + 1.
print(1 + np.log(3 / 2))

1.4054651081081644


In [36]:
# Dense TF-IDF matrix (rows = sentences, columns = vocabulary terms).
# `.toarray()` is used instead of the `.A` shorthand, which is deprecated in
# recent SciPy releases and not available on all sparse containers.
tfidfvec.toarray()

array([[0.45951737, 0.3269504 , 0.45951737, 0.        , 0.68614212,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.3055129 , 0.43474989, 0.        , 0.45618573, 0.        ,
        0.45618573, 0.45618573, 0.3055129 , 0.        , 0.        ],
       [0.        , 0.26959162, 0.37890161, 0.        , 0.        ,
        0.        , 0.        , 0.37890161, 0.56576828, 0.56576828]])

-----