# Example (tf-idf)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: four short sentences, each treated as one document.
text = [
    'The car crashed long ago.',
    'The car has rusted.',
    'A rusted car is unsafe.',
    'Spare car parts are needed urgently.',
]
# One row per sentence, held in a single 'document' column.
corpus = pd.DataFrame({'document': text})
corpus

Unnamed: 0,document
0,The car crashed long ago.
1,The car has rusted.
2,A rusted car is unsafe.
3,Spare car parts are needed urgently.


In [3]:
# Word-level TF-IDF with English stop words removed.
# norm='l1' is what makes each document's weights sum to 1 (checked further down).
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    norm='l1',
)

In [4]:
# Fit the vectorizer on the documents and build the document-term TF-IDF matrix.
word_tfidf = vectorizer.fit_transform(corpus['document'].values)
# print() lists the stored (row, column) entries, rounded to 2 decimals.
print(round(word_tfidf, 2))

  (0, 0)	0.28
  (0, 1)	0.15
  (0, 2)	0.28
  (0, 3)	0.28
  (1, 1)	0.4
  (1, 6)	0.6
  (2, 1)	0.23
  (2, 6)	0.34
  (2, 8)	0.43
  (3, 1)	0.12
  (3, 4)	0.22
  (3, 5)	0.22
  (3, 7)	0.22
  (3, 9)	0.22


In [5]:
# (number of documents, number of vocabulary terms)
word_tfidf.shape

(4, 10)

In [6]:
# Dense view of the TF-IDF matrix; displaying it in full is fine because it is tiny (4 x 10).
word_tfidf.toarray()

array([[0.28394236, 0.14817291, 0.28394236, 0.28394236, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.39827537, 0.        , 0.        , 0.        ,
        0.        , 0.60172463, 0.        , 0.        , 0.        ],
       [0.        , 0.22588067, 0.        , 0.        , 0.        ,
        0.        , 0.3412663 , 0.        , 0.43285303, 0.        ],
       [0.        , 0.11540464, 0.        , 0.        , 0.22114884,
        0.22114884, 0.        , 0.22114884, 0.        , 0.22114884]])

In [7]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
words = pd.Series(vectorizer.get_feature_names_out())
words

0         ago
1         car
2     crashed
3        long
4      needed
5       parts
6      rusted
7       spare
8      unsafe
9    urgently
dtype: object

In [8]:
# IDF weight of every vocabulary term, labelled by the term itself.
word_idf = pd.Series(data=vectorizer.idf_, index=words)
word_idf.round(decimals=2)

ago         1.92
car         1.00
crashed     1.92
long        1.92
needed      1.92
parts       1.92
rusted      1.51
spare       1.92
unsafe      1.92
urgently    1.92
dtype: float64

In [9]:
# Dense TF-IDF table: one row per document, one column per vocabulary term.
# Because of norm='l1', each row has been divided by its sum, so a row's weights add up to 1.
df = pd.DataFrame(data=word_tfidf.toarray(), columns=words)
df.round(decimals=2)

Unnamed: 0,ago,car,crashed,long,needed,parts,rusted,spare,unsafe,urgently
0,0.28,0.15,0.28,0.28,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.4,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0
2,0.0,0.23,0.0,0.0,0.0,0.0,0.34,0.0,0.43,0.0
3,0.0,0.12,0.0,0.0,0.22,0.22,0.0,0.22,0.0,0.22


In [10]:
# Row totals: with L1 normalisation every document's weights should sum to 1.
df.sum(axis='columns')

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64

In [11]:
# Pairwise correlation between documents (transpose so that each document is a column).
# In the output, docs 1 and 2 are the only positively correlated pair (r ~ 0.60);
# docs 0 and 3 are the most strongly *negatively* correlated pair (r ~ -0.62).
df.transpose().corr()

Unnamed: 0,0,1,2,3
0,1.0,-0.156122,-0.325606,-0.620527
1,-0.156122,1.0,0.59649,-0.252408
2,-0.325606,0.59649,1.0,-0.443701
3,-0.620527,-0.252408,-0.443701,1.0
