# Statistical Embedding
CREATED BY: dsl  
[UPDATED: 2020.08.16]

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. BOW using sklearn

In [2]:
# given the following sentence
docA = 'The car is driven on the road'
docB = 'The truck is driven on the highway'
docC = 'The bus is driven on the road and the highway'

In [3]:
# make a list of document from all sentence
doc = [docA, docB, docC]

# initialize countvectorizer
vectorizer = CountVectorizer(stop_words='english')
wordcount = vectorizer.fit_transform(doc)
print(wordcount)

  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (1, 2)	1
  (1, 5)	1
  (1, 3)	1
  (2, 2)	1
  (2, 4)	1
  (2, 3)	1
  (2, 0)	1


In [4]:
feature_names = vectorizer.get_feature_names()
print(feature_names)
wordcount_arr = wordcount.toarray()
print(wordcount_arr)

['bus', 'car', 'driven', 'highway', 'road', 'truck']
[[0 1 1 0 1 0]
 [0 0 1 1 0 1]
 [1 0 1 1 1 0]]


In [5]:
df_bow = pd.DataFrame(wordcount_arr.T, index=feature_names, columns=[docA, docB, docC])
df_bow.style

Unnamed: 0,The car is driven on the road,The truck is driven on the highway,The bus is driven on the road and the highway
bus,0,0,1
car,1,0,0
driven,1,1,1
highway,0,1,1
road,1,0,1
truck,0,1,0


## 2. TF-IDF (Term frequency-inverse document dependency Frequency)

### 2.1 TF

In [6]:
# given the following documents
docA = 'The car is driven on the road'
docB = 'The truck is driven on the highway'

In [7]:
# turn into bow => list of words
bowA = docA.split(' ')
bowB = docB.split(' ')
print('bowA: ', bowA)
print('bowB: ', bowB)

bowA:  ['The', 'car', 'is', 'driven', 'on', 'the', 'road']
bowB:  ['The', 'truck', 'is', 'driven', 'on', 'the', 'highway']


In [8]:
# find unique val of words
wordset = set(bowA).union(set(bowB))
print(wordset)

{'driven', 'road', 'highway', 'is', 'truck', 'car', 'on', 'The', 'the'}


In [9]:
# word count per document
wordDictA = dict.fromkeys(wordset, 0)
wordDictB = dict.fromkeys(wordset, 0)
print('initalize:')
print('wordDictA: ', wordDictA)
print('wordDictB: ', wordDictB)

# count
for word in bowA:
    wordDictA[word]+=1

for word in bowB:
    wordDictB[word]+=1
print('\n Count:')
print('wordDictA: ', wordDictA)
print('wordDictB: ', wordDictB)

initalize:
wordDictA:  {'driven': 0, 'road': 0, 'highway': 0, 'is': 0, 'truck': 0, 'car': 0, 'on': 0, 'The': 0, 'the': 0}
wordDictB:  {'driven': 0, 'road': 0, 'highway': 0, 'is': 0, 'truck': 0, 'car': 0, 'on': 0, 'The': 0, 'the': 0}

 Count:
wordDictA:  {'driven': 1, 'road': 1, 'highway': 0, 'is': 1, 'truck': 0, 'car': 1, 'on': 1, 'The': 1, 'the': 1}
wordDictB:  {'driven': 1, 'road': 0, 'highway': 1, 'is': 1, 'truck': 1, 'car': 0, 'on': 1, 'The': 1, 'the': 1}


In [10]:
# change to dataframe
df_wordcnt = pd.DataFrame([wordDictA, wordDictB]).T
df_wordcnt.columns = [docA, docB]
df_wordcnt.style

Unnamed: 0,The car is driven on the road,The truck is driven on the highway
driven,1,1
road,1,0
highway,0,1
is,1,1
truck,0,1
car,1,0
on,1,1
The,1,1
the,1,1


In [11]:
def computeTF(wordDict, bow):
    tfDict={}
    bowCnt = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count/float(bowCnt)
    return tfDict

In [12]:
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

print('tfBowA:\n', tfBowA)
print('\ntfBowB:\n', tfBowB)

tfBowA:
 {'driven': 0.14285714285714285, 'road': 0.14285714285714285, 'highway': 0.0, 'is': 0.14285714285714285, 'truck': 0.0, 'car': 0.14285714285714285, 'on': 0.14285714285714285, 'The': 0.14285714285714285, 'the': 0.14285714285714285}

tfBowB:
 {'driven': 0.14285714285714285, 'road': 0.0, 'highway': 0.14285714285714285, 'is': 0.14285714285714285, 'truck': 0.14285714285714285, 'car': 0.0, 'on': 0.14285714285714285, 'The': 0.14285714285714285, 'the': 0.14285714285714285}


In [13]:
df_tf = pd.DataFrame([tfBowA, tfBowB]).T
df_tf.columns=['tfBowA', 'tfBowB']
df_tf.style

Unnamed: 0,tfBowA,tfBowB
driven,0.142857,0.142857
road,0.142857,0.0
highway,0.0,0.142857
is,0.142857,0.142857
truck,0.0,0.142857
car,0.142857,0.0
on,0.142857,0.142857
The,0.142857,0.142857
the,0.142857,0.142857


### 2.2. IDF

In [14]:
def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
        
    return idfDict

In [15]:
idfs = computeIDF([wordDictA, wordDictB])
print(idfs)

{'driven': 0.0, 'road': 0.3010299956639812, 'highway': 0.3010299956639812, 'is': 0.0, 'truck': 0.3010299956639812, 'car': 0.3010299956639812, 'on': 0.0, 'The': 0.0, 'the': 0.0}


In [16]:
df_idf = pd.DataFrame([idfs]).T
df_idf.columns=['IDF']
df_idf.style

Unnamed: 0,IDF
driven,0.0
road,0.30103
highway,0.30103
is,0.0
truck,0.30103
car,0.30103
on,0.0
The,0.0
the,0.0


### 2.3 Compute TF-IDF

In [17]:
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf

In [18]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)
print('tfidfBowA', tfidfBowA)
print('tfidfBowB', tfidfBowB)

tfidfBowA {'driven': 0.0, 'road': 0.043004285094854454, 'highway': 0.0, 'is': 0.0, 'truck': 0.0, 'car': 0.043004285094854454, 'on': 0.0, 'The': 0.0, 'the': 0.0}
tfidfBowB {'driven': 0.0, 'road': 0.0, 'highway': 0.043004285094854454, 'is': 0.0, 'truck': 0.043004285094854454, 'car': 0.0, 'on': 0.0, 'The': 0.0, 'the': 0.0}


In [19]:
import pandas as pd
df_tfidf = pd.DataFrame([tfidfBowA, tfidfBowB]).T
df_tfidf.columns = ['tfidfBowA', 'tfidfBowB']
df_tfidf.style

Unnamed: 0,tfidfBowA,tfidfBowB
driven,0.0,0.0
road,0.043004,0.0
highway,0.0,0.043004
is,0.0,0.0
truck,0.0,0.043004
car,0.043004,0.0
on,0.0,0.0
The,0.0,0.0
the,0.0,0.0


### 2.4 Compute TF-IDF Using sklearn

In [20]:
# given the following document
docA = 'The car is driven on the road'
docB = 'The truck is driven on the highway'
corpus = [docA,docB]

# compute tf-idf
tfidf = TfidfVectorizer()
output = tfidf.fit_transform([docA, docB])
# output = tfidf.fit_transform(corpus)

print(output)

  (0, 5)	0.42471718586982765
  (0, 4)	0.30218977576862155
  (0, 1)	0.30218977576862155
  (0, 3)	0.30218977576862155
  (0, 0)	0.42471718586982765
  (0, 6)	0.6043795515372431
  (1, 2)	0.42471718586982765
  (1, 7)	0.42471718586982765
  (1, 4)	0.30218977576862155
  (1, 1)	0.30218977576862155
  (1, 3)	0.30218977576862155
  (1, 6)	0.6043795515372431


In [21]:
# results
feature_names = tfidf.get_feature_names()
for col in output.nonzero()[1]:
    print(feature_names[col], ' : ', output[0,col])

road  :  0.42471718586982765
on  :  0.30218977576862155
driven  :  0.30218977576862155
is  :  0.30218977576862155
car  :  0.42471718586982765
the  :  0.6043795515372431
highway  :  0.0
truck  :  0.0
on  :  0.30218977576862155
driven  :  0.30218977576862155
is  :  0.30218977576862155
the  :  0.6043795515372431


In [22]:
# result into df
feature_names = tfidf.get_feature_names()
print('use as index: ', feature_names)
corpus_index = [word for word in corpus]
print('use a column name: ', corpus_index)

df = pd.DataFrame(output.T.todense(), index=feature_names, columns=corpus_index)
df.style

use as index:  ['car', 'driven', 'highway', 'is', 'on', 'road', 'the', 'truck']
use a column name:  ['The car is driven on the road', 'The truck is driven on the highway']


Unnamed: 0,The car is driven on the road,The truck is driven on the highway
car,0.424717,0.0
driven,0.30219,0.30219
highway,0.0,0.424717
is,0.30219,0.30219
on,0.30219,0.30219
road,0.424717,0.0
the,0.60438,0.60438
truck,0.0,0.424717


### 2.5 Summary

In [23]:
summary = pd.concat([df_wordcnt,df_tf, df_idf, df_tfidf], axis=1)
summary.style

Unnamed: 0,The car is driven on the road,The truck is driven on the highway,tfBowA,tfBowB,IDF,tfidfBowA,tfidfBowB
driven,1,1,0.142857,0.142857,0.0,0.0,0.0
road,1,0,0.142857,0.0,0.30103,0.043004,0.0
highway,0,1,0.0,0.142857,0.30103,0.0,0.043004
is,1,1,0.142857,0.142857,0.0,0.0,0.0
truck,0,1,0.0,0.142857,0.30103,0.0,0.043004
car,1,0,0.142857,0.0,0.30103,0.043004,0.0
on,1,1,0.142857,0.142857,0.0,0.0,0.0
The,1,1,0.142857,0.142857,0.0,0.0,0.0
the,1,1,0.142857,0.142857,0.0,0.0,0.0
