SEE ALSO WIKIPAGE: https://exmediawiki.khm.de/exmediawiki/index.php/Word_embeddings#Traditionelle_Worteinbettungen

---

- [1. one hot encoding](#1.-one-hot-encoding)
- [2. Bag of Words (BOW)](#2.-Bag-of-Words-(BOW))
- [Term Frequency (TF)](#Term-Frequency-(TF))
- [Inverse Data Frequency (IDF)](#Inverse-Data-Frequency-(IDF))
- [3. TF-IDF](#3.-TF-IDF)

## 1. one hot encoding

In [3]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder #für integer encoding
from sklearn.preprocessing import OneHotEncoder #für one hot encoding

# Hand-made toy dataset: ten temperature labels drawn from three categories.
data = [
    'cold', 'cold', 'warm', 'cold', 'hot',
    'hot', 'warm', 'cold', 'warm', 'hot',
]
# NumPy array view of the labels, as used by the encoders below.
values = array(data)
print(values)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']


In [4]:
# integer encoding
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

[0 0 2 0 1 1 2 0 2 1]


In [5]:
# Binary (one-hot) encoding.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# Reshape the 1-D integer codes into a single-column 2-D array
# (one row per sample) before handing them to the encoder.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


Note: if you previously used a LabelEncoder to convert the categories to integers before applying the OneHotEncoder, that step is no longer necessary; the OneHotEncoder can now be applied to the categories directly.


## 2. Bag of Words (BOW)

In [6]:
import pandas as pd

# Two toy documents that make up the corpus.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

# Tokenise on single spaces; each token list is that document's bag of words.
bagOfWordsA, bagOfWordsB = documentA.split(' '), documentB.split(' ')

# Shared vocabulary: every distinct token seen in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

# Per-document occurrence counts over the whole vocabulary
# (0 for words that do not occur in the document).
numOfWordsA = {token: bagOfWordsA.count(token) for token in uniqueWords}
numOfWordsB = {token: bagOfWordsB.count(token) for token in uniqueWords}

# One row per document, one column per vocabulary word.
df = pd.DataFrame([numOfWordsA, numOfWordsB])
print(df)

   walk  around  sat  out  a  went  for  fire  man  the  children
0     1       0    0    1  1     1    1     0    1    1         0
1     0       1    1    0  0     0    0     1    0    2         1


## Term Frequency (TF)

In [7]:
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> number of occurrences in the document.
        bagOfWords: the document's tokens; only its length is used, as the
            normalising total.

    Returns:
        A dict mapping each word of ``wordDict`` to
        ``count / len(bagOfWords)``. For an empty document every
        frequency is 0.0.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {
        word: count / float(bagOfWordsCount)
        for word, count in wordDict.items()
    }

# Term frequencies of both documents over the shared vocabulary.
tfA, tfB = (
    computeTF(numOfWordsA, bagOfWordsA),
    computeTF(numOfWordsB, bagOfWordsB),
)
print(tfA, tfB, sep='\n')

{'walk': 0.14285714285714285, 'around': 0.0, 'sat': 0.0, 'out': 0.14285714285714285, 'a': 0.14285714285714285, 'went': 0.14285714285714285, 'for': 0.14285714285714285, 'fire': 0.0, 'man': 0.14285714285714285, 'the': 0.14285714285714285, 'children': 0.0}
{'walk': 0.0, 'around': 0.16666666666666666, 'sat': 0.16666666666666666, 'out': 0.0, 'a': 0.0, 'went': 0.0, 'for': 0.0, 'fire': 0.16666666666666666, 'man': 0.0, 'the': 0.3333333333333333, 'children': 0.16666666666666666}


## Inverse Data Frequency (IDF)

In [8]:
def computeIDF(documents):
    """Return the inverse document frequency of every word in the corpus.

    Args:
        documents: list of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        A dict mapping each word found in any of the dicts to
        ``log(N / df)``, where ``N`` is the number of documents and ``df``
        the number of documents whose count for the word is greater than 0.
        Words that occur in no document get 0.0. An empty corpus yields an
        empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are collected from every document
    # (not only the first) so that differing vocabularies cannot raise
    # KeyError; insertion order follows first appearance.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    # A word with df == 0 would divide by zero; its term frequency is 0 in
    # every document anyway, so 0.0 is a neutral weight.
    return {
        word: math.log(N / float(count)) if count else 0.0
        for word, count in docFreq.items()
    }
# Inverse document frequencies across the two-document corpus.
corpusCounts = [numOfWordsA, numOfWordsB]
idfs = computeIDF(corpusCounts)
print(idfs)

{'walk': 0.6931471805599453, 'around': 0.6931471805599453, 'sat': 0.6931471805599453, 'out': 0.6931471805599453, 'a': 0.6931471805599453, 'went': 0.6931471805599453, 'for': 0.6931471805599453, 'fire': 0.6931471805599453, 'man': 0.6931471805599453, 'the': 0.0, 'children': 0.6931471805599453}


## 3. TF-IDF

In [9]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the matching inverse document frequency.

    Args:
        tfBagOfWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; must contain
            every word of ``tfBagOfWords``.

    Returns:
        A new dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.

    Raises:
        KeyError: if a word of ``tfBagOfWords`` is missing from ``idfs``.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}

In [10]:
# TF-IDF weights for each document, using the corpus-wide IDF values.
tfidfA, tfidfB = computeTFIDF(tfA, idfs), computeTFIDF(tfB, idfs)

# One row per document, one column per vocabulary word.
df = pd.DataFrame([tfidfA, tfidfB])
print(df)

       walk    around       sat       out         a      went       for  \
0  0.099021  0.000000  0.000000  0.099021  0.099021  0.099021  0.099021   
1  0.000000  0.115525  0.115525  0.000000  0.000000  0.000000  0.000000   

       fire       man  the  children  
0  0.000000  0.099021  0.0  0.000000  
1  0.115525  0.000000  0.0  0.115525  
