In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
from scipy.spatial.distance import cosine

### Term-Document Matrix

In [2]:
x = np.array([[1,0,2,0,0],[0,1,1,0,0],[0,1,1,0,1],[0,0,0,1,1]])
index = pd.DataFrame(x, index = ["auto","car","wash","machine"], columns=[0,1,2,3,4])
index

Unnamed: 0,0,1,2,3,4
auto,1,0,2,0,0
car,0,1,1,0,0
wash,0,1,1,0,1
machine,0,0,0,1,1


In [3]:
index[1]

auto       0
car        1
wash       1
machine    0
Name: 1, dtype: int32

In [4]:
index.loc["auto"]

0    1
1    0
2    2
3    0
4    0
Name: auto, dtype: int32

In [9]:
index.shape

(4, 5)

### Term frequency
문서 d의 단어마다 값을 가짐
1. log(문서 d에서 t 단어의 총 빈도+1)
2. 0.5 + (0.5 * d에서 t 단어의 총 빈도) / (d에서 가장 많은 단어 빈도)

In [125]:
def tf(index, term):
    term_freq= index[term]
    term_freq = 0.5 + (0.5 * term_freq) / max(index)
    return term_freq

In [126]:
tf(index[0],"auto")

1.0

### Inverse document frequency
단어마다 1개의 값을 가짐
1. log(전체 문서의 수 / 해당 단어를 포함한 문서의 수)

In [120]:
def idf(index, term):
    _, n_docs = index.shape
    df = np.count_nonzero(index.loc[term])
    return np.log(float(n_docs)/df+1)

In [121]:
print(idf(index,"car"), idf(index,"auto"), idf(index,"wash"))

1.252762968495368 1.252762968495368 0.9808292530117263


In [141]:
def simple_dot(a,b):
    dsum=0.
    a_ = []
    b_ = []
    for i in index.index.values:
        a_.append(tf(a, i) * idf(index, i))
        b_.append(tf(b, i) * idf(index, i))
    a_, b_ = np.array(a_), np.array(b_)
    return np.dot(a_,b_)

In [144]:
simple_dot(index[3],index[4])

2.8351355946317605

### Cosine similarity

In [146]:
def l2_norm(a):
    return np.sqrt(np.dot(a,a))

In [148]:
def tf_idf_cos(a,b):
    return simple_dot(a,b) / (l2_norm(a)*l2_norm(b))

In [149]:
tf_idf_cos(index[0],index[1])

1.7273075975345362

In [150]:
tf_idf_cos(index[1],index[2])

0.8878613973338477