# Vector Space Models

## Tokenization

Tokenize a string into linguistic tokens:

```python
"I'm Dr. Jinho Choi. Call me Jinho."
['I', "'m", 'Dr.', 'Jinho', 'Choi', '.', 'Call', 'me', 'Jinho', '.']
```

In [10]:
!pip install tokenizer



In [21]:
from typing import Union, List, Set
from tokenizer import tokenize

# if flag = 0, return a list
def tok(text: str, flag: int=0) -> Union[List[str], Set[str]]:
    """
    @return if flag is 0, a list of tokens; otherwise, a set of tokens.
    """
    if flag == 0:
        return [t.txt for t in tokenize(text) if t.txt]
    else:
        return {t.txt for t in tokenize(text) if t.txt}

In [22]:
text = "I'm Dr. Jinho Choi. Call me Jinho."
print(tok(text))
print(tok(text, 1))

["I'm", 'Dr.', 'Jinho', 'Choi', '.', 'Call', 'me', 'Jinho', '.']
{'Choi', 'Call', '.', 'Dr.', 'me', 'Jinho', "I'm"}


### Exercise

Update the `tok` function such that it returns:
* if `flag` is `0`, a list of tokens
* if `flag` is `1`, a set of tokens
* otherwise, a dictionary where the key is a token and the value is its count

In [23]:
from typing import Dict
from collections import Counter

def tok(text: str, flag: int=2) -> Union[List[str], Set[str], Dict[str, int]]:
    """
    @return if flag is 0, a list of tokens; if flag is 1, a set of tokens; otherwise, a dictionary where the key is a token and the value is its count.
    """
    if flag == 0:
        return [t.txt for t in tokenize(text) if t.txt]
    elif flag == 1:
        return {t.txt for t in tokenize(text) if t.txt}
    else:
        return Counter([t.txt for t in tokenize(text) if t.txt])

In [24]:
print(tok(text, 2))

Counter({'Jinho': 2, '.': 2, "I'm": 1, 'Dr.': 1, 'Choi': 1, 'Call': 1, 'me': 1})


## Bag-of-Words

Read a file and return a dictionary of (token, value) pairs in the document.

In [25]:
def bag_of_words(filename: str) -> Dict[str, int]:
    fin = open(filename)
    return tok(fin.read())

In [26]:
bow = bag_of_words('res/george_washington.txt')
for k, v in sorted(bow.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(k, v)

the 26
, 25
and 16
of 14
. 14
in 12
was 7
He 7
- 6
his 6


## TF-IDF

$$\mathrm{tf}\cdot\mathrm{idf}_{w,d} = \mathrm{tf}_{w,d} \cdot \log\frac{|D|}{\mathrm{df}_w}$$

In [27]:
al = bag_of_words('res/abraham_lincoln.txt')
fr = bag_of_words('res/franklin_roosevelt.txt')
gw = bag_of_words('res/george_washington.txt')
kb = bag_of_words('res/kobe_bryant.txt')
lj = bag_of_words('res/lebron_james.txt')
mj = bag_of_words('res/michael_jordan.txt')

### Exercise

Write a function that takes a list of bag_of_words and returns a dictionary whose key is a token and the value is its document frequency.

```python
def doc_freq(*bow_list: Dict[str, int]) -> Dict[str, int]:
    pass
```

In [28]:
def doc_freq(*bow_list: Dict[str, int]) -> Dict[str, int]:
    '''
    @return a dictionary whose key is a token and the value is its document frequency.
    '''
    d = {}
    
    for bow in bow_list:
        for token in bow:
            d[token] = d.get(token, 0) + 1
    
    return d

In [29]:
df = doc_freq(al, fr, gw, kb, lj, mj)
print(df['the'])
print(df['NBA'])
print(df['president'])

6
3
3


In [30]:
import math

def tfidf(bow: Dict[str, int], df: Dict[str, int]) -> Dict[str, float]:
    '''
    @return a dictionary where the key is a token and the value is its TF-IDF score.
    '''
    return {token: tf * math.log(len(df) / df[token]) for token, tf in bow.items()}

In [31]:
for k, v in sorted(mj.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(k, v)

the 39
, 34
. 25
in 20
and 20
of 20
- 14
Jordan 13
NBA 13
a 12


In [32]:
for k, v in sorted(tfidf(mj, df).items(), key=lambda x: x[1], reverse=True)[:10]:
    print(k, v)

the 205.4799589740981
, 179.13637449023938
. 131.71792241929364
in 105.37433793543492
and 105.37433793543492
of 105.37433793543492
Jordan 91.78619275799741
NBA 77.50423300531199
- 73.76203655480444
a 63.224602761260954


### Exercie

Modify the `tfidf` function that returns more meaningful scores given this small set of documents.

In [33]:
def tfidf2(bow: Dict[str, int], df: Dict[str, int]) -> Dict[str, float]:
    '''
    @return a dictionary where the key is a token and the value is its TF-IDF score.
    '''
    return {token: tf / df[token]**2 for token, tf in bow.items()}

In [34]:
for k, v in sorted(tfidf2(mj, df).items(), key=lambda x: x[1], reverse=True)[:10]:
    print(k, v)

Jordan 13.0
Bulls 4.0
titles 3.0
Michael 2.0
Wizards 2.0
owner 2.0
Smith 2.0
Air 2.0
1992 2.0
1993 2.0


## Cosine Similarity

$$
cos(A, B) = \frac{\sum_{i=1}^n A_i B_i}{\sqrt{\sum_{i=1}^n A_i^2} \sqrt{\sum_{i=1}^n B_i^2}}
$$

In [35]:
def cos(A: Dict[str, float], B: Dict[str, float]) -> float:
    num = sum([a * B.get(token, 0) for token, a in A.items()])
    den = math.sqrt(sum([v**2 for v in A.values()])) * math.sqrt(sum([v**2 for v in B.values()]))
    return num / den

In [36]:
labels = ['AL', 'FR', 'GW', 'KB', 'LJ', 'MJ']
docs_raw = [al, fr, gw, kb, lj, mj]

print('     '+'    '.join(labels))
for i, doc_a in enumerate(docs_raw):
    s = ['%4.2f' % cos(doc_a, doc_b) for doc_b in docs]
    print(labels[i]+'  '+'  '.join(s))

     AL    FR    GW    KB    LJ    MJ


NameError: name 'docs' is not defined

In [37]:
docs_tfidf = [tfidf(doc, df) for doc in docs_raw]

print('     '+'    '.join(labels))
for i, doc_a in enumerate(docs_tfidf):
    s = ['%4.2f' % cos(doc_a, doc_b) for doc_b in docs]
    print(labels[i]+'  '+'  '.join(s))

     AL    FR    GW    KB    LJ    MJ


NameError: name 'docs' is not defined

In [38]:
docs_tfidf2 = [tfidf2(doc, df) for doc in docs_raw]

print('     '+'    '.join(labels))
for i, doc_a in enumerate(docs_tfidf2):
    s = ['%4.2f' % cos(doc_a, doc_b) for doc_b in docs]
    print(labels[i]+'  '+'  '.join(s))

     AL    FR    GW    KB    LJ    MJ


NameError: name 'docs' is not defined

## Clustering

In [39]:
def term_index(df: Dict[str, int]) -> Dict[str, int]:
    return {k: i for i, k in enumerate(df.keys())}

In [40]:
ti = term_index(df)

In [41]:
import numpy as np

def doc_to_vec(tfidf, ti):
    n = np.zeros(len(ti))
    for token, score in tfidf.items():
        n[ti[token]] = score
    return n

In [42]:
from sklearn.cluster import KMeans

X = np.array([doc_to_vec(doc, ti) for doc in docs_tfidf])
kmeans = KMeans(n_clusters=2).fit(X)
kmeans.labels_

array([0, 1, 0, 0, 0, 0], dtype=int32)

In [43]:
from sklearn.cluster import AgglomerativeClustering
aggl = AgglomerativeClustering(affinity='cosine', linkage='complete').fit(X)
aggl.labels_

array([1, 1, 1, 0, 0, 0])