# Toy Example

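In this chapter, we walk through a toy example: we build a bag-of-words model and TF-IDF vectors by hand for a three-document corpus, and then compare the result with scikit-learn's `TfidfVectorizer`.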

In [23]:
import copy
from collections import Counter
from collections import OrderedDict
from math import log

import nltk
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. build corpus

我们首先创建一个有三个documents 的corpus.

In [1]:
# Toy corpus: three one-sentence documents, one per list element.
corpus = [
    'Football is popular in Brasil',
    'Basketball is popular in USA',
    'Ping-Pong is popular in China',
]

## 2. preprocessing

下面我们对每个document 进行预处理

### 2.1 case normalization



In [6]:
# Case normalization: lowercase every document so that tokens differing only
# in capitalization end up as the same token.
corpus_lower = list(map(str.lower, corpus))
corpus_lower

['football is popular in brasil',
 'basketball is popular in usa',
 'ping-pong is popular in china']

### 2.2 tokenization

In [10]:
# Split each lowercased document into tokens with the Treebank tokenizer.
# The tokens of each document are sorted, so the original word order is
# discarded from this point on.
tokenizer = TreebankWordTokenizer()

corpus_tokens = [sorted(tokenizer.tokenize(doc)) for doc in corpus_lower]

corpus_tokens

[['brasil', 'football', 'in', 'is', 'popular'],
 ['basketball', 'in', 'is', 'popular', 'usa'],
 ['china', 'in', 'is', 'ping-pong', 'popular']]

### 2.3 remove stop words

In [11]:
# Load NLTK's English stop-word list.
# NLTK raises LookupError when the 'stopwords' corpus has not been downloaded
# yet (e.g. on a fresh environment), so fetch it on demand and retry; this
# keeps the notebook runnable from a clean kernel.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [13]:
# For every document keep only the tokens that are not in the stop-word list.
corpus_tokens_wo_stop_word = [
    [token for token in doc_tokens if token not in stopwords]
    for doc_tokens in corpus_tokens
]

corpus_tokens_wo_stop_word

[['brasil', 'football', 'popular'],
 ['basketball', 'popular', 'usa'],
 ['china', 'ping-pong', 'popular']]

### 2.4 build lexicon


In [15]:
# Flatten the per-document token lists into a single list for the whole corpus.
# A nested comprehension does this in one linear pass, whereas sum(lists, [])
# builds a brand-new list on every addition (quadratic in the total length).
corpus_all_tokens = [
    token
    for doc_tokens in corpus_tokens_wo_stop_word
    for token in doc_tokens
]
corpus_all_tokens

['brasil',
 'football',
 'popular',
 'basketball',
 'popular',
 'usa',
 'china',
 'ping-pong',
 'popular']

In [16]:
# The lexicon is the set of distinct tokens in the corpus, in sorted order.
unique_tokens = set(corpus_all_tokens)  # deduplicate
corpus_lexicon = sorted(unique_tokens)
corpus_lexicon

['basketball', 'brasil', 'china', 'football', 'ping-pong', 'popular', 'usa']

## 3. bag of words model

In [18]:
# Template vector: one entry per lexicon token (in lexicon order), all set to 0.
zero_vector = OrderedDict.fromkeys(corpus_lexicon, 0)
zero_vector

OrderedDict([('basketball', 0),
             ('brasil', 0),
             ('china', 0),
             ('football', 0),
             ('ping-pong', 0),
             ('popular', 0),
             ('usa', 0)])

In [20]:
# Build one bag-of-words vector per document and collect them in corpus_vectors.
corpus_vectors = []

for doc_tokens in corpus_tokens_wo_stop_word:
    # Start from a fresh copy of the all-zero template so that the documents
    # do not share (and overwrite) the same vector.
    bow_vector = zero_vector.copy()

    # Count how often each token occurs in this document and store the counts.
    bow_vector.update(Counter(doc_tokens))

    corpus_vectors.append(bow_vector)

corpus_vectors

[OrderedDict([('basketball', 0),
              ('brasil', 1),
              ('china', 0),
              ('football', 1),
              ('ping-pong', 0),
              ('popular', 1),
              ('usa', 0)]),
 OrderedDict([('basketball', 1),
              ('brasil', 0),
              ('china', 0),
              ('football', 0),
              ('ping-pong', 0),
              ('popular', 1),
              ('usa', 1)]),
 OrderedDict([('basketball', 0),
              ('brasil', 0),
              ('china', 1),
              ('football', 0),
              ('ping-pong', 1),
              ('popular', 1),
              ('usa', 0)])]

我们可以看到，bag of words model 最后出来的是下面一个matrix

|    BOW     | basketball | brasil | china | football | ping-pong | popular | usa |
|------------|------------|--------|-------|----------|-----------|---------|-----|
| document 1 |      0     |    1   |   0   |     1    |     0     |    1    |  0  |
| document 2 |      1     |    0   |   0   |     0    |     0     |    1    |  1  |
| document 3 |      0     |    0   |   1   |     0    |     1     |    1    |  0  |

## 4. TF-IDF

下面，我们介绍TF-IDF.


### 4.1 Term Frequency (TF)

#### 计算公式
for a given term, **t**, in a given document, **d**, in a corpus, **D**, you get:

$tf(t, d) = \frac{count(t)}{count(d)}$

每个token 的计数除以该document 中所有token 的个数。

根据上述公式，我们可以得出以下计算结果：

|    TF      | basketball | brasil | china | football | ping-pong | popular | usa |
|------------|------------|--------|-------|----------|-----------|---------|-----|
| document 1 |      0     |   1/3  |   0   |    1/3   |     0     |   1/3   |  0  |
| document 2 |     1/3    |    0   |   0   |     0    |     0     |   1/3   | 1/3 |
| document 3 |      0     |    0   |  1/3  |     0    |    1/3    |   1/3   |  0  |

In [22]:
# Term frequency: each token's count divided by the number of tokens in its document.
corpus_tf = []

for doc_tokens in corpus_tokens_wo_stop_word:
    n_tokens = len(doc_tokens)

    # Fresh copy of the all-zero template; tokens absent from this document stay 0.
    tf_vector = zero_vector.copy()

    # Count the occurrences of each token, then normalise by the document length.
    for token, occurrences in Counter(doc_tokens).items():
        tf_vector[token] = occurrences / n_tokens

    corpus_tf.append(tf_vector)

corpus_tf

[OrderedDict([('basketball', 0),
              ('brasil', 0.3333333333333333),
              ('china', 0),
              ('football', 0.3333333333333333),
              ('ping-pong', 0),
              ('popular', 0.3333333333333333),
              ('usa', 0)]),
 OrderedDict([('basketball', 0.3333333333333333),
              ('brasil', 0),
              ('china', 0),
              ('football', 0),
              ('ping-pong', 0),
              ('popular', 0.3333333333333333),
              ('usa', 0.3333333333333333)]),
 OrderedDict([('basketball', 0),
              ('brasil', 0),
              ('china', 0.3333333333333333),
              ('football', 0),
              ('ping-pong', 0.3333333333333333),
              ('popular', 0.3333333333333333),
              ('usa', 0)])]

### 4.2 Inverse Document Frequency (IDF)

#### 计算公式

for a given term, **t**, in a given document, **d**, in a corpus, **D**, you get:

$idf(t, D) = \log_{10} \frac{number\:of\:documents}{number\:of\:documents\:containing\:t}$

所有document 的个数除以包含某个token 的documents 的个数，再取对数（本例中以10 为底）。

根据上述公式，我们可以得出以下计算结果：


| token      | IDF           |
|------------|---------------|
| basketball | log(3/1) = log3 = 0.477 |
| brasil     | log(3/1) = log3 = 0.477 |
| china      | log(3/1) = log3 = 0.477 |
| football   | log(3/1) = log3 = 0.477 |
| ping-pong  | log(3/1) = log3 = 0.477 |
| popular    | log(3/3) = 0     |
| usa        | log(3/1) = log3 = 0.477 |

In [66]:
# Inverse document frequency per token:
# log10(total number of documents / number of documents containing the token).
corpus_idf = {}
total_document = len(corpus)

for token in corpus_lexicon:
    # Document frequency: in how many documents does this token occur?
    count = sum(1 for doc_tokens in corpus_tokens_wo_stop_word if token in doc_tokens)

    corpus_idf[token] = log(total_document / count, 10)  # note: base-10 logarithm
    # Smoothed alternative (add one to both counts, then add one to the log), left disabled:
    # corpus_idf[token] = log(((1 + total_document) / (1 + count)), 10) + 1

In [67]:
# Show the IDF value computed for each lexicon token.
corpus_idf

{'basketball': 0.47712125471966244,
 'brasil': 0.47712125471966244,
 'china': 0.47712125471966244,
 'football': 0.47712125471966244,
 'ping-pong': 0.47712125471966244,
 'popular': 0.0,
 'usa': 0.47712125471966244}

### 4.3 TF-IDF

$tf\_idf(t, d, D) = tf(t, d) * idf(t, D)$

根据上述公式，我们可以得出以下计算结果：


|   TF-IDF   | basketball | brasil  | china | football | ping-pong | popular | usa |
|------------|------------|---------|-------|----------|-----------|---------|-----|
| document 1 |      0     | 1/3*log3|   0   | 1/3*log3 |     0     |   1/3*0 |  0  |
| document 2 |  1/3*log3  |    0    |   0   |     0    |     0     |   1/3*0 | 1/3*log3 |
| document 3 |      0     |    0    |1/3*log3|     0   |  1/3*log3 |   1/3*0 |  0  |

#### 最终结果

|   TF-IDF   | basketball | brasil | china | football | ping-pong | popular | usa |
|------------|------------|--------|-------|----------|-----------|---------|-----|
| document 1 |      0     |   0.16 |   0   |    0.16  |     0     |    0    |  0  |
| document 2 |      0.16  |    0   |   0   |     0    |     0     |    0    | 0.16|
| document 3 |      0     |    0   |  0.16 |     0    |    0.16   |    0    |  0  |

In [68]:
# TF-IDF: multiply every term frequency by the IDF of the same token.
# Each document keeps the key order of its TF vector.
tf_idf = [
    OrderedDict(
        (token, frequency * corpus_idf[token])
        for token, frequency in doc_tf.items()
    )
    for doc_tf in corpus_tf
]

tf_idf

[OrderedDict([('basketball', 0.0),
              ('brasil', 0.15904041823988746),
              ('china', 0.0),
              ('football', 0.15904041823988746),
              ('ping-pong', 0.0),
              ('popular', 0.0),
              ('usa', 0.0)]),
 OrderedDict([('basketball', 0.15904041823988746),
              ('brasil', 0.0),
              ('china', 0.0),
              ('football', 0.0),
              ('ping-pong', 0.0),
              ('popular', 0.0),
              ('usa', 0.15904041823988746)]),
 OrderedDict([('basketball', 0.0),
              ('brasil', 0.0),
              ('china', 0.15904041823988746),
              ('football', 0.0),
              ('ping-pong', 0.15904041823988746),
              ('popular', 0.0),
              ('usa', 0.0)])]

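## 5. 使用sklearn

注意：sklearn 的结果与上面手工计算的数值不同。`TfidfVectorizer` 默认使用平滑的自然对数IDF，即 $idf(t) = \ln \frac{1 + n}{1 + df(t)} + 1$，并对每个document 的向量做L2 归一化。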



In [59]:
# Repeat the pipeline with scikit-learn, configured with the same kinds of
# preprocessing as the manual steps: Treebank tokenizer, lowercasing and
# English stop-word removal.
# NOTE: stop_words='english' selects scikit-learn's built-in stop-word list,
# not the NLTK list loaded earlier in this notebook.
tokenizer = TreebankWordTokenizer()

vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    token_pattern=None,  # ignored when a custom tokenizer is given; None avoids the warning
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the raw corpus and return the
# TF-IDF matrix (one row per document) as a sparse matrix.
model = vectorizer.fit_transform(corpus)

# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix.
print(model.toarray().round(2))

[[0.   0.65 0.   0.65 0.   0.39 0.  ]
 [0.65 0.   0.   0.   0.   0.39 0.65]
 [0.   0.   0.65 0.   0.65 0.39 0.  ]]


In [52]:
# Column labels of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) turns the returned array into a plain list, like the old method gave.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['basketball', 'brasil', 'china', 'football', 'ping-pong', 'popular', 'usa']