## 作業目標：搭建一個 bag of words 模型

---

In [32]:
import pandas as pd
import nltk
# nltk.download()
import numpy as np

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
corpus = dataset['Review'].values

> **read_csv: parameter: quoting=3**  
  CSV can just use a comma to separate fields, but if you have a field with a comma in it, to avoid that becoming two fields, the whole field needs to be enclosed, usually with double quotes. Some CSV dialects enclose every field in double quotes but that wastes space.  
  The options for quoting are QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3)  
  quoting=0 means minimal quoting so only a field that contains a separator will be enclosed. It will look something like this.  
  - [pandas.read_csv document](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
  - [Quoting parameter in pandas read_csv()](https://stackoverflow.com/questions/43344241/quoting-parameter-in-pandas-read-csv)
  - [例子解释，pandas的pd.read_csv函数，quoting = 3是什么意思](https://blog.csdn.net/sinat_39416814/article/details/105579201)

### 從文本中取出所有單字

In [33]:
whole_words = []
for sentence in corpus:
    tokenized_sentence = nltk.word_tokenize(sentence)
    for word in tokenized_sentence:
        whole_words.append(word)

In [None]:
# ##### SOLUTION #####
# whole_words = []
# for sentence in corpus:
#     tokenized_sentence = nltk.word_tokenize(sentence)
#     for word in tokenized_sentence:
#         whole_words.append(word)

### 移除重複單字

In [34]:
whole_words = set(whole_words)
print('共有 {} 個單字'.format(len(whole_words)))

共有 2351 個單字


In [12]:
# ##### SOLUTION #####
# whole_words = set(whole_words)
# print('共有{}個單字'.format(len(whole_words)))

共有2356個單字


### 建立字典使每一個單字有對應數值

In [22]:
whole_words

['banana',
 'vomited',
 'to',
 'tea',
 'relax',
 'Classy/warm',
 'before',
 'fry',
 '70+',
 'pink',
 'mary',
 'dead',
 'Third',
 'middle',
 'juice',
 'reviewer',
 'recommendation',
 'Indian',
 'Mushroom',
 'horrible',
 'breakfast',
 'email',
 'thrilled',
 'being',
 'break',
 'maybe',
 'hoping',
 'Burger',
 'solidify',
 'BARGAIN',
 'Hands',
 'street',
 'crowds',
 'awkwardly',
 'Salad',
 'exquisite',
 'self',
 'myself',
 'mein',
 'coffee',
 'needed',
 'TV',
 'rather',
 'compliments',
 'playing',
 'fries',
 'dude',
 'though',
 'ask',
 'effort',
 'mistake',
 'Noca',
 'homemade',
 'struck',
 'wasting',
 'waiter',
 'Pita',
 'reheated',
 'first',
 'drunk',
 'overhaul',
 'style',
 'nutshell',
 'completely',
 'under-services',
 'chefs',
 'such',
 'blows',
 'avoid',
 'hit',
 'Hunan',
 'staying',
 'vain',
 'rest',
 'food',
 'deliciously',
 'disgusting',
 'use',
 'saying',
 'do',
 'fish',
 '3/4ths',
 'another',
 'possible',
 'almonds',
 'value',
 'cover',
 'pay',
 'Firehouse',
 'Do',
 'why',
 'Ser

In [35]:
word_index = {}
index_word = {}
n = 0
for word in whole_words:
    index_word[n] = word
    word_index[word] = n
    n += 1

In [None]:
# ##### SOLUTION #####
# word_index = {}
# index_word = {}
# n = 0
# for word in whole_words:
#     word_index[word] = n 
#     index_word[n] = word
#     n+=1
    

In [36]:
word_index

{'banana': 0,
 'vomited': 1,
 'to': 2,
 'tea': 3,
 'relax': 4,
 'Classy/warm': 5,
 'before': 6,
 'fry': 7,
 '70+': 8,
 'pink': 9,
 'mary': 10,
 'dead': 11,
 'Third': 12,
 'middle': 13,
 'juice': 14,
 'reviewer': 15,
 'recommendation': 16,
 'Indian': 17,
 'Mushroom': 18,
 'horrible': 19,
 'breakfast': 20,
 'email': 21,
 'thrilled': 22,
 'being': 23,
 'break': 24,
 'maybe': 25,
 'hoping': 26,
 'Burger': 27,
 'solidify': 28,
 'BARGAIN': 29,
 'Hands': 30,
 'street': 31,
 'crowds': 32,
 'awkwardly': 33,
 'Salad': 34,
 'exquisite': 35,
 'self': 36,
 'myself': 37,
 'mein': 38,
 'coffee': 39,
 'needed': 40,
 'TV': 41,
 'rather': 42,
 'compliments': 43,
 'playing': 44,
 'fries': 45,
 'dude': 46,
 'though': 47,
 'ask': 48,
 'effort': 49,
 'mistake': 50,
 'Noca': 51,
 'homemade': 52,
 'struck': 53,
 'wasting': 54,
 'waiter': 55,
 'Pita': 56,
 'reheated': 57,
 'first': 58,
 'drunk': 59,
 'overhaul': 60,
 'style': 61,
 'nutshell': 62,
 'completely': 63,
 'under-services': 64,
 'chefs': 65,
 'such':

In [26]:
index_word

{0: 'banana',
 1: 'vomited',
 2: 'to',
 3: 'tea',
 4: 'relax',
 5: 'Classy/warm',
 6: 'before',
 7: 'fry',
 8: '70+',
 9: 'pink',
 10: 'mary',
 11: 'dead',
 12: 'Third',
 13: 'middle',
 14: 'juice',
 15: 'reviewer',
 16: 'recommendation',
 17: 'Indian',
 18: 'Mushroom',
 19: 'horrible',
 20: 'breakfast',
 21: 'email',
 22: 'thrilled',
 23: 'being',
 24: 'break',
 25: 'maybe',
 26: 'hoping',
 27: 'Burger',
 28: 'solidify',
 29: 'BARGAIN',
 30: 'Hands',
 31: 'street',
 32: 'crowds',
 33: 'awkwardly',
 34: 'Salad',
 35: 'exquisite',
 36: 'self',
 37: 'myself',
 38: 'mein',
 39: 'coffee',
 40: 'needed',
 41: 'TV',
 42: 'rather',
 43: 'compliments',
 44: 'playing',
 45: 'fries',
 46: 'dude',
 47: 'though',
 48: 'ask',
 49: 'effort',
 50: 'mistake',
 51: 'Noca',
 52: 'homemade',
 53: 'struck',
 54: 'wasting',
 55: 'waiter',
 56: 'Pita',
 57: 'reheated',
 58: 'first',
 59: 'drunk',
 60: 'overhaul',
 61: 'style',
 62: 'nutshell',
 63: 'completely',
 64: 'under-services',
 65: 'chefs',
 66: 'su

## 轉換句子為bag of words型式

In [37]:
def _get_bag_of_words_vector(sentence, word_index_dic, whole_words):
    sentence = sentence
    vector = np.zeros(len(whole_words))
    for word in nltk.word_tokenize(sentence):
        if word in whole_words:
            vector[word_index_dic[word]] += 1
    return vector


In [None]:
# ##### SOLUTION #####
# def _get_bag_of_words_vector(sentence, word_index_dic, whole_words):
#     sentence = sentence
#     vector = np.zeros(len(whole_words))
#     for word in nltk.word_tokenize(sentence):
#         if word in whole_words:
#             vector[word_index[word]]+=1
#     return vector


In [38]:
_get_bag_of_words_vector('Wow... Loved this place.', word_index, whole_words)

array([0., 0., 0., ..., 0., 0., 0.])