## 運用sklearn API實現 bag of words

In [96]:
import pandas as pd
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the reviews are kept as literal text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [97]:
# Raw review texts as an array; this is the corpus the vectorizer is fitted on.
corpus = dataset['Review'].values

## 轉bag-of-words vector

In [98]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words model.
# max_features caps the number of columns: terms are ranked by how often they
# occur in the corpus, and only the 1000 most frequent ones are kept.
cv = CountVectorizer(max_features=1000)

# fit_transform learns the vocabulary and returns the term-count matrix;
# toarray() turns it into a regular array. Most entries of X are zero.
X = cv.fit_transform(corpus).toarray()

# Take the column at position 1 of the dataset as y
# (presumably the sentiment label -- confirm against the TSV header).
y = dataset.iloc[:, 1].values

In [99]:
# Report the dimensions of the bag-of-words matrix (rows, columns).
print(f'X shape : {X.shape}')

X shape : (1000, 1000)


* 查看每個單字相對應的index

In [100]:
# List the vocabulary in column order: the first word maps to index 0, the
# second to index 1, and so on.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps the output a plain
# Python list of strings, as before.
cv.get_feature_names_out().tolist()

['10',
 '100',
 '12',
 '20',
 '30',
 '35',
 '40',
 'about',
 'above',
 'absolutely',
 'acknowledged',
 'actually',
 'added',
 'after',
 'again',
 'ago',
 'all',
 'almost',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'ambiance',
 'ambience',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'appetizers',
 'are',
 'area',
 'aren',
 'around',
 'arrived',
 'as',
 'ask',
 'asked',
 'assure',
 'at',
 'ate',
 'atmosphere',
 'attack',
 'attentive',
 'attitudes',
 'authentic',
 'average',
 'avoid',
 'away',
 'awesome',
 'awful',
 'baby',
 'bachi',
 'back',
 'bacon',
 'bad',
 'bagels',
 'bakery',
 'bar',
 'barely',
 'bars',
 'bartender',
 'basically',
 'bathroom',
 'bathrooms',
 'batter',
 'bay',
 'be',
 'beans',
 'beat',
 'beautiful',
 'because',
 'become',
 'beef',
 'been',
 'beer',
 'before',
 'behind',
 'being',
 'believe',
 'belly',
 'best',
 'better',
 'between',
 'beyond',
 'big',
 'bill',
 'biscuits',
 'bisque',
 'bit',
 'bitches',
 'b

* 上方已經 fit 過 corpus，這裡只需 transform 就可以轉換其他句子

In [101]:
# Vectorize the first vocabulary word with the already-fitted vectorizer; the
# resulting row has a 1 in column 0.
# get_feature_names() was removed in scikit-learn 1.2, so the vocabulary is
# read through get_feature_names_out() instead.
first_word = cv.get_feature_names_out()[0]
cv.transform([first_word]).toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [102]:
# Vectorize a new sentence using the vocabulary learned from the corpus
# (transform only; the vectorizer is not re-fitted).
sample_sentences = ['This is so good']
cv.transform(sample_sentences).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 