## Sumber
https://github.com/choirudinemcha/1001DigitalTalents/blob/master/SA.ipynb

In [1]:
# Toy corpus: five short sentences used to demonstrate bag-of-words vectorization.
corpus = [
    'I saw the saw.',
    'I saw her standing there.',
    'Ofcourse I give her an umbrella.',
    'But she give me the saw.',
    'I come but she run.',
]

In [2]:
# import and instantiate CountVectorizer (with the default parameters)
# The defaults lowercase the text and keep only tokens of two or more word
# characters, which is why the single-letter word 'I' never shows up in the
# fitted vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [3]:
# learn the vocabulary of the corpus; fit() hands back the vectorizer itself,
# so the cell displays its repr
vect.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [4]:

# examine the fitted vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the displayed value a plain Python list of terms.
vect.get_feature_names_out().tolist()

['an',
 'but',
 'come',
 'give',
 'her',
 'me',
 'ofcourse',
 'run',
 'saw',
 'she',
 'standing',
 'the',
 'there',
 'umbrella']

In [5]:
# transform training data into a 'document-term matrix'
# (one row per sentence in `corpus`, one column per vocabulary term)
simple_train_dtm = vect.transform(corpus)

In [8]:
# show the sparse matrix summary: shape, dtype and number of stored (non-zero) entries
simple_train_dtm

<5x14 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [6]:
# convert sparse matrix to a dense matrix
# each row is a sentence, each column the count of one vocabulary term
# (columns follow the order of the fitted vocabulary)
simple_train_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [7]:
import pandas as pd
# examine the vocabulary and document-term matrix together:
# one row per sentence, one labelled column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,an,but,come,give,her,me,ofcourse,run,saw,she,standing,the,there,umbrella
0,0,0,0,0,0,0,0,0,2,0,0,1,0,0
1,0,0,0,0,1,0,0,0,1,0,1,0,1,0
2,1,0,0,1,1,0,1,0,0,0,0,0,0,1
3,0,1,0,1,0,1,0,0,1,1,0,1,0,0
4,0,1,1,0,0,0,0,1,0,1,0,0,0,0


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF variant: learn the vocabulary and build the weighted document-term
# matrix for the corpus in one step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert the returned array to a list so the printed form stays a Python list
print(vectorizer.get_feature_names_out().tolist())

['an', 'but', 'come', 'give', 'her', 'me', 'ofcourse', 'run', 'saw', 'she', 'standing', 'the', 'there', 'umbrella']


In [10]:
# TF-IDF weights, one row per sentence and one labelled column per term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,an,but,come,give,her,me,ofcourse,run,saw,she,standing,the,there,umbrella
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.856606,0.0,0.0,0.515971,0.0,0.0
1,0.0,0.0,0.0,0.0,0.45827,0.0,0.0,0.0,0.380406,0.0,0.568014,0.0,0.568014,0.0
2,0.48214,0.0,0.0,0.388988,0.388988,0.0,0.48214,0.0,0.0,0.0,0.0,0.0,0.0,0.48214
3,0.0,0.400791,0.0,0.400791,0.0,0.49677,0.0,0.0,0.332693,0.400791,0.0,0.400791,0.0,0.0
4,0.0,0.444002,0.550329,0.0,0.0,0.0,0.0,0.550329,0.0,0.444002,0.0,0.0,0.0,0.0


In [11]:
# example text for model testing: a single new sentence that is not part of the corpus
simple_test = ["i give the saw"]

In [12]:
# transform testing data into a document-term matrix (using existing vocabulary)
# only transform() is called here, so `vect` is not refitted on the test text
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [13]:
# examine the vocabulary and document-term matrix together
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,an,but,come,give,her,me,ofcourse,run,saw,she,standing,the,there,umbrella
0,0,0,0,1,0,0,0,0,1,0,0,1,0,0


In [14]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split



In [15]:
# Labelled version of the toy corpus: one record per sentence, with the
# sentiment kept as a string label ('-1', '0' or '1').
# NOTE: this rebinds `corpus`, which until now held a list of plain strings.
corpus = [
    {'text': 'I saw the saw.', 'sentiment': '0'},
    {'text': 'I saw her standing there.', 'sentiment': '1'},
    {'text': 'Ofcourse I give her an umbrella.', 'sentiment': '1'},
    {'text': 'But she give me the saw.', 'sentiment': '-1'},
    {'text': 'I come but she run.', 'sentiment': '-1'},
]

In [16]:
# build a DataFrame from the list of records: one row per sentence,
# with a 'text' column and a 'sentiment' column
news_corpus = pd.DataFrame(corpus)

In [17]:
# separate the raw sentences (model input) from their sentiment labels (target)
X = news_corpus['text']
y = news_corpus['sentiment']
# both are one-dimensional, with one entry per sentence
for series in (X, y):
    print(series.shape)

(5,)
(5,)


In [18]:
# Hold out part of the data for testing. With 5 sentences and test_size=0.1
# this leaves 4 for training and 1 for testing (see the printed shapes).
# random_state pins the shuffle so that Restart & Run All produces the same
# split every time; without it each run holds out a different sentence and
# the evaluation cells below give different results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4,)
(1,)
(4,)
(1,)


In [19]:
# display the full labelled data set (text and sentiment for every sentence)
news_corpus

Unnamed: 0,sentiment,text
0,0,I saw the saw.
1,1,I saw her standing there.
2,1,Ofcourse I give her an umbrella.
3,-1,But she give me the saw.
4,-1,I come but she run.


In [20]:
# refit the vectorizer on the training sentences only and build their
# document-term matrix in the same call; this replaces the vocabulary that
# `vect` learned from the full corpus earlier
X_train_dtm = vect.fit_transform(X_train)

In [21]:
# vectorize the held-out sentence with the vocabulary learned from X_train
# (transform only, so the vectorizer is not refitted on test data)
X_test_dtm = vect.transform(X_test)

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with its default settings
nb = MultinomialNB()

In [23]:
# train the classifier on the training document-term matrix and its labels;
# the %time magic reports how long the fit took
%time nb.fit(X_train_dtm, y_train)

Wall time: 976 µs


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
# predict the sentiment label of the held-out sentence
y_pred_class = nb.predict(X_test_dtm)

In [25]:
from sklearn import metrics
# share of test sentences whose predicted label matches the true label;
# the test split holds a single sentence, so this can only be 0.0 or 1.0
metrics.accuracy_score(y_test, y_pred_class)

0.0

In [26]:
# confusion matrix of true vs. predicted labels for the held-out sentence
# NOTE(review): the recorded result is 2x2 although the data has three
# sentiment labels, presumably because only labels occurring in y_test or
# y_pred_class are included. Confirm against the scikit-learn docs.
metrics.confusion_matrix(y_test, y_pred_class)

array([[0, 0],
       [1, 0]], dtype=int64)

In [27]:
# show which sentence ended up in the test split
X_test

2    Ofcourse I give her an umbrella.
Name: text, dtype: object

In [32]:
# classify a new, hand-written sentence: vectorize it with the vocabulary
# learned from X_train, then ask the trained model for its sentiment label
test1 = ['I saw the run']
new_article_vect = vect.transform(test1)
nb.predict(new_article_vect)

array(['0'], dtype='<U2')