## Import thư viện

In [1]:
%matplotlib inline

# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy as np

# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# random, itertools, matplotlib
import random
import itertools
import matplotlib.pyplot as plt



## Class đọc từng dòng trong file text
Mỗi dòng lúc này được xem như một paragraph

In [2]:
class LabeledLineSentence(object):
    """Corpus reader that treats every line of every source file as one document.

    Parameters
    ----------
    sources : dict
        Maps a file path to the prefix used to tag that file's documents.
        Line ``i`` of a file with prefix ``P`` is tagged ``'P_i'``.

    Raises
    ------
    ValueError
        If two paths share the same prefix. ``ValueError`` is a subclass of
        ``Exception``, so handlers written for the previous bare ``Exception``
        still catch it.
    """

    def __init__(self, sources):
        self.sources = sources
        # Populated by to_array(); None means the corpus has not been loaded yet.
        self.sentences = None

        # Prefixes become part of document tags, so each one must be unique.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one LabeledSentence per line, streaming each source file in turn."""
        # NOTE(review): utils.smart_open comes from the installed gensim release;
        # confirm it is still available if gensim is upgraded.
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source into memory, cache the list on ``self.sentences`` and return it."""
        # Reuse __iter__ so the streaming and in-memory views can never drift apart.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached documents.

        The corpus is loaded on first use, so calling this before to_array()
        no longer fails with AttributeError. The cached list itself is left
        in its original order.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc toàn bộ nội dung trong file text
Mỗi file text lúc này được xem như một paragraph
(Class này trùng tên với class ở trên; cell nào chạy sau sẽ ghi đè định nghĩa trước, vì vậy chỉ chạy một trong hai cell.)

In [2]:
class LabeledLineSentence(object):
    """Corpus reader that treats the whole content of each source file as one document.

    Parameters
    ----------
    sources : dict
        Maps a file path to the prefix used to tag that file's document.
        The ``n``-th file (in dict iteration order, counting from 0) with
        prefix ``P`` is tagged ``'P_n'``.

    Raises
    ------
    ValueError
        If two paths share the same prefix. ``ValueError`` is a subclass of
        ``Exception``, so handlers written for the previous bare ``Exception``
        still catch it.
    """

    def __init__(self, sources):
        self.sources = sources
        # Populated by to_array(); None means the corpus has not been loaded yet.
        self.sentences = None

        # Prefixes become part of document tags, so each one must be unique.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one LabeledSentence per source file.

        Iteration now produces the same documents and tags as to_array();
        previously it yielded one document per line, unlike to_array().
        """
        for item_no, (source, prefix) in enumerate(self.sources.items()):
            with open(source, 'r', encoding="utf8") as fin:
                # split() with no argument breaks on any whitespace, newlines
                # included, so words on adjacent lines stay separate tokens
                # (stripping '\n' first used to fuse them together).
                words = fin.read().split()
            yield LabeledSentence(words, [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source into memory, cache the list on ``self.sentences`` and return it."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached documents.

        The corpus is loaded on first use, so calling this before to_array()
        no longer fails with AttributeError. The cached list itself is left
        in its original order.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

### Xét đường dẫn tới dataset

In [3]:
# Full movie-review corpus, currently disabled in favour of the small sample below:
# sources = {
#     'data/movie reviews/test-neg.txt':'TEST_NEG',
#     'data/movie reviews/test-pos.txt':'TEST_POS',
#     'data/movie reviews/train-neg.txt':'TRAIN_NEG',
#     'data/movie reviews/train-pos.txt':'TRAIN_POS',
# }

# Maps each input file path to the unique prefix used when tagging its documents.
sources = {
    'data/movie reviews/sample.txt': 'SAMPLE',
}

# Corpus wrapper around the configured files; nothing is read from disk
# until the corpus is iterated or to_array() is called.
sentences = LabeledLineSentence(sources)

### Xét tham số cho model, build vocabulary.

In [7]:
# Doc2Vec hyper-parameters, one per line so each can be tuned on its own.
model = Doc2Vec(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=7,
)

# Load the tagged documents into memory and build the model vocabulary from them.
model.build_vocab(sentences.to_array())



### Train model

In [8]:
import timeit

# Time one training pass (epochs=1) over a shuffled copy of the loaded documents.
start = timeit.default_timer()
model.train(
    sentences.sentences_perm(),
    total_examples=model.corpus_count,
    epochs=1,
)
stop = timeit.default_timer()

# Elapsed wall-clock time in seconds.
print(stop - start)

0.11624063851482092


### Tìm các từ tương tự với một từ được chọn

In [10]:
# Query word similarities through the model's word vectors (model.wv).
# The recorded output of the model-level call begins with the tail of a
# warning, presumably gensim's deprecation of Doc2Vec.most_similar.
model.wv.most_similar('hồn')

  """Entry point for launching an IPython kernel.


[('và', 0.2804310917854309),
 ('gì,', 0.22228646278381348),
 ('trong', 0.20173762738704681),
 ('giúp', 0.200840026140213),
 ('Sĩ,', 0.1889793574810028),
 ('đến', 0.17195986211299896),
 ('đạo,', 0.16080373525619507),
 ('tại', 0.15913458168506622),
 ('trưng', 0.15474288165569305),
 ('đáo', 0.14574363827705383)]

### Xuất ra vector của doc đã train dựa vào tag của doc đó

In [12]:
# Look up the trained vector of the document tagged 'SAMPLE_0'
# (tags are built as prefix + '_' + index; 'SAMPLE' is the prefix configured in `sources`).
model.docvecs['SAMPLE_0']

array([ 1.9180225e-03, -4.8305378e-03, -1.9694823e-03, -3.5324236e-03,
       -4.8138779e-03,  2.0420922e-03,  1.3339991e-04, -1.3350544e-03,
        4.7775670e-03, -2.3314650e-03,  2.9663115e-03,  3.1428877e-03,
        3.8340779e-03, -2.1167246e-03,  2.8462938e-04, -8.0698094e-04,
        9.6346112e-04, -4.2817262e-03, -7.8171914e-05,  4.2274781e-04,
       -2.2594433e-03,  2.3017626e-03,  4.4055484e-04, -9.9025329e-04,
        5.1176641e-04,  9.7382365e-04, -3.0314591e-04,  3.4633272e-03,
        2.6948685e-03, -4.9898350e-03,  2.2530193e-03, -4.6786955e-03,
        4.4693165e-03, -2.1491171e-04, -3.7276214e-03,  1.5188266e-03,
       -4.4356044e-03, -4.4513675e-03, -5.8987818e-04, -1.7631575e-03,
        7.4141164e-04,  1.3792495e-03,  4.6116393e-03,  2.8618886e-03,
       -2.6478977e-03, -3.0130174e-03, -1.7630759e-03,  3.4416362e-03,
       -2.2416282e-03, -4.9265227e-03,  2.1799877e-03,  3.8180896e-03,
        1.3784419e-03, -3.2269405e-03, -3.9294125e-03,  1.1749561e-03,
      

In [15]:
from nltk.tokenize import word_tokenize
import nltk
# Download the 'punkt' tokenizer package; per the recorded output it is
# unzipped into the user's nltk_data directory.
# NOTE(review): presumably required by word_tokenize — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [16]:
# Tokenize a lowercased query sentence for inference.
# NOTE(review): the training corpus is tokenized with a plain str.split() and is
# not lowercased, whereas this query is lowercased and passed through
# word_tokenize; the resulting tokens may not match the trained vocabulary —
# confirm the two preprocessing paths are meant to differ.
test_data = word_tokenize("Chiều nay ăn gì".lower())

In [17]:
# Display the token list produced for the query sentence.
test_data

['chiều', 'nay', 'ăn', 'gì']

In [18]:
# Infer a vector for the query tokens with the trained model.
v1 = model.infer_vector(test_data)
# Display the inferred vector.
v1

array([ 0.00275694,  0.00495526,  0.00199558, -0.00445509, -0.00102917,
       -0.00241156,  0.0004759 , -0.00012678, -0.00431482, -0.0036441 ,
        0.00258672,  0.00284997, -0.00460165,  0.00104439, -0.00312406,
       -0.00389866, -0.00148891,  0.00376602, -0.00413713,  0.00251991,
       -0.00321384, -0.00280984, -0.00160691,  0.00252372, -0.00105392,
       -0.00226692, -0.00375468, -0.00385992, -0.0040242 , -0.00160024,
       -0.00307054,  0.00012227, -0.00177097,  0.00282791,  0.00484096,
        0.00060721,  0.00101379, -0.00135691, -0.00067399,  0.00388202,
        0.00440652,  0.00448975, -0.0023262 ,  0.00370821, -0.00440187,
       -0.0039254 , -0.00063182, -0.00470547, -0.00343034,  0.00153152,
        0.00468326,  0.00431575, -0.00498626, -0.0007908 , -0.00456093,
        0.00463771,  0.00061969,  0.00096   , -0.00073173, -0.00053932,
        0.00431567,  0.00078708,  0.00277827, -0.00466654, -0.00116137,
        0.00257774, -0.00341603, -0.00271889,  0.00428807, -0.00

In [21]:
# Number of document vectors held by the model; the recorded output is 1,
# consistent with the single entry configured in `sources`.
model.docvecs.count

1