## Import thư viện

In [None]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

#glob
import glob

# numpy
import numpy as np

#pandas
import pandas as pd

# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# random, itertools, matplotlib
import random
import itertools
import matplotlib.pyplot as plt

## Class đọc từng dòng trong file text
Mỗi dòng lúc này được xem như một paragraph

In [None]:
class LabeledLineSentence(object):
    """Corpus reader that turns every line of every source file into one document.

    ``sources`` maps a file path to a tag prefix. Line ``i`` (zero-based) of a
    file is whitespace-tokenised and tagged ``'<prefix>_<i>'``.
    """

    def __init__(self, sources):
        """Store ``sources`` and check that no tag prefix is used twice.

        Args:
            sources: mapping of file path -> tag prefix.

        Raises:
            ValueError: if two paths share the same prefix (their documents
                would otherwise end up with colliding tags).
        """
        self.sources = sources
        # Start with an empty cache so sentences_perm() is usable before
        # to_array() has been called.
        self.sentences = []

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, reading the files lazily."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source, cache the documents on ``self.sentences`` and return them."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached documents; the cache itself is not reordered."""
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc toàn bộ nội dung trong file text
Mỗi file text lúc này được xem như một paragraph
(tên class này giống hệt tên class trên, chọn 1 trong 2 class thôi)

In [None]:
class LabeledLineSentence(object):
    """Corpus reader that turns the whole content of each source file into one document.

    ``sources`` maps a file path to a tag prefix. The file at zero-based
    position ``n`` in ``sources`` is whitespace-tokenised as a single document
    and tagged ``'<prefix>_<n>'``.
    """

    def __init__(self, sources):
        """Store ``sources`` and check that no tag prefix is used twice.

        Args:
            sources: mapping of file path -> tag prefix.

        Raises:
            ValueError: if two paths share the same prefix.
        """
        self.sources = sources
        # Start with an empty cache so sentences_perm() is usable before
        # to_array() has been called.
        self.sentences = []

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per source file (files are opened as UTF-8 text)."""
        for item_no, (source, prefix) in enumerate(self.sources.items()):
            with open(source, 'r', encoding="utf8") as fin:
                # split() already treats newlines as separators; deleting them
                # first would fuse the last word of a line with the first word
                # of the next one.
                words = fin.read().split()
            yield TaggedDocument(words, [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source, cache the documents on ``self.sentences`` and return them."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached documents; the cache itself is not reordered."""
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc từng dòng 'content' trong file csv

In [None]:
class LabeledContent(object):
    """Corpus reader that turns each non-null ``content`` cell of CSV files into a document.

    ``sources`` maps a directory path to a tag prefix. Every ``*.csv`` file
    directly inside a directory is read with pandas; each remaining value of
    its ``content`` column is whitespace-tokenised and tagged
    ``'<prefix>_<n>'``, where ``n`` is a counter shared across all files and
    all sources.
    """

    def __init__(self, sources):
        """Store ``sources`` and check that no tag prefix is used twice.

        Args:
            sources: mapping of directory path -> tag prefix.

        Raises:
            ValueError: if two directories share the same prefix.
        """
        self.sources = sources
        # Start with an empty cache so sentences_perm() is usable before
        # to_array() has been called.
        self.sentences = []

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def to_array(self):
        """Load all documents, cache them on ``self.sentences`` and return the list.

        Prints the number of non-null ``content`` rows for each CSV file read.
        """
        self.sentences = []
        item_no = 0
        for source, prefix in self.sources.items():
            # glob returns paths in arbitrary order; sort them so the
            # positional tags are the same on every run and every machine.
            for file_ in sorted(glob.glob(source + "/*.csv")):
                content = pd.read_csv(file_)['content'].dropna()
                print('number documents: ', len(content))
                for row in content:
                    self.sentences.append(TaggedDocument(utils.to_unicode(row).split(), [prefix + '_%s' % item_no]))
                    item_no = item_no + 1
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached documents; the cache itself is not reordered."""
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

### Thiết lập đường dẫn tới dataset

In [None]:
# Alternative text-file corpus kept for reference (one tag prefix per file):
# sources = {
#     'data/movie reviews/test-neg.txt':'TEST_NEG',
#     'data/movie reviews/test-pos.txt':'TEST_POS',
#     'data/movie reviews/train-neg.txt':'TRAIN_NEG',
#     'data/movie reviews/train-pos.txt':'TRAIN_POS',
# }
# Active corpus: dataset directory -> tag prefix given to its documents.
sources = {'data/a3':'SAMPLE'}

# LabeledContent reads the 'content' column of the *.csv files in each mapped directory.
sentences = LabeledContent(sources)

### Thiết lập tham số cho model, build vocabulary.

In [None]:
import multiprocessing

cores = multiprocessing.cpu_count()
print('num of cores is %s' % cores)
# dm=0 selects the PV-DBOW training algorithm. One core is left free for the
# rest of the system, but cores - 1 is 0 on a single-core machine, which would
# leave training without any worker thread, so clamp to at least 1.
model = Doc2Vec(min_count=2, window=10, vector_size=100, sample=1e-4, negative=5, workers=max(1, cores - 1), dm=0)
# Materialise the corpus once; the same list is cached on `sentences` for later shuffling.
tagged_sentences = sentences.to_array()
model.build_vocab(tagged_sentences)

### Train model

In [None]:
import timeit
# Time a single training pass (epochs=1) over a freshly shuffled copy of the corpus.
start = timeit.default_timer()
# total_examples reuses the document count recorded on the model (model.corpus_count).
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=1)
stop = timeit.default_timer()
# Elapsed seconds for the shuffle plus the train() call.
print(stop - start)

In [None]:
# Persist the trained model under the relative path 'SAMPLE_model.d2v'.
model.save('SAMPLE_model.d2v')

In [None]:
# Read the saved model back from disk under a separate name, loaded_model.
loaded_model = Doc2Vec.load('SAMPLE_model.d2v')

In [None]:
# Second dataset directory; its documents get the distinct tag prefix 'SAMPLEa2'.
# This rebinds the notebook-level names `sources` and `sentences`.
sources = {'data/a2':'SAMPLEa2'}
sentences = LabeledContent(sources)
# Materialise the new corpus as a list (also cached on sentences.sentences).
sentences2array = sentences.to_array()

In [None]:
import multiprocessing
cores = multiprocessing.cpu_count()
print('num of cores is %s' % cores)
# Alternative kept for reference: start from a fresh model instead of extending the loaded one.
#loaded_model = Doc2Vec(min_count=2, window=10, vector_size=100, sample=1e-4, negative=5, workers=cores-1, dm=0)

# Extend the loaded model's vocabulary with the second corpus instead of rebuilding it.
# NOTE(review): incremental vocabulary updates on Doc2Vec have historically been
# unreliable in gensim (new document tags in particular) - confirm against the installed version.
loaded_model.build_vocab(sentences2array,update=True)

In [None]:
# Display how many tagged documents the second corpus contains.
len(sentences2array)

In [None]:
import timeit
# Time one further training pass (epochs=1) of the loaded model on the second corpus.
start = timeit.default_timer()
# total_examples is the size of the second corpus only, not of everything the model has seen.
loaded_model.train(sentences.sentences_perm(),total_examples=len(sentences2array), epochs=1)
stop = timeit.default_timer()
# Elapsed seconds for the shuffle plus the train() call.
print(stop - start)

### Tìm các từ tương tự với một từ được chọn

In [None]:
# Display the words the model ranks as most similar to 'vui'.
# NOTE(review): presumably fails if 'vui' did not make it into the vocabulary (min_count=2) - confirm.
loaded_model.wv.most_similar('vui')

### Xuất ra vector của doc đã train dựa vào tag của doc đó

In [None]:
# Display the trained vector stored under a document tag. Tags have the form
# '<prefix>_<index>'; 'SAMPLE' is the prefix the saved model was trained with
# (no source in this notebook registers a 'SAMPLEa1' prefix).
loaded_model.docvecs['SAMPLE_0']

### Xuất ra các doc tương tự với doc chỉ định

In [None]:
# Documents the model ranks as most similar to the one tagged 'SAMPLE_0'.
similar_doc = loaded_model.docvecs.most_similar('SAMPLE_0')

In [None]:
# Display the result of the most_similar() query.
similar_doc

In [None]:
# Display the first tagged document (its token list and tag) for inspection.
tagged_sentences[0]

In [None]:
# Rebuild the document list from whatever corpus `sentences` currently refers to
# (in file order that is the 'data/a2' corpus - the notebook may have been run in another order).
tagged_sentences = sentences.to_array()

In [None]:
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

In [None]:
# Lower-case and tokenise an unseen sentence to use as an inference example.
# NOTE(review): the training documents are split on whitespace without lower-casing,
# so this preprocessing differs from training - confirm that is intended.
# NOTE(review): word_tokenize presumably needs NLTK's 'punkt' data to be installed.
test_data = word_tokenize("Chiều nay ăn gì".lower())

In [None]:
# Display the token list produced for the example sentence.
test_data

In [None]:
# Wrap the tokens in a TaggedDocument with the single tag 'new_doc'.
new_doc = TaggedDocument(test_data, tags=['new_doc'])

In [None]:
# Infer a vector for the unseen document from its token list only (the tag is not passed).
v1 = loaded_model.infer_vector(new_doc.words)
# Display the inferred vector.
v1

In [None]:
# Display the `count` attribute of the model's document vectors
# (presumably the number of stored document vectors - confirm for the installed gensim version).
loaded_model.docvecs.count