## Import thư viện

In [1]:
%matplotlib inline

# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

#glob
import glob

# numpy
import numpy as np

#pandas
import pandas as pd

# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# random, itertools, matplotlib
import random
import itertools
import matplotlib.pyplot as plt



## Class đọc từng dòng trong file text
Mỗi dòng lúc này được xem như một paragraph

In [None]:
class LabeledLineSentence(object):
    """Corpus reader that turns every line of every source file into one document.

    ``sources`` maps a file path to a prefix. Line ``n`` (0-based, counted
    per file) of a source is tagged ``'<prefix>_<n>'`` and its words are the
    whitespace-separated tokens of that line.

    Raises:
        ValueError: if two sources share the same prefix, because their
            tags would collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means "nothing loaded yet".
        self.sentences = None

        # Make sure that prefixes are unique.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one TaggedDocument per line, streaming the files from disk."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    # TaggedDocument is the class this notebook imports; the
                    # previously used LabeledSentence name was never imported.
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Load every document into ``self.sentences`` and return that list."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the loaded documents.

        The documents are loaded first if to_array() has not been called yet;
        ``self.sentences`` itself keeps its order.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc toàn bộ nội dung trong file text
Mỗi file text lúc này được xem như một paragraph
(tên class này giống hệt tên class trên, chọn 1 trong 2 class thôi)

In [None]:
class LabeledLineSentence(object):
    """Corpus reader that turns the whole content of each source file into one document.

    ``sources`` maps a file path to a prefix. The n-th source (0-based, in
    dict order) is tagged ``'<prefix>_<n>'`` and its words are the
    whitespace-separated tokens of the entire file.

    Raises:
        ValueError: if two sources share the same prefix, because their
            tags would collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means "nothing loaded yet".
        self.sentences = None

        # Make sure that prefixes are unique.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one TaggedDocument per source file (same documents as to_array())."""
        for item_no, (source, prefix) in enumerate(self.sources.items()):
            with open(source, 'r', encoding="utf8") as fin:
                content = fin.read()
            # split() already treats newlines as separators. Stripping them
            # first would glue the last word of a line to the first word of
            # the next one.
            yield TaggedDocument(utils.to_unicode(content).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Load every document into ``self.sentences`` and return that list."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the loaded documents.

        The documents are loaded first if to_array() has not been called yet;
        ``self.sentences`` itself keeps its order.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc từng dòng cột 'content' trong tất cả các file csv của thư mục nguồn

In [2]:
class LabeledContent(object):
    """Corpus reader that builds documents from the 'content' column of CSV files.

    ``sources`` maps a directory path to a prefix. Every ``*.csv`` file
    directly inside a directory is read, the 'content' values are pooled,
    duplicates and missing values are dropped, and each remaining value
    becomes one TaggedDocument tagged ``'<prefix>_<n>'``. ``n`` is a single
    counter shared by all sources.

    Raises:
        ValueError: if two sources share the same prefix, because their
            tags would collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means "nothing loaded yet".
        self.sentences = None

        # Make sure that prefixes are unique.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def to_array(self):
        """Load every document into ``self.sentences`` and return that list."""
        self.sentences = []
        item_no = 0
        for source, prefix in self.sources.items():
            # glob returns files in arbitrary order; sorting keeps the
            # tag -> document mapping stable from one run to the next.
            csv_paths = sorted(glob.glob(source + "/*.csv"))
            columns = [pd.read_csv(csv_path)['content'] for csv_path in csv_paths]
            if not columns:
                # pd.concat() raises on an empty list, so a directory
                # without CSV files simply contributes no documents.
                continue
            contents = pd.concat(columns).drop_duplicates().dropna()
            for row in contents:
                self.sentences.append(TaggedDocument(utils.to_unicode(row).split(), [prefix + '_%s' % item_no]))
                item_no += 1
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the loaded documents.

        The documents are loaded first if to_array() has not been called yet;
        ``self.sentences`` itself keeps its order.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

### Xét đường dẫn tới dataset

In [3]:
# Map each data source to the unique prefix used to tag its documents.
# Earlier configuration, kept for reference (one text file per split):
# sources = {
#     'data/movie reviews/test-neg.txt':'TEST_NEG',
#     'data/movie reviews/test-pos.txt':'TEST_POS', 
#     'data/movie reviews/train-neg.txt':'TRAIN_NEG', 
#     'data/movie reviews/train-pos.txt':'TRAIN_POS',    
# }
# Current configuration: the *.csv files inside the 'data' directory,
# whose documents get tags of the form 'SAMPLE_<n>'.
sources = {'data':'SAMPLE',}

# Nothing is read from disk here; loading happens in to_array().
sentences = LabeledContent(sources)

### Xét tham số cho model, build vocabulary.

In [9]:
import multiprocessing

cores = multiprocessing.cpu_count()
print('num of cores is %s' % cores)
# Keep one core free for the rest of the system, but never go below one
# worker: on a single-core machine `cores - 1` would be 0, which would leave
# no thread to do the training.
model = Doc2Vec(min_count=2, window=10, vector_size=100, sample=1e-4, negative=5, workers=max(1, cores - 1), dm=0)

# to_array() reads the whole corpus into memory and keeps it on `sentences`,
# so the training cell can reuse it without reading the files again.
model.build_vocab(sentences.to_array())

num of cores is 8


### Train model

In [10]:
import timeit
# Measure the wall-clock time of the training call.
start = timeit.default_timer()
# Train on a shuffled copy of the documents loaded by to_array().
# NOTE(review): epochs=1 is a single pass over the corpus, which is very
# little training for Doc2Vec - confirm this is intentional.
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=1)
stop = timeit.default_timer()
# Elapsed time in seconds.
print(stop - start)

14.266626449430362


In [11]:
# Persist the trained model so later cells can reload it without retraining.
model.save('model_data_news.model')

In [12]:
# Load the model back from 'model_data_news.model'; the cells below query this copy.
loaded_model = Doc2Vec.load('model_data_news.model')

### Tìm các từ tương tự với một từ được chọn

In [14]:
# Words ranked by similarity to 'nhiệt', returned as (word, score) pairs.
loaded_model.wv.most_similar('nhiệt')

[('ytcs', 0.40592342615127563),
 ('dá', 0.3979211151599884),
 ('quẫn_trí', 0.3969387412071228),
 ('cấp_tiến', 0.39341461658477783),
 ('tràng_hạt', 0.392300009727478),
 ('costanza', 0.39167165756225586),
 ('bụi_bụi', 0.38946813344955444),
 ('khuya', 0.3824516236782074),
 ('harts', 0.38170725107192993),
 ('quái_xế', 0.3792042136192322)]

### Xuất ra vector của doc đã train dựa vào tag của doc đó

In [15]:
# Vector stored for the training document tagged 'SAMPLE_0'.
loaded_model.docvecs['SAMPLE_0']

array([ 0.02450618, -0.01277603,  0.08255215, -0.05655874,  0.14039212,
       -0.05928926,  0.09601146,  0.12122582,  0.14726324, -0.03148405,
       -0.04873369, -0.14303754,  0.00232079,  0.07672321, -0.14094478,
        0.0592139 ,  0.04721021, -0.06020708,  0.03428698,  0.04690607,
        0.20224541,  0.02463498,  0.00194866,  0.02079237, -0.04506549,
       -0.02289934, -0.00518569,  0.0858886 , -0.06533607, -0.15777437,
       -0.03584686,  0.04869974, -0.09768421,  0.0239072 , -0.05864898,
       -0.09074295,  0.01973016, -0.01081183,  0.12436839, -0.05552965,
        0.05094841,  0.08892427,  0.00138242,  0.07599117,  0.05721422,
       -0.0428788 , -0.03158768,  0.03371916,  0.06685502,  0.02483669,
       -0.00167349, -0.20787126,  0.00564792,  0.01232569,  0.03779722,
        0.01927468, -0.06109818, -0.05005908,  0.21340863,  0.01002951,
       -0.11169552,  0.18475142,  0.14887445, -0.00089729,  0.11781806,
        0.12512626, -0.07493468,  0.05442571,  0.05339539, -0.03

### Xuất ra các doc tương tự với doc chỉ định

In [16]:
# Training documents ranked by similarity to 'SAMPLE_0', returned as (tag, score) pairs.
loaded_model.docvecs.most_similar('SAMPLE_0')

[('SAMPLE_37189', 0.988113522529602),
 ('SAMPLE_82711', 0.9873348474502563),
 ('SAMPLE_39920', 0.9838387966156006),
 ('SAMPLE_46189', 0.9817326068878174),
 ('SAMPLE_55599', 0.9816901683807373),
 ('SAMPLE_29760', 0.9815752506256104),
 ('SAMPLE_68119', 0.9809643030166626),
 ('SAMPLE_75809', 0.9805994033813477),
 ('SAMPLE_44396', 0.980556309223175),
 ('SAMPLE_51905', 0.9803471565246582)]

In [18]:
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

In [19]:
# Lower-case and tokenize an unseen sentence.
# NOTE(review): the training documents were tokenized with str.split() and
# were not lower-cased here - confirm both pipelines produce matching tokens.
test_data = word_tokenize("Chiều nay ăn gì".lower())

In [20]:
# Show the tokens produced for the unseen sentence.
test_data

['chiều', 'nay', 'ăn', 'gì']

In [34]:
# Wrap the tokens in a TaggedDocument; only its .words are used for inference below.
new_doc = TaggedDocument(test_data, tags=['new_doc'])

In [35]:
# Infer a vector for the unseen document from its tokens and display it.
v1 = loaded_model.infer_vector(new_doc.words)
v1

array([ 0.0389462 , -0.05965555,  0.1060019 , -0.02336759,  0.09616816,
       -0.07847093,  0.06280866,  0.05348704,  0.02787253, -0.06860881,
       -0.03053268, -0.10653654,  0.01179835,  0.05793824, -0.05669597,
        0.09096027,  0.05806056, -0.05990096,  0.04398471,  0.0586647 ,
        0.15618958,  0.06376253, -0.0460896 , -0.01008673, -0.03755061,
       -0.01706441, -0.01033904,  0.13106826, -0.03143952, -0.16352072,
       -0.01575426, -0.01906689, -0.12696606,  0.08907396,  0.01014864,
       -0.02914027,  0.04050248,  0.09609975,  0.06795964, -0.06960049,
        0.0586013 ,  0.02323061,  0.04346183,  0.05430899,  0.18923704,
       -0.16200927,  0.02303583, -0.05152023, -0.01508911,  0.03174236,
        0.02379732, -0.14491174, -0.06335623,  0.0193968 ,  0.02167826,
        0.09145959,  0.00518823, -0.01692683,  0.26429582, -0.0340397 ,
       -0.10641553,  0.10794535,  0.15381524, -0.04285296,  0.1328139 ,
        0.08968867, -0.02263072,  0.04815183,  0.04206936, -0.04

In [26]:
# Number of document vectors held by the loaded model.
loaded_model.docvecs.count

86888

In [None]:
# %load C:\Users\ADMIN\Desktop\Python Tutorial 1\Doc2vec\paragraphVector.py
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import sys
import csv
import pandas as pd


def main():
    """Train and save the model, reload it, and print the vector inferred for an unseen sentence."""
    train_model()
    model = Doc2Vec.load("d2v.model")

    # Infer a vector for a document that is not part of the training data.
    tokens = word_tokenize("I love chatbots".lower())
    print("V1_infer", model.infer_vector(tokens))

    # Further lookups that can be tried on the loaded model:
    #   model.docvecs.most_similar('1')  -> documents most similar to tag '1'
    #   model.docvecs['1']               -> trained vector of the document tagged '1'


def train_model(input_path="data/2015-news-7.1-8.31.csv", model_path="d2v.model",
                max_epochs=10, vec_size=300):
    """Train a Doc2Vec model on the 'content' column of a CSV file and save it.

    The CSV is expected to have a header row containing at least the columns
    'id' and 'content' (the original file layout is
    id,title,content,source,create_time,get_time). Each row becomes one
    TaggedDocument whose tag is the row id as a string and whose words are
    the lower-cased, NLTK-tokenized content.

    Args:
        input_path: path of the CSV file to read.
        model_path: where the trained model is saved.
        max_epochs: number of passes over the corpus.
        vec_size: dimensionality of the document vectors.
    """
    # Preserved from the original: raise the csv module's field size limit.
    # sys.maxsize does not fit in a C long where long is 32-bit (for example
    # on Windows); the call raises OverflowError there, so fall back to the
    # largest 32-bit value.
    try:
        csv.field_size_limit(sys.maxsize)
    except OverflowError:
        csv.field_size_limit(2 ** 31 - 1)

    news_df = pd.read_csv(input_path, sep=',', header=0, encoding="utf-8")
    print("Finish read csv")

    # Rows without content would otherwise turn into the literal token "nan".
    news_df = news_df.dropna(subset=["content"])
    tags = news_df["id"].astype(str).values
    documents = news_df["content"].astype(str).values

    tagged_data = [TaggedDocument(words=word_tokenize(text.lower()), tags=[tag])
                   for tag, text in zip(tags, documents)]
    print("Finish load tagged data")

    alpha = 0.025
    min_alpha = 0.0001

    # `vector_size` and `epochs` are the current parameter names; `size` and
    # `model.iter` are deprecated aliases.
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=min_alpha,
                    min_count=1,
                    dm=1,
                    epochs=max_epochs)
    model.build_vocab(tagged_data)

    # A single train() call lets gensim decay the learning rate from `alpha`
    # to `min_alpha` across all epochs. The previous manual loop called
    # train() once per epoch with a hand-adjusted alpha, which ran
    # max_epochs * model.iter passes at a nearly constant learning rate.
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    model.save(model_path)
    print("Model Saved")


# Run the demo only when this code is executed directly (script or notebook
# cell), not when it is imported. The previous test, `if "__name__":`,
# checked a non-empty string literal and was therefore always true.
if __name__ == "__main__":
    main()
