## Import thư viện

In [1]:
%matplotlib inline

# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

#glob
import glob

# numpy
import numpy as np

#pandas
import pandas as pd

# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# random, itertools, matplotlib
import random
import itertools
import matplotlib.pyplot as plt



## Class đọc từng dòng trong file text
Mỗi dòng lúc này được xem như một paragraph

In [None]:
class LabeledLineSentence(object):
    """Corpus reader that turns every line of each source file into one document.

    `sources` maps a file path to a tag prefix.  Line number N of a file
    (counting from 0, per file) becomes a TaggedDocument whose words are the
    whitespace-split line and whose single tag is '<prefix>_<N>'.

    Raises:
        Exception: if two sources share the same prefix.
    """

    def __init__(self, sources):
        self.sources = sources

        # Tags are derived from the prefix, so a prefix shared by two files
        # would produce colliding tags.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one TaggedDocument per line, streaming the files."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    # TaggedDocument is the class this file imports; the
                    # previously used LabeledSentence was never imported.
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every document into `self.sentences` and return that list."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of `self.sentences` (call to_array() first)."""
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc toàn bộ nội dung trong file text
Mỗi file text lúc này được xem như một paragraph
(tên class này giống hệt tên class trên, chọn 1 trong 2 class thôi)

In [None]:
class LabeledLineSentence(object):
    """Corpus reader that turns the whole content of each source file into one document.

    `sources` maps a file path to a tag prefix.  The K-th source (counting
    from 0 in the mapping's iteration order) becomes a TaggedDocument whose
    words are the whitespace-split file content and whose single tag is
    '<prefix>_<K>'.

    Raises:
        Exception: if two sources share the same prefix.
    """

    def __init__(self, sources):
        self.sources = sources

        # Tags are derived from the prefix, so a prefix shared by two files
        # would produce colliding tags.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one TaggedDocument per source file (same documents as to_array)."""
        for item_no, (source, prefix) in enumerate(self.sources.items()):
            with open(source, 'r', encoding="utf8") as fin:
                # split() already treats newlines as separators.  Stripping
                # '\n' first would glue the last word of a line onto the
                # first word of the next one.
                words = fin.read().split()
            # TaggedDocument is the class this file imports; the previously
            # used LabeledSentence was never imported.
            yield TaggedDocument(words, [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every document into `self.sentences` and return that list."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of `self.sentences` (call to_array() first)."""
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Class đọc từng dòng 'content' trong file csv

In [2]:
class LabeledContent(object):
    """Corpus reader for the 'content' column of CSV files.

    `sources` maps a directory path to a tag prefix.  For each directory,
    the 'content' column of every "*.csv" file directly inside it is
    collected, duplicates and missing values are dropped, and each remaining
    row becomes a TaggedDocument tagged '<prefix>_<n>'.  The counter n is a
    single running number across all sources.

    Raises:
        Exception: if two sources share the same prefix.
    """

    def __init__(self, sources):
        self.sources = sources

        # Tags are derived from the prefix, so a prefix shared by two
        # directories would produce colliding tags.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def to_array(self):
        """Load all documents into `self.sentences` and return that list.

        Raises:
            ValueError: if a source directory contains no CSV files.
        """
        self.sentences = []
        item_no = 0
        for source, prefix in self.sources.items():
            # glob's ordering depends on the file system; sorting keeps the
            # tag -> document assignment reproducible between runs.
            csv_files = sorted(glob.glob(source + "/*.csv"))
            if not csv_files:
                # pd.concat([]) would raise a ValueError anyway; say which
                # directory is the problem instead.
                raise ValueError('No CSV files found in %r' % (source,))

            columns = [pd.read_csv(csv_path)['content'] for csv_path in csv_files]
            contents = pd.concat(columns).drop_duplicates().dropna()
            for row in contents:
                self.sentences.append(TaggedDocument(utils.to_unicode(row).split(), [prefix + '_%s' % item_no]))
                item_no = item_no + 1
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of `self.sentences` (call to_array() first)."""
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

### Xét đường dẫn tới dataset

In [3]:
# Earlier movie-review configuration (file path -> tag prefix), kept for reference:
# sources = {
#     'data/movie reviews/test-neg.txt':'TEST_NEG',
#     'data/movie reviews/test-pos.txt':'TEST_POS', 
#     'data/movie reviews/train-neg.txt':'TRAIN_NEG', 
#     'data/movie reviews/train-pos.txt':'TRAIN_POS',    
# }
# LabeledContent globs "<source>/*.csv", so the key is a directory and the
# value is the prefix of the generated document tags.
sources = {'data/a1':'SAMPLEa1',}

sentences = LabeledContent(sources)

### Xét tham số cho model, build vocabulary.

In [4]:
import multiprocessing
cores = multiprocessing.cpu_count()
print('num of cores is %s' % cores)
# Leave one core free for the rest of the system, but never request zero
# workers: on a single-core machine `cores - 1` would be 0.
model = Doc2Vec(min_count=2, window=10, vector_size=100, sample=1e-4, negative=5, workers=max(1, cores - 1), dm=0)

# to_array() loads the corpus into memory; the vocabulary is built from it.
model.build_vocab(sentences.to_array())

num of cores is 8


In [None]:
# NOTE(review): unfinished expression, apparently left over from tab
# completion of `model.train`; this cell was never executed. Confirm it can
# be dropped.
model.tra

### Train model

In [5]:
import timeit
# Time one training pass over a shuffled copy of the corpus.
# sentences_perm() reads sentences.sentences, which to_array() fills in.
start = timeit.default_timer()
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=1)
stop = timeit.default_timer()
# Elapsed time in seconds.
print(stop - start)

0.01043317281994316


In [6]:
# Persist the trained model under 'model_data_news.model'.
model.save('model_data_news.model')

In [3]:
# Load the model back from the path it was saved to.
loaded_model = Doc2Vec.load('model_data_news.model')

In [4]:
# Second dataset directory; its documents are tagged 'SAMPLEa2_<n>'.
sources = {'data/a2':'SAMPLEa2',}

sentences = LabeledContent(sources)

In [5]:
import multiprocessing
cores = multiprocessing.cpu_count()
print('num of cores is %s' % cores)
# The loaded model is reused; the line below is the disabled alternative of
# building a fresh model (the only place `cores` would be used).
#loaded_model = Doc2Vec(min_count=2, window=10, vector_size=100, sample=1e-4, negative=5, workers=cores-1, dm=0)

# Extend the existing vocabulary with the new corpus instead of rebuilding it.
# NOTE(review): confirm the installed gensim supports update=True for
# Doc2Vec document tags as well as words.
loaded_model.build_vocab(sentences.to_array(),update=True)

num of cores is 8


In [None]:
import timeit
# Time one further training pass of the loaded model over a shuffled copy
# of the new corpus.
start = timeit.default_timer()
loaded_model.train(sentences.sentences_perm(),total_examples=loaded_model.corpus_count, epochs=1)
stop = timeit.default_timer()
# Elapsed time in seconds.
print(stop - start)

### Tìm các từ tương tự với một từ được chọn

In [1]:
# Words most similar to 'vui' according to the model's word vectors.
# The cell that defines `loaded_model` must have run in the current kernel
# session; the recorded output is a NameError from running this cell first.
loaded_model.wv.most_similar('vui')

NameError: name 'loaded_model' is not defined

### Xuất ra vector của doc đã train dựa vào tag của doc đó

In [None]:
# Vector stored under the document tag 'SAMPLEa1_0' (tags are '<prefix>_<n>').
loaded_model.docvecs['SAMPLEa1_0']

### Xuất ra các doc tương tự với doc chỉ định

In [None]:
# Documents most similar to the one tagged 'SAMPLEa1_0'.  The tag must be one
# that was generated during training ('<prefix>_<n>'); the prefixes used in
# this notebook are 'SAMPLEa1' and 'SAMPLEa2', so the former 'SAMPLE_0' never
# existed.
loaded_model.docvecs.most_similar('SAMPLEa1_0')

In [None]:
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

In [None]:
# Tokenize a lower-cased query sentence with NLTK's word_tokenize.
test_data = word_tokenize("Chiều nay ăn gì".lower())

In [None]:
# Display the resulting token list.
test_data

In [None]:
# Wrap the tokens as a TaggedDocument carrying the single tag 'new_doc'.
new_doc = TaggedDocument(test_data, tags=['new_doc'])

In [None]:
# Infer a vector for the new document from its words, then display it.
v1 = loaded_model.infer_vector(new_doc.words)
v1

In [None]:
# Display docvecs.count (presumably the number of trained document vectors;
# confirm against the installed gensim version).
loaded_model.docvecs.count

In [None]:
# %load C:\Users\ADMIN\Desktop\Python Tutorial 1\Doc2vec\paragraphVector.py
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import sys
import csv
import pandas as pd


def main():
    """Train and save the model, reload it, and print a vector inferred for an unseen sentence."""
    train_model()
    d2v = Doc2Vec.load("d2v.model")

    # Infer a vector for a document that is not part of the training data.
    tokens = word_tokenize("I love chatbots".lower())
    inferred = d2v.infer_vector(tokens)
    print("V1_infer", inferred)

    # Further lookups by document tag, left disabled:
    #   most similar documents:        d2v.docvecs.most_similar('1')
    #   vector of a training document: d2v.docvecs['1']


def train_model():
    """Train a Doc2Vec model on the news CSV and save it as "d2v.model".

    Reads "data/2015-news-7.1-8.31.csv" with pandas, uses the "id" column
    (as str) as document tag and the lower-cased, NLTK-tokenized "content"
    column as the document words, then trains for `max_epochs` rounds with
    a manually stepped learning rate.
    """
    input_dir = "data/2015-news-7.1-8.31.csv"

    # Raise the csv module's field size limit as far as the platform allows.
    # sys.maxsize does not fit into a C long everywhere (e.g. on Windows),
    # where field_size_limit raises OverflowError, so back off until accepted.
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            break
        except OverflowError:
            limit //= 10

    # csv format: id,title,content,source,create_time,get_time
    df2 = pd.read_csv(input_dir, sep=',', header=0, encoding="utf-8")
    print("Finish read csv")
    tags = df2["id"].astype(str).values
    documents = df2["content"].astype(str).values

    tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[tag])
                   for tag, doc in zip(tags, documents)]
    print("Finish load tagged data")
    max_epochs = 10
    vec_size = 300
    alpha = 0.025

    # `vector_size` / `model.epochs` are the current names of the former
    # `size` / `model.iter`, matching the Doc2Vec call used elsewhere in
    # this file.
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.025,
                    min_count=1,
                    dm=1)
    model.build_vocab(tagged_data)
    # NOTE(review): gensim's documentation recommends a single train() call
    # that manages the learning-rate decay itself; the manual loop is kept to
    # preserve the existing training schedule.
    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    model.save("d2v.model")
    print("Model Saved")


# Run the demo only when this code is executed directly, not when it is
# imported.  The former test `if "__name__":` checked a non-empty string
# literal, which is always true.
if __name__ == "__main__":
    main()
