## Introduction:
Obtain a Doc2Vec embedding representation for each article in the dataset.

In [1]:
import os
import sys
import pandas as pd
import utils
import gensim
import sklearn
import numpy as np
from gensim.models.doc2vec import Doc2Vec
TaggededDocument = gensim.models.doc2vec.TaggedDocument
from nltk import word_tokenize

In [2]:
def cut_drop_stop_words(content):
    '''
    Tokenize the doc to words
    Get the stop words from txt and drop them form articles
    :return: a cut word list
    '''
    
    stop_words = []
    with open('./en_stopwords.txt', 'r', encoding="latin-1") as f_reader:
        for line in f_reader:
            line = line.replace("\r", "").replace("\n", "")
            stop_words.append(line)
    stop_words.append('●')
    stop_words.append(',')
    stop_words = set(stop_words)
    
    if content != '' and content is not None:
        seg_list = word_tokenize(content)
        each_split = ' '.join(seg_list).split()
        each_result = [word for word in each_split if word not in stop_words] #drop stop words
        result = ' '.join(each_result)
    return result

def get_file_data_to_a_list(file_list):
    '''
    Read all article paths into a list of file paths, and read the data in each file in turn, saving all data into one file
    '''
    file_paths = []
    data = []
    for idx,item in enumerate(file_list):
        file_paths.append('./plaintext_articles/' + item + '.txt')
    
    for filename in file_paths:
        f = open(filename,'r',encoding="latin-1")
        doc = f.read().replace("\t", "").replace("\n", "")
        data.append(cut_drop_stop_words(doc))

    return data

def get_cut_dataset(data):
    '''
    Generate corpus(using tokenized word list)
    '''
    corpus = []
    documents = []

    for idx, item in enumerate(data):
        text = list(item.replace('\n', '').split(' '))
        # print(text)
        document = TaggededDocument(text, tags=[idx])
        corpus.append(document)
    print('len of corpus：', len(corpus))

    return corpus


def train(x_train, model_path , size=300, epoch_num=20, dm=1):
    print('start train')
    model_dm = Doc2Vec(x_train, min_count=10, window=5, vector_size=size, sample=1e-3, negative=5, workers=4, dm=dm)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save(model_path)
    print('end train')
    return model_dm


def test_(model_path, str):
    model_dm = Doc2Vec.load(model_path)
    test_text = ' '.join(word_tokenize(str)).split(' ')
    inferred_vector_dm = model_dm.infer_vector(test_text)
    print('inferred_vector_dm:', inferred_vector_dm)
    sims = model_dm.dv.most_similar([inferred_vector_dm], topn=1)
    return sims

In [3]:
links_paths_path = 'wikispeedia_paths-and-graph/'
articles_df = pd.read_csv(links_paths_path + 'articles.tsv', sep='\t',\
                                  names = ['article'],  skiprows=12)
articles_list = articles_df['article'].tolist()

In [4]:
# File path the trained Doc2Vec model is saved to (and later reloaded from).
model_path = './article2vec'
# Cleaned text of every article, in the same order as articles_list.
temp = get_file_data_to_a_list(articles_list)
# One tagged document per article; each tag is the article's index in temp.
train_corpus = get_cut_dataset(temp)
# Trains with the default size / epoch_num / dm arguments and saves to model_path.
model_ = train(train_corpus, model_path=model_path)

len of corpus： 4604
start train
end train


In [5]:
# Use an article to test whether that the most similar one is itself
f = open('./plaintext_articles/United_States.txt','r',encoding="latin-1")
test = f.read().replace("\t", "").replace("\n", "")

sims = test_(model_path, test)
for count,sim in sims:
    sentence = train_corpus[count]
    words = ''
    for word in sentence[0][:200]:
        words = words + word + ' '
    print(words, sim)

inferred_vector_dm: [-2.1838787  -0.48399806  0.15809946 -0.14729244  0.34973332 -1.4335711
 -0.4967516   1.5664557  -1.0084097   1.0260029  -0.50888044 -0.06240981
 -1.6091268   0.9134005  -1.97776    -1.0039107  -0.50500184  0.8283831
  1.4146003  -1.169062    0.16794592 -1.0245419   0.62139326 -1.4466877
 -1.1098875   0.8930133  -1.6181768  -0.8313622  -1.494734    0.3323823
 -0.12109297 -1.0231631  -2.0349224  -1.038635   -1.0756977  -0.711748
 -0.9778016   0.5457843   1.5275923  -2.4995084  -0.9455212  -1.1861526
  0.4635846   1.4584116   1.3225995  -1.0851151  -1.101253   -0.24172391
  0.82580477 -1.0221933  -1.0289146   2.6091213  -0.21463059 -0.89533377
  1.9647735   0.53621775 -0.03462055 -0.18051496 -0.5938423  -0.74684364
  0.42481962  0.0447261  -0.56526184  0.39519164  0.78881365 -0.5501906
 -0.8492291  -1.2902513   2.1995742  -0.92629135 -0.893451   -0.62538654
  0.72446316  0.4102887  -1.0334884  -0.926342    0.23691113 -1.5584357
  0.44277218  1.8878403  -0.5449402  -0.

In [6]:
path = "./plaintext_articles" 
files= os.listdir(path) 
txts = []
files.sort()

In [7]:
for file in files: 
    if file != '.ipynb_checkpoints':
        position = path+'/'+ file 
        with open(position, "r",encoding='utf-8') as f:    
            data = f.read()   
            txts.append(data)

In [8]:
data = np.zeros((len(txts),model_.vector_size))
for i,item in enumerate(txts):
    text = list(item.split(' ') if type(item) == str else str(item))
    data[i] = model_.infer_vector(text)

np.save("./Doc2VecArray", data)

print("save to ./Doc2VecArray")

save to ./Doc2VecArray
