# Jupyter Notebook for demonstrating Wikipedia2Vec: A tool for learning vector representations of words and entities from Wikipedia

## 1. Import necessary packages

In [1]:
import functools
import logging
import multiprocessing
import os
import pkg_resources
import io
import numpy as np
from tempfile import NamedTemporaryFile
from wikipedia2vec.dump_db import DumpDB
from wikipedia2vec.dictionary import Dictionary, Item, Word, Entity
from wikipedia2vec.link_graph import LinkGraph
from wikipedia2vec.mention_db import MentionDB
from wikipedia2vec.wikipedia2vec import Wikipedia2Vec
from wikipedia2vec.utils.wiki_dump_reader import WikiDumpReader
from wikipedia2vec.utils.tokenizer import get_tokenizer, get_default_tokenizer
from wikipedia2vec.utils.sentence_detector import get_sentence_detector
from Wikipedia2vec import Wikipedia2vec

## 2. Read the dataset
Reads a dataset in preparation for learning embeddings and returns the data in the proper format for learning embeddings. Saves the required file name in `chosen_dataset.txt` so that it can be used by other methods of the program.

In [None]:
def read_dataset():
    """Load the YAGO dataset files and record the chosen dataset's file name.

    Reads the train/valid/test files (first three tab-separated columns) and
    the entity/relation ID files (first two tab-separated columns) from
    ``data/Yago/`` as string arrays, then writes ``yago.txt`` into
    ``chosen_dataset.txt`` (relative to the current working directory).

    Returns:
        tuple: ``(train, valid, test, entity2id, relation2id)`` NumPy string
        arrays, in that order.
    """
    def load_columns(path, n_cols):
        # Every file is tab-separated; only the leading n_cols columns are kept.
        return np.genfromtxt(path, delimiter='\t', dtype='str', usecols=np.arange(0, n_cols))

    train = load_columns("data/Yago/train.txt", 3)
    valid = load_columns("data/Yago/valid.txt", 3)
    test = load_columns("data/Yago/test.txt", 3)
    entity2id = load_columns("data/Yago/entity2id.txt", 2)
    relation2id = load_columns("data/Yago/relation2id.txt", 2)

    file_name = 'yago.txt'
    # The context manager closes the file on exit; no explicit close() is needed.
    with open('chosen_dataset.txt', 'w') as the_file:
        the_file.write(file_name)

    return train, valid, test, entity2id, relation2id
    

In [None]:
# Load the YAGO files from data/Yago/ and write chosen_dataset.txt.
read_dataset()

## 3. Build Dump Database
The Wikipedia dump can be obtained using `wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2`. It takes around 2 hours to download.

The build_dump command creates a database of Wikipedia pages, each of which consists of its text and the anchor links it contains.

In [None]:
def build_dump(dump_file, out_file):
    """Build the dump database from a Wikipedia dump file.

    Args:
        dump_file: Path of the dump handed to ``WikiDumpReader``.
        out_file: Output location passed to ``DumpDB.build``.
    """
    reader = WikiDumpReader(dump_file)
    # pool_size follows the CPU count reported for this machine.
    n_procs = multiprocessing.cpu_count()
    DumpDB.build(reader, out_file, pool_size=n_procs, chunk_size=100)

In [None]:
# File name of the compressed English Wikipedia pages-articles dump to process.
dump_file = 'enwiki-latest-pages-articles.xml.bz2'

In [None]:
# Build the dump database from the dump file and store it as 'output.db'.
build_dump(dump_file=dump_file, out_file='output.db')

## 4. Build Dictionary

The build_dictionary command builds a dictionary of words and entities.

In [None]:
def build_dictionary(dump_db_file, out_file):
    """Build the dictionary of words and entities and save it.

    Args:
        dump_db_file: Path used to open the ``DumpDB``.
        out_file: Destination passed to ``Dictionary.save``.
    """
    db = DumpDB(dump_db_file)

    # Fixed build settings for this notebook, gathered in one place.
    build_options = {
        'tokenizer': get_default_tokenizer(db.language),
        'category': False,
        'lowercase': True,
        'min_entity_count': 5,
        'min_paragraph_len': 5,
        'pool_size': multiprocessing.cpu_count(),
        'disambi': False,
        'chunk_size': 100,
        'min_word_count': 5,
    }

    vocab = Dictionary.build(dump_db=db, **build_options)
    vocab.save(out_file)

In [None]:
# Build the dictionary from 'output.db' and save it as 'output_dic'.
build_dictionary(dump_db_file='output.db', out_file='output_dic')

## 5. Build Mention DB 

The build_mention_db command builds a database that contains the mappings from entity names (mentions) to their possible referent entities.

In [None]:
def build_mention_db(dump_db_file, dictionary_file, out_file):
    """Build the mention database and save it.

    Args:
        dump_db_file: Path used to open the ``DumpDB``.
        dictionary_file: Path passed to ``Dictionary.load``.
        out_file: Destination passed to ``MentionDB.save``.
    """
    db = DumpDB(dump_db_file)
    vocab = Dictionary.load(dictionary_file)

    # Fixed build settings for this notebook, gathered in one place.
    build_options = {
        'tokenizer': get_default_tokenizer(db.language),
        'min_link_prob': 0.2,
        'min_prior_prob': 0.01,
        'pool_size': multiprocessing.cpu_count(),
        'max_mention_len': 20,
        'chunk_size': 100,
        'case_sensitive': False,
    }

    mentions = MentionDB.build(db, vocab, **build_options)
    mentions.save(out_file)

In [None]:
# Build the mention DB from 'output.db' and 'output_dic'; save it as 'output_md'.
build_mention_db(dump_db_file='output.db', dictionary_file='output_dic', out_file='output_md')

## 6. Build Link Graph

The build_link_graph command generates a sparse matrix representing the link structure between Wikipedia entities.

In [2]:
def build_link_graph(dump_db_file, dictionary_file, out_file):
    """Build the link graph and save it.

    Args:
        dump_db_file: Path used to open the ``DumpDB``.
        dictionary_file: Path passed to ``Dictionary.load``.
        out_file: Destination passed to ``LinkGraph.save``.
    """
    db = DumpDB(dump_db_file)
    vocab = Dictionary.load(dictionary_file)
    # pool_size follows the CPU count reported for this machine.
    n_procs = multiprocessing.cpu_count()
    graph = LinkGraph.build(db, vocab, pool_size=n_procs, chunk_size=100)
    graph.save(out_file)

In [3]:
# Build the link graph from 'output.db' and 'output_dic'; save it as 'output_lg'.
build_link_graph(dump_db_file='output.db', dictionary_file='output_dic', out_file='output_lg')

100%|██████████| 24000/24000 [00:03<00:00, 6363.71it/s]


## 7. Learn Embeddings

The learn_embeddings command runs the training of the embeddings.

In [4]:
def learn_embeddings(dump_db_file, dictionary_file, out_file, link_graph_file=None, mention_db_file=None):
    """Train Wikipedia2Vec embeddings and save the resulting model.

    Args:
        dump_db_file: Path used to open the ``DumpDB``.
        dictionary_file: Path passed to ``Dictionary.load``.
        out_file: Destination passed to ``Wikipedia2Vec.save``.
        link_graph_file: Optional link graph path; when falsy, ``None`` is
            passed to training instead of a link graph.
        mention_db_file: Optional mention DB path; when falsy, ``None`` is
            passed to training instead of a mention DB.
    """
    corpus = DumpDB(dump_db_file)
    vocab = Dictionary.load(dictionary_file)

    # Both auxiliary resources are optional and only loaded when a path is given.
    graph = None
    if link_graph_file:
        graph = LinkGraph.load(link_graph_file, vocab)

    mentions = None
    if mention_db_file:
        mentions = MentionDB.load(mention_db_file, vocab)

    model = Wikipedia2Vec(vocab)

    # Fixed training settings for this notebook, gathered in one place.
    train_options = dict(
        tokenizer=get_default_tokenizer(corpus.language),
        sentence_detector=None,
        entity_neg_power=0.0,
        entities_per_page=10,
        dim_size=100,
        iteration=5,
        negative=5,
        pool_size=multiprocessing.cpu_count(),
        sample=0.0001,
        window=5,
        chunk_size=100,
        init_alpha=0.025,
        min_alpha=0.0001,
        word_neg_power=0.75,
    )
    model.train(corpus, graph, mentions, **train_options)

    model.save(out_file)

In [5]:
# Train on the artifacts built in the previous sections; save the model as 'final_output'.
learn_embeddings(
    dump_db_file='output.db',
    dictionary_file='output_dic',
    out_file='final_output',
    link_graph_file='output_lg',
    mention_db_file='output_md',
)

KeyboardInterrupt: 

## 8. Save Model

The save_model command outputs the model in a text format.

In [None]:
def save_model(model_file, out_file, out_format='default'):
    """Load a saved model and write it out in text form.

    Args:
        model_file: Path passed to ``Wikipedia2Vec.load``.
        out_file: Destination passed to ``save_text``.
        out_format: Format name forwarded to ``save_text``.
    """
    model = Wikipedia2Vec.load(model_file)
    model.save_text(out_file, out_format)

In [None]:
# Export the model stored in 'final_output' as text into 'final_output_text'.
save_model(model_file='final_output', out_file='final_output_text')

## 9. Load Model

The load_model command is used to load a pretrained model.

In [6]:
# Note: this is the `Wikipedia2vec` class imported from the `Wikipedia2vec`
# module, not `wikipedia2vec.wikipedia2vec.Wikipedia2Vec` used for training.
obj = Wikipedia2vec()
# Called last in the cell, so its return value is shown as the cell output.
obj.load_model()

array([['Constitution Party (United States)',
        '1.4314 0.6509 0.1369 -0.2897 0.9381 -0.7960 -0.2041 0.1075 -0.0845 -0.9855 1.0325 2.1194 0.0237 0.2636 0.2425 1.0091 0.5908 0.7337 1.0987 -0.5816 1.5382 0.0214 -0.6255 0.2021 -1.5465 0.6726 -0.0227 0.0372 -0.7585 -0.1627 0.5472 -0.2056 0.6648 -0.3341 -0.7990 0.7560 0.6875 -1.2692 -0.8705 0.3443 -0.9969 0.8561 -0.2194 1.7175 1.6140 -1.2135 -0.0415 0.9827 1.0512 -1.0306 -0.3996 0.6865 -0.3193 0.3552 -0.0496 1.5362 0.4726 -1.1567 -0.1246 -0.4665 0.5021 0.5424 -0.6211 -0.0685 -1.1872 -0.0262 0.1155 -0.5210 -0.6172 1.2290 -0.3012 -0.1781 -0.1648 -0.2855 1.1114 0.1631 0.5178 0.2111 -0.6711 -0.2107 1.5018 0.6680 1.2189 0.4314 0.4679 0.0888 -0.3454 0.4530 0.5873 0.2863 -0.1899 0.2594 0.3325 0.1133 0.2565 0.7823 -0.7421 0.0873 0.0167 1.3978'],
       ['Charles Kennedy',
        '0.8849 0.2273 -0.0471 0.1152 0.7337 -0.4782 -0.5643 -0.8389 -0.4170 -0.8331 1.6839 1.3857 0.0676 -0.0253 0.5190 0.2236 0.0446 0.3481 0.5023 0.1022 -0.0678 -0.6835 -

## 10. Evaluation

The evaluation metric used is Spearman's rank correlation coefficient, computed using cosine similarity.

In [7]:
# Evaluate using the object created in the previous section; the returned
# score is shown as the cell output.
obj.evaluate()

0.6255656104786236