In [1]:
import json

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
import gensim

In [4]:
import sklearn

### Building a corpus

In [19]:
corpus_en = []

In [4]:
### AAT 
with open('/Users/anesterov/reps/LODlit/AAT/aat_bows_en.json','r') as jf:
    aat_bows_en = json.load(jf)

In [22]:
def extend_corpus(existing_corpus:list, sorce_path:str) -> list:
    '''
    Adds search results from AAT, PWN, or Wikidata to the existing corpus
    Every unique search result (hit) is added to the corpus as a list of tokens (bow)
    NB! Modifies existing_corpus adding new lists
    Returns a str with info about how many new documents were added
    existing_corpus: list, 
    sorce_path: str, a path to search results (a json file) to extend the existing corpus with
    requires json library
    '''
    
    added_corpus = []
    
    # reading source
    with open(sorce_path,'r') as jf:
        sorce_bows = json.load(jf)
    
    # this list stores unique bow IDs
    # taking only unique entities, so there are no duplicate BoWs in the corpus
    ids_list = []
    for value in sorce_bows.values():
        for v in value:
            for id_value, bow in v.items():
                if id_value not in ids_list:
                    added_corpus.append(bow)
                    ids_list.append(id_value)
                    
    existing_corpus.extend(added_corpus)
    
    return (f"{len(added_corpus)} bows added")

In [27]:
extend_corpus(corpus_en,"/Users/anesterov/wd/jan31/wd_bows_en.json")

'169284 bows added'

In [28]:
len(corpus_en)

179304

### Building a test corpus (10% of corpus_en)

In [29]:
import random

In [33]:
test_corpus_en = []

idx_all = []

idx_aat = random.sample(range(0, 5000), 500)
idx_pwn = random.sample(range(6000, 10000), 500)
idx_wikidata = random.sample(range(10500, 160000), 17000)

idx_all.extend(idx_aat)
idx_all.extend(idx_pwn)
idx_all.extend(idx_wikidata)

In [35]:
for i in idx_all:
    test_corpus_en.append(corpus_en[i])

In [37]:
len(test_corpus_en)

18000

### Tagging training corpus

In [38]:
def tagging_corpus(corpus:list) -> list:
    '''
    Adding tags to corpus using gensim TaggedDocument
    Returns tagged corpus
    '''
    tagged_corpus = []
    
    tag = 0
    for bow in corpus:
        tag += 1
        tagged_corpus.append(gensim.models.doc2vec.TaggedDocument(bow, [tag]))
        
    return tagged_corpus

In [39]:
tagged_corpus_en = tagging_corpus(corpus_en)

In [40]:
tagged_corpus_en[1]

TaggedDocument(words=['chinese', 'coromandel', 'screen', 'refers', 'type', 'chinese', 'folding', 'screen', 'typically', 'large', 'tall', 'ten', 'foot', 'height', 'twenty', 'length', 'twelve', 'panel', 'lacquered', 'gilded', 'coromandel', 'screen', 'feature', 'incised', 'lacquer', 'decoration', 'often', 'wide', 'border', 'around', 'main', 'name', 'come', 'part', 'southeastern', 'indian', 'coast', 'near', 'madras', 'transfer', 'point', 'far', 'eastern', 'good', 'shipped', 'europe', 'east', 'india', 'coromandel', 'screen', 'mainly', 'made', 'china', 'century', 'european', 'chinese', 'coromandel', 'screen', 'chinese', 'coromandels', 'chinese', 'coromandel', 'coromandel', 'screen', 'coromandel', 'lacquer', 'screen', 'kuancai', 'screen'], tags=[2])

### Training the Doc2Vec model

In [41]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

2023-03-27 18:24:01,123 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>', 'datetime': '2023-03-27T18:24:01.087825', 'gensim': '4.3.0', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [42]:
model.build_vocab(tagged_corpus_en)

2023-03-27 18:24:23,957 : INFO : collecting all words and their counts
2023-03-27 18:24:23,958 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-03-27 18:24:24,040 : INFO : PROGRESS: at example #10000, processed 229988 words (2909127 words/s), 29763 word types, 0 tags
2023-03-27 18:24:24,065 : INFO : PROGRESS: at example #20000, processed 342363 words (4574504 words/s), 38122 word types, 0 tags
2023-03-27 18:24:24,087 : INFO : PROGRESS: at example #30000, processed 421303 words (3639632 words/s), 48651 word types, 0 tags
2023-03-27 18:24:24,111 : INFO : PROGRESS: at example #40000, processed 532096 words (4758306 words/s), 55023 word types, 0 tags
2023-03-27 18:24:24,135 : INFO : PROGRESS: at example #50000, processed 625950 words (3983566 words/s), 64293 word types, 0 tags
2023-03-27 18:24:24,158 : INFO : PROGRESS: at example #60000, processed 715488 words (4111955 words/s), 68767 word types, 0 tags
2023-03-27 18:24:24,181 : INFO : PROGRESS: at

In [49]:
# how many times a certain word appears in the corpus
model.wv.get_vecattr('black', 'count')

11747

In [50]:
model.train(tagged_corpus_en, total_examples=model.corpus_count, epochs=model.epochs)

2023-03-27 18:48:59,965 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 57097 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-03-27T18:48:59.965473', 'gensim': '4.3.0', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'train'}
2023-03-27 18:49:01,020 : INFO : EPOCH 0 - PROGRESS: at 14.47% examples, 364539 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:49:02,026 : INFO : EPOCH 0 - PROGRESS: at 32.26% examples, 329466 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:49:03,027 : INFO : EPOCH 0 - PROGRESS: at 49.73% examples, 318736 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:49:04,068 : INFO : EPOCH 0 - PROGRESS: at 67.18% examples, 311836 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:49:05,069 : INFO : EPOCH 0 - PROGRESS: at 86.23% examples, 305238 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:49:05,838 : INFO : EPOC

2023-03-27 18:50:05,320 : INFO : EPOCH 11 - PROGRESS: at 33.01% examples, 339257 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:50:06,345 : INFO : EPOCH 11 - PROGRESS: at 50.81% examples, 327103 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:50:07,347 : INFO : EPOCH 11 - PROGRESS: at 68.26% examples, 321371 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:50:08,373 : INFO : EPOCH 11 - PROGRESS: at 87.51% examples, 311110 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:50:09,022 : INFO : EPOCH 11: training on 1877097 raw words (1778961 effective words) took 5.7s, 311554 effective words/s
2023-03-27 18:50:10,046 : INFO : EPOCH 12 - PROGRESS: at 15.27% examples, 381695 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:50:11,099 : INFO : EPOCH 12 - PROGRESS: at 34.50% examples, 334617 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:50:12,111 : INFO : EPOCH 12 - PROGRESS: at 52.89% examples, 334592 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:50:13,112 : INFO : EPOCH 12 - PROGRESS: at 70.70% exa

2023-03-27 18:51:12,140 : INFO : EPOCH 22: training on 1877097 raw words (1779277 effective words) took 5.7s, 309528 effective words/s
2023-03-27 18:51:13,203 : INFO : EPOCH 23 - PROGRESS: at 16.05% examples, 376681 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:51:14,207 : INFO : EPOCH 23 - PROGRESS: at 34.50% examples, 336141 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:51:15,237 : INFO : EPOCH 23 - PROGRESS: at 50.72% examples, 320567 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:51:16,283 : INFO : EPOCH 23 - PROGRESS: at 68.93% examples, 315021 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:51:17,346 : INFO : EPOCH 23 - PROGRESS: at 88.72% examples, 306300 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:51:17,933 : INFO : EPOCH 23: training on 1877097 raw words (1777982 effective words) took 5.8s, 307014 effective words/s
2023-03-27 18:51:18,962 : INFO : EPOCH 24 - PROGRESS: at 15.27% examples, 380095 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:51:20,013 : INFO : EPOCH 24 - PR

2023-03-27 18:52:19,769 : INFO : EPOCH 34 - PROGRESS: at 68.93% examples, 319387 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:52:20,771 : INFO : EPOCH 34 - PROGRESS: at 89.28% examples, 315427 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:52:21,340 : INFO : EPOCH 34: training on 1877097 raw words (1779113 effective words) took 5.7s, 314458 effective words/s
2023-03-27 18:52:22,351 : INFO : EPOCH 35 - PROGRESS: at 14.47% examples, 376837 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:52:23,374 : INFO : EPOCH 35 - PROGRESS: at 33.01% examples, 335328 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:52:24,385 : INFO : EPOCH 35 - PROGRESS: at 50.81% examples, 326069 words/s, in_qsize 5, out_qsize 0
2023-03-27 18:52:25,453 : INFO : EPOCH 35 - PROGRESS: at 68.93% examples, 317439 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:52:26,465 : INFO : EPOCH 35 - PROGRESS: at 88.72% examples, 311276 words/s, in_qsize 6, out_qsize 0
2023-03-27 18:52:27,074 : INFO : EPOCH 35: training on 1877097 raw 

In [8]:
from gensim.test.utils import get_tmpfile

2023-03-30 13:43:04,303 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-03-30 13:43:04,304 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2023-03-30 13:43:04,340 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2023-03-30T13:43:04.305690', 'gensim': '4.3.0', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [6]:
from gensim.models.doc2vec import Doc2Vec

In [10]:
fname = get_tmpfile("doc2vec_for_literals_disambiguation")

In [12]:
fname

'/var/folders/hh/6wcbz37550q71080d_57dlyw0000gn/T/tmpgqs075ed/doc2vec_for_literals_disambiguation'

In [123]:
model.save(fname)

2023-03-27 19:34:22,967 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '/var/folders/hh/6wcbz37550q71080d_57dlyw0000gn/T/tmpny_bpeoi/doc2vec_for_literals_disambiguation', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-27T19:34:22.967074', 'gensim': '4.3.0', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
2023-03-27 19:34:22,968 : INFO : not storing attribute cum_table
2023-03-27 19:34:23,027 : INFO : saved /var/folders/hh/6wcbz37550q71080d_57dlyw0000gn/T/tmpny_bpeoi/doc2vec_for_literals_disambiguation


In [13]:
model = Doc2Vec.load(fname)

2023-03-30 13:43:39,926 : INFO : loading Doc2Vec object from /var/folders/hh/6wcbz37550q71080d_57dlyw0000gn/T/tmpgqs075ed/doc2vec_for_literals_disambiguation


FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/hh/6wcbz37550q71080d_57dlyw0000gn/T/tmpgqs075ed/doc2vec_for_literals_disambiguation'

In [52]:
model.infer_vector(['black', 'people', 'art'])

array([ 0.10006735, -0.09099598,  0.18476892,  0.05892054,  0.01515821,
       -0.00475166, -0.38807815, -0.13774407, -0.19049756,  0.17214076,
       -0.04740298, -0.13652608, -0.20511973, -0.16564879, -0.04708961,
       -0.11566802,  0.16306467,  0.07564908, -0.07143099,  0.11591461,
       -0.08985341,  0.11859614,  0.19277763,  0.1120984 ,  0.10341622,
        0.1801612 ,  0.10100152, -0.3975583 ,  0.05903649,  0.15550284,
       -0.13116123,  0.02946616,  0.15778813,  0.38174975, -0.03603979,
        0.03754644,  0.17478575,  0.27618977,  0.2038919 ,  0.360196  ,
        0.10085315,  0.44833648, -0.06487529,  0.11565264, -0.03100512,
       -0.25806347,  0.05614974, -0.03233728, -0.48763245, -0.02308856],
      dtype=float32)

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
# related matches
with open('/Users/anesterov/reps/LODlit/bg/rm_bows_all.json','r') as jf:
    rm_bows = json.load(jf)

In [57]:
rm_black = []
rm_black.extend(rm_bows["en"]["black"]["wikidata"])
rm_black.extend(rm_bows["en"]["black"]["aat"])
rm_black.extend(rm_bows["en"]["black"]["pwn"])

In [59]:
black = list(set(rm_black))

In [60]:
experiment_black = ['people', 'france', 'human', 'population', 'african', 'immigration', 'france', 'racial', 'multi', 'ethnic', 'group']

In [62]:
cosine_similarity(model.infer_vector(black).reshape(1, -1), model.infer_vector(experiment_black).reshape(1, -1))

array([[0.36727932]], dtype=float32)

In [72]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus_en) - 1)
inferred_vector = model.infer_vector(test_corpus_en[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus_en[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_corpus_en[sims[index][0]].words)))

Test Document (6142): «primitive strain short film film arthur berthelet»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (158216, 0.772796630859375): «back primitive short film film francis boggs otis turner»

MEDIAN (165660, 0.19055573642253876): «toyopán indigenous chieftaincy costa rica»

LEAST (115769, -0.5890066027641296): «electric funeral original song written composed performed black sabbath»



In [120]:
sims

[(158216, 0.772796630859375),
 (158282, 0.7629953026771545),
 (158484, 0.7434939742088318),
 (9737, 0.7417248487472534),
 (158468, 0.735752284526825),
 (9734, 0.7290006875991821),
 (158167, 0.727756142616272),
 (158217, 0.7245995998382568),
 (157987, 0.7135770320892334),
 (158211, 0.7120720148086548),
 (158187, 0.709780216217041),
 (158235, 0.7083865404129028),
 (158226, 0.7068758606910706),
 (158288, 0.6905967593193054),
 (158190, 0.684939980506897),
 (158179, 0.6816670298576355),
 (158269, 0.6815990209579468),
 (9834, 0.6755386590957642),
 (9771, 0.6667096018791199),
 (158672, 0.6601245999336243),
 (158207, 0.6588665246963501),
 (9793, 0.6550884246826172),
 (9751, 0.6447477340698242),
 (158328, 0.6410449743270874),
 (9766, 0.6346874833106995),
 (158718, 0.627562403678894),
 (158029, 0.6110966801643372),
 (158174, 0.6094334125518799),
 (158233, 0.6085729598999023),
 (9736, 0.6081785559654236),
 (158240, 0.6054738759994507),
 (158082, 0.6022983193397522),
 (4517, 0.6010955572128296),
 

In [73]:
import scipy

In [74]:
from scipy import spatial

In [100]:
exp_2 = ['people', 'race', 'human', 'categorization', 'person', 'color', 'man', 'woman', 'racialized', 'classification', 'people', 'usually', 'political', 'skin', 'color', 'based', 'category', 'specific', 'population', 'mid', 'dark', 'brown', 'complexion']

In [101]:
black_exp_2 = list(set(exp_2))

In [113]:
exp_3 = ['friday', 'song', 'rebecca']

In [115]:
exp_4 = ['unicode', 'medium', 'right', 'pointing', 'triangle', 'centred', 'character']

In [119]:
1 - spatial.distance.cosine(model.infer_vector(black), model.infer_vector(exp_4))

0.15534548461437225

In [131]:
len(set(ranks))

105686

In [132]:
#### Checking the test set

In [134]:
model.dv[1]

array([-0.3839839 , -1.0829922 ,  0.7284176 ,  0.5750398 ,  1.2946204 ,
        0.345347  ,  0.29598656,  0.41384223,  0.2510188 ,  0.03540819,
        0.05011204, -0.59316385,  0.12808923,  0.6006127 ,  0.18617561,
        1.1059247 ,  0.65699714,  0.078219  ,  0.01079204,  0.29751384,
        0.54070127,  0.42675695,  0.03375255,  0.42763433, -0.23321885,
       -0.09180884, -0.22085252,  0.3998829 ,  0.36516744, -0.50999606,
       -0.00400811, -0.2572781 , -1.4095058 ,  0.76877743, -0.09868379,
        0.9310032 , -0.22179867,  0.5913735 , -0.04960841,  0.45175368,
       -0.54530907,  0.04237929, -0.56336325,  0.13590619,  0.51679635,
        0.05895659,  0.03372746,  0.39298773,  0.21641119, -0.02584014],
      dtype=float32)

In [154]:
n_similar = []
for bow in test_corpus_en:
    inferred_vector = model.infer_vector(bow)
    most_similar_score = model.dv.most_similar([inferred_vector], topn=1)
    most_similar = tagged_corpus_en[most_similar_score[0][0]].words
    if bow == most_similar:
        n_similar.append(most_similar_score[0][0])

In [155]:
acc = len(n_similar) / len(test_corpus_en) * 100

In [157]:
len(n_similar)

205

In [208]:
len(test_corpus_en)

18000

In [165]:
related_match = model.infer_vector(['aboriginal','indigenous', 'people', 'population', 'group', 'people', 'ethnic', 'group', 'minority'])

In [166]:
for top in model.dv.most_similar([related_match], topn=10):
    print(tagged_corpus_en[top[0]].words)

['aboriginal', 'heritage', 'act', 'south', 'australian', 'legislation']
['bhramar', 'mukherjee', 'indian', 'american', 'biostatistician', 'data', 'scientist']
['bombay', 'rava']
['gokana', 'kana', 'nigeria', 'ethnic', 'group']
['kalinda', 'griffith', 'kalinda', 'griffith', 'kalinda', 'e', 'griffith', 'australian', 'indigenous', 'health', 'researcher']
['french', 'people', 'madagascar', 'ethnic', 'group']
['binza', 'watercourse', 'kasai', 'oriental', 'democratic', 'republic', 'congo']
['instance', 'macromolecular', 'complex', 'homo', 'sapiens', 'reactome', 'id', 'r', 'hsa']
['nkum', 'nigeria', 'ethnic', 'group']
['negro', 'romance', 'comic', 'book', 'series', 'comic', 'book']
