In [28]:
import itertools
import random, math
import re
from typing import List, Dict

from collections import OrderedDict, Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from nltk.corpus import brown, gutenberg
from nltk.probability import FreqDist
from nltk.corpus import stopwords


# Corpus

In [2]:
# Show the fourth Gutenberg file id; the same expression selects the text used to build the corpus.
gutenberg.fileids()[3]

'bible-kjv.txt'

In [3]:
# Tokenised sentences of the fourth Gutenberg text.
samples = gutenberg.sents(gutenberg.fileids()[3])
# A token is kept only if it consists purely of ASCII letters.
pattern = re.compile("[A-Za-z]+")
stop_w = set(stopwords.words('english'))

corpus = []
for raw_sentence in samples:
    # Same filter order as three separate passes, streamed lazily:
    # lowercase -> drop stop words -> replace newlines -> keep alphabetic tokens.
    lowered = (token.lower() for token in raw_sentence)
    without_stops = (token.replace('\n', ' ') for token in lowered if token not in stop_w)
    sent = [token for token in without_stops if pattern.fullmatch(token)]
    # Keep only sentences that still have more than five tokens.
    if len(sent) > 5:
        corpus.append(sent)

In [4]:
# Number of sentences that survived filtering (each has more than five tokens).
len(corpus)

25481

## Phrase detection (bigrams, then trigrams)

In [16]:
# NOTE: this cell rebinds `corpus`. Re-running it without first re-running the
# corpus-building cell applies phrase merging on top of already merged tokens.

# Pass 1: learn two-word collocations and merge them into single `a_b` tokens.
bigram = Phrases(corpus, min_count=5, threshold=3)
bigram_phraser = Phraser(bigram)
# `phraser[corpus]` is a lazy stream that redoes the merge on every iteration.
# Materialise it once so later passes (trigram fit, vocab build, every training
# epoch) read plain token lists, and so `corpus[i]` is ordinary list indexing.
corpus = list(bigram_phraser[corpus])

# Pass 2: run the same detector over the bigram-merged sentences so that
# three-word phrases (e.g. `a_b_c`) can form.
trigram = Phrases(corpus, min_count=5, threshold=3)
trigram_phraser = Phraser(trigram)
corpus = list(trigram_phraser[corpus])

# Train w2v

In [20]:
# Hyper-parameters gathered in one place so they are easy to scan and tweak.
w2v_params = dict(
    min_count=3,
    window=5,
    size=100,
    alpha=0.005,
    min_alpha=0.0007,
    hs=1,   # hierarchical softmax: the scoring helpers below read its `syn1` weights
    sg=0,   # CBOW rather than skip-gram
    workers=4,
    batch_words=100,
    cbow_mean=1,
)
w2v_model = Word2Vec(**w2v_params)
w2v_model.build_vocab(corpus)  # build huffman tree

In [21]:
# Train for 50 epochs over `corpus`; the call's return value is echoed as the cell output.
w2v_model.train(
        corpus,
        total_examples=w2v_model.corpus_count,
        epochs=50,
        report_delay=1)

(13122157, 14494800)

In [22]:
# Sanity check: list the ten nearest neighbours of "christ" in the trained embedding space.
w2v_model.wv.most_similar("christ", topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('jesus_christ', 0.8696889877319336),
 ('christ_jesus', 0.844677746295929),
 ('faith', 0.8157445192337036),
 ('gospel', 0.8098512291908264),
 ('believe', 0.788714587688446),
 ('justified', 0.7848519682884216),
 ('faithful', 0.7752644419670105),
 ('lord_jesus_christ', 0.7724319696426392),
 ('world', 0.7681786417961121),
 ('circumcision', 0.7642426490783691)]

In [45]:
def cal_log_probs(model, target_w, context_embd: np.ndarray) -> float:
    """Sum the log-probabilities of the binary decisions on ``target_w``'s tree path.

    Each node on the path contributes ``log(sigmoid(sign * <context, node>))``,
    where ``sign`` is ``+1`` for a code bit of 0 and ``-1`` for a code bit of 1.

    Args:
        model: Object exposing ``model.trainables.syn1``, the matrix whose rows
            are selected by ``target_w.point``.
        target_w: Vocabulary entry exposing ``code`` (sequence of 0/1 bits) and
            ``point`` (row indices into ``syn1``), one pair per path node.
        context_embd: Context embedding that is dotted with every path row.

    Returns:
        The summed log-probability as a scalar (always <= 0).
    """
    # Coerce the code so a plain list of bits works as well as a numpy array;
    # `(-1.0) ** list` would raise a TypeError.
    turns = (-1.0) ** np.asarray(target_w.code)
    path_embd = model.trainables.syn1[target_w.point]
    # log(sigmoid(z)) == -log(1 + exp(-z)); logaddexp evaluates it without overflow.
    log_probs = -np.logaddexp(0, -turns * np.dot(context_embd, path_embd.T))
    return np.sum(log_probs)

def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` for a scalar ``x``.

    The branch on the sign of ``x`` keeps the exponent non-positive, so very
    negative inputs return ``0.0`` instead of raising ``OverflowError``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For x < 0, exp(-x) can exceed the float range; exp(x) only underflows to 0.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

def _cal_keyword_score(model, sentence:List[str]) -> Dict[str, float]:
    word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab]
    
    word_importance = {}
    for pos_center, center_w in enumerate(word_vocabs):
        context_w_indices = [w.index for pos_w, w in enumerate(word_vocabs) if pos_center != pos_w]
        context_embed = np.mean(model.wv.vectors[context_w_indices], axis=0)
        log_probs = cal_log_probs(model, center_w, context_embed)
        
        center_w_term = w2v_model.wv.index2word[center_w.index]
        word_importance[center_w_term] = word_importance.get(center_w_term, 0) + log_probs
    return word_importance

def cal_keyword_score(model, sentence: List[str]) -> pd.Series:
    """Return min-max scaled keyword scores for the words of ``sentence``.

    The raw scores from ``_cal_keyword_score`` are sorted from highest to lowest
    and rescaled linearly into ``[0.1, 1]``, so the best-predicted word gets 1.0
    and the worst 0.1.

    Args:
        model: Trained model forwarded to ``_cal_keyword_score``.
        sentence: Tokens to score.

    Returns:
        A ``pd.Series`` indexed by term, sorted by descending score. It is empty
        when no word could be scored.
    """
    word_importance = _cal_keyword_score(model, sentence)
    if not word_importance:
        # Nothing to scale; the scaler cannot be fitted on zero samples.
        return pd.Series(dtype=float)

    ds = pd.Series(word_importance).sort_values(ascending=False)

    scaler = MinMaxScaler(feature_range=(0.1, 1))
    # The scaler expects a 2-D (n_samples, n_features) array: one column of scores.
    scaled = scaler.fit_transform(ds.to_numpy().reshape(-1, 1))
    return pd.Series(scaled.ravel(), index=ds.index)

def weighted_sum_w2v(model, ds: pd.Series) -> np.ndarray:
    """Combine word vectors into one vector using normalised scores as weights.

    Args:
        model: Trained model; ``model.wv[terms]`` must return one row per term.
        ds: Scores indexed by term (e.g. the output of ``cal_keyword_score``).
            The series is not modified.

    Returns:
        1-D array: the sum over terms of ``vector(term) * score / sum(scores)``.
    """
    # Normalise so the weights sum to 1.
    weights = (ds / ds.sum()).to_numpy()
    # Look the vectors up in the model that was passed in, not in a notebook global.
    vectors = model.wv[ds.index]
    # Broadcast one weight per row, then collapse the rows.
    return np.sum(vectors * weights[:, np.newaxis], axis=0)

In [46]:
# Flatten the tokens of sentences 998 and 999 into a single list.
sent = [token for sentence in corpus[998:1000] for token in sentence]
len(sent)

19

In [80]:
# Score the keywords of one example sentence.
sent = corpus[535]

ds = cal_keyword_score(w2v_model, sent)
# Print with separate statements: combining the two prints in one tuple
# expression makes the cell echo a meaningless `(None, None)` as its result.
print(sent)
print(ds)

['unto', 'sons', 'concubines', 'abraham', 'abraham', 'gave', 'gifts', 'sent_away', 'isaac', 'son', 'yet', 'lived', 'eastward', 'unto', 'east', 'country']
sons          1.000000
son           0.954965
isaac         0.903312
gave          0.781142
yet           0.689830
lived         0.501238
sent_away     0.460567
concubines    0.450501
gifts         0.429287
abraham       0.339437
country       0.326302
eastward      0.201623
unto          0.100229
east          0.100000
dtype: float64


(None, None)

In [111]:
# Collapse the sentence into a single vector, weighting each word vector by its keyword score.
weighted_vector = weighted_sum_w2v(w2v_model, ds)
weighted_vector.shape

(100,)

In [112]:
# L2 norm of the sentence vector; keepdims=True keeps it as a length-1 array.
l2norm = np.linalg.norm(weighted_vector, 2, axis=0, keepdims=True)
l2norm

array([6.57048685])

In [113]:
# Scale the sentence vector to unit length, then recompute the norm to confirm it is 1.
norm_weighted_vector = weighted_vector / l2norm
np.linalg.norm(norm_weighted_vector, 2, axis=0, keepdims=True)

array([1.])