In [None]:
import os
import io
import sys
import pandas as pd
import numpy as np
import scipy
from scipy.spatial.distance import cosine
from gensim.models.word2vec import Word2Vec

## W2V

In [None]:
def get_word_freqs(words):
    """Count how often each word occurs.

    Args:
        words: Iterable of hashable tokens (e.g. a list of strings). It is
            traversed exactly once, so one-shot iterators and generators are
            counted correctly as well.

    Returns:
        dict mapping every distinct word to its number of occurrences, with
        keys in order of first appearance.
    """
    freqs = {}
    for word in words:
        freqs[word] = freqs.get(word, 0) + 1
    return freqs

In [None]:
def load_vector_dict(vec_fp, words):
    """Load the embeddings of selected words from a saved gensim Word2Vec model.

    Args:
        vec_fp: Path of a model file readable by ``Word2Vec.load``.
        words: Iterable of words whose vectors should be looked up.

    Returns:
        dict mapping every word found in the model's vocabulary to its vector.
        Words that are not in the vocabulary are left out, so the result may
        contain fewer entries than ``words``.
    """
    model = Word2Vec.load(vec_fp)

    vdict = {}
    for word in words:
        try:
            vdict[word] = model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: skipping it is deliberate. Any other
            # exception signals a real problem and is allowed to propagate.
            continue

    return vdict

In [None]:
def preprocess_text(text, export_fp, targets, filtered=True):
    """Filter one corpus by word frequency and sentence length, then save it.

    Every line of ``text`` is treated as one sentence of whitespace-separated
    words. Words that are not kept are removed from their sentence, and
    sentences left with fewer than two words are dropped entirely.

    Args:
        text: The whole corpus as a single string, one sentence per line.
        export_fp: Path of the output file; it is overwritten and always
            written as UTF-8 (the encoding ``train_word2vec`` reads).
        targets: Iterable of target words that are kept regardless of their
            frequency.
        filtered: If True, drop words occurring fewer than
            ``len(lines) // 50_000`` times (targets excepted). If False, keep
            every word and only apply the sentence-length filter.

    Returns:
        None. The result is written to ``export_fp``.
    """
    lines = text.split("\n")
    all_words = text.split()

    if filtered:
        # The threshold grows with corpus size: one required occurrence per
        # 50,000 lines. For shorter corpora it is 0 and no word is dropped.
        min_freq = len(lines) // 50_000

        word_freqs = get_word_freqs(all_words)
        keep_words = {word for word, freq in word_freqs.items() if freq >= min_freq}

        target_words = set(targets)
        missing_targets = target_words - keep_words
        if missing_targets:
            # Also reports targets that do not occur in the corpus at all.
            print("Keeping the following target words despite frequencies below {}:\n".format(min_freq), missing_targets)
            keep_words |= target_words
    else:
        keep_words = set(all_words)

    new_lines = []
    for line in lines:
        new_words = [word for word in line.split() if word in keep_words]
        # Keep only sentences that still contain at least two words.
        if len(new_words) > 1:
            new_lines.append(" ".join(new_words))

    # Explicit UTF-8: the platform default encoding may not be able to
    # represent the corpus and would not match the reader in train_word2vec.
    with open(export_fp, "w", encoding="utf-8") as fh:
        fh.write("\n".join(new_lines))

In [None]:
def preprocess_texts(c1_path, c2_path, c3_path, targets_path, experiment_dir):
    """Preprocess the three corpora and store them for Word2Vec training.

    Each corpus is passed through ``preprocess_text`` (with its default
    filtering) and written to ``<experiment_dir>preprocessed_texts/c1.txt``,
    ``c2.txt`` and ``c3.txt`` respectively.

    Args:
        c1_path: Path of the first corpus (read as UTF-8).
        c2_path: Path of the second corpus (read as UTF-8).
        c3_path: Path of the third corpus (read as UTF-8).
        targets_path: UTF-8 file with one target word per line; blank lines
            are ignored.
        experiment_dir: Output root. It must end with a path separator because
            sub-paths are built by plain string concatenation.

    Returns:
        None.
    """
    with open(targets_path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines so that "" never becomes a target word.
        targets = [word for word in stripped if word]

    prep_dir = experiment_dir + "preprocessed_texts/"
    os.makedirs(prep_dir, exist_ok=True)

    corpora = (("c1", c1_path), ("c2", c2_path), ("c3", c3_path))
    for name, corpus_path in corpora:
        # Same encoding as the targets file and as the files written and read
        # by the later pipeline steps; only one corpus is in memory at a time.
        with open(corpus_path, "r", encoding="utf-8") as fh:
            corpus = fh.read()
        preprocess_text(corpus, prep_dir + name + ".txt", targets)

In [None]:
def train_word2vec(experiment_dir, n_window=10, dim=300, **kwargs):
    """Train one Word2Vec model per preprocessed corpus and save it.

    Reads ``preprocessed_texts/c1.txt``, ``c2.txt`` and ``c3.txt`` below
    ``experiment_dir`` (one whitespace-tokenised sentence per line) and saves
    the trained models as ``word_representations/c1.vec``, ``c2.vec`` and
    ``c3.vec``.

    Args:
        experiment_dir: Experiment root. It must end with a path separator
            because sub-paths are built by plain string concatenation.
        n_window: Context window size, passed to ``Word2Vec`` as ``window``.
        dim: Embedding dimensionality, passed as ``vector_size``.
        **kwargs: Further keyword arguments forwarded unchanged to
            ``Word2Vec`` (for example ``seed`` or ``workers``). ``min_count``
            defaults to 1 unless given here. Do not pass ``window`` or
            ``vector_size``; use ``n_window`` and ``dim`` instead.

    Returns:
        None.
    """
    vec_dir = experiment_dir + "word_representations/"
    os.makedirs(vec_dir, exist_ok=True)

    prep_dir = experiment_dir + "preprocessed_texts/"

    # min_count=1 keeps every word of the preprocessed corpus in the
    # vocabulary; callers may still override it explicitly.
    kwargs.setdefault("min_count", 1)

    for name in ("c1", "c2", "c3"):
        with open(prep_dir + name + ".txt", 'r', encoding='utf-8') as fh:
            sentences = [line.split() for line in fh]

        model = Word2Vec(sentences, vector_size=dim, window=n_window, **kwargs)
        model.save(vec_dir + name + ".vec")

In [None]:
def intersection_align_gensim(m1, m2):
    """Restrict two gensim Word2Vec models to their shared vocabulary.

    After the call both models contain exactly the words present in both
    vocabularies, stored in the same order, so that row ``i`` of
    ``m1.wv.vectors`` and of ``m2.wv.vectors`` belongs to the same word.
    Words are ordered by their summed "count" attribute (most frequent
    first), ties alphabetically.

    Both models are modified in place. Only ``wv.vectors``,
    ``wv.key_to_index`` and ``wv.index_to_key`` are rebuilt and the cached
    ``wv.norms`` are invalidated; per-key attributes such as "count" are not
    re-indexed here.

    Args:
        m1: First model; needs a ``wv`` with the attributes named above.
        m2: Second model, same requirements.

    Returns:
        The tuple ``(m1, m2)`` (the same objects that were passed in).
    """
    # Nothing to do only if the vocabularies match *including their order*.
    # Equal word sets are not enough: each model sorts its vocabulary by its
    # own corpus frequencies, so the rows may still be permuted.
    if m1.wv.index_to_key == m2.wv.index_to_key:
        return (m1, m2)

    shared = set(m1.wv.index_to_key) & set(m2.wv.index_to_key)

    # Alphabetical pre-sort plus a stable sort gives a deterministic order
    # even when several words have the same summed frequency.
    common_vocab = sorted(shared)
    common_vocab.sort(
        key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"),
        reverse=True,
    )

    for m in (m1, m2):
        # Pick the rows of the shared words in the new common order. Look the
        # old indices up before the index mappings are replaced below.
        indices = np.asarray([m.wv.key_to_index[w] for w in common_vocab], dtype=int)
        m.wv.vectors = m.wv.vectors[indices]

        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

        # Norms cached for the old matrix no longer match the new rows.
        m.wv.norms = None

    assert m1.wv.key_to_index == m2.wv.key_to_index

    return (m1, m2)

In [None]:
def smart_procrustes_align_gensim(vec_dir, base_path, other_path):
    """
    Procrustes-align two saved gensim Word2Vec models so that the same word
    can be compared across them.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    Steps:
      1. Load ``<vec_dir><base_path>.vec`` and ``<vec_dir><other_path>.vec``.
      2. Intersect and order the vocabularies with `intersection_align_gensim`.
      3. Compute the orthogonal matrix from the SVD of ``other^T . base``
         (length-normalised vectors) and multiply the other model's
         vectors by it.
      4. Save both models as ``<vec_dir><base><d>_based_<name>_aligned.vec``,
         where ``<d>`` is the second character of ``other_path`` and ``<name>``
         is ``base_path`` or ``other_path`` respectively.

    Args:
        vec_dir: Directory prefix of the model files (joined by concatenation,
            so it must end with a path separator).
        base_path: File stem of the reference model, e.g. "c1".
        other_path: File stem of the model to rotate, e.g. "c2". Must have at
            least two characters because of the naming scheme above.

    Returns:
        The aligned ``other`` model (it is also saved to disk).
    """

    base_embed = Word2Vec.load(vec_dir + base_path + ".vec")
    other_embed = Word2Vec.load(vec_dir + other_path + ".vec")

    # make sure vocabulary and indices are aligned (modifies both in place)
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # Orthogonal Procrustes: SVD of the cross-covariance matrix ...
    m = other_vecs.T.dot(base_vecs)
    # ... numpy's svd returns the right factor already transposed ...
    u, _, v = np.linalg.svd(m)
    # ... so u.dot(v) is the orthogonal transformation applied below.
    ortho = u.dot(v)
    # The transformation is applied to the raw (not normalised) vectors.
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    out_prefix = vec_dir + base_path + other_path[1] + "_based_"
    in_base_embed.save(out_prefix + base_path + "_aligned.vec")
    other_embed.save(out_prefix + other_path + "_aligned.vec")

    return other_embed

In [None]:
def align_embeddings(vec_dir):
    """
    Run the Procrustes alignment for every pair of corpus models.

    The pairs are processed in the order (c1, c2), (c2, c3), (c1, c3); the
    first name of each pair is passed as the base model and the second as the
    model to be aligned. ``vec_dir`` is handed through unchanged.
    """

    model_pairs = (("c1", "c2"), ("c2", "c3"), ("c1", "c3"))

    for base_name, other_name in model_pairs:
        smart_procrustes_align_gensim(vec_dir, base_name, other_name)

In [None]:
def compare_context_free_representations(targets_path, model1_path, model2_path):
    """Compute the cosine distance of each target word between two models.

    Args:
        targets_path: UTF-8 file with one target word per line.
        model1_path: Path of the first model, loaded via `load_vector_dict`.
        model2_path: Path of the second model, loaded the same way.

    Returns:
        A list of ``{"word": ..., "change": ...}`` dicts in target-file order,
        where "change" is the cosine distance between the word's two vectors.
        Targets without a vector in either model are left out.
    """

    with open(targets_path, 'r', encoding='utf-8') as f:
        targets = [line.strip() for line in f.readlines()]

    first_vectors = load_vector_dict(model1_path, targets)
    second_vectors = load_vector_dict(model2_path, targets)

    dists = []
    for target in targets:
        # Only words that both models know can be compared.
        if target not in first_vectors or target not in second_vectors:
            continue
        change = cosine(first_vectors[target], second_vectors[target])
        dists.append({"word": target, "change": change})

    return dists

In [None]:
def compare_all_representations(targets_path, vec_dir):
    """
    Pairwise comparison of the aligned models for all target words.

    For each model pair (c1-c2, c2-c3, c1-c3) the aligned model files written
    by the alignment step are compared with
    `compare_context_free_representations`, and one row per target word is
    written to ``<vec_dir>results.csv`` (tab-separated, no header, no index).

    Row layout: word, "presov-sov" (c1 vs c2), "sov-postsov" (c2 vs c3),
    "presov-postsov" (c1 vs c3). The labels presumably refer to the
    pre-Soviet, Soviet and post-Soviet corpora; confirm against the data.

    A target that could not be compared for some pair (because it is missing
    from one of that pair's models) gets NaN in that column, which pandas
    writes as an empty field. Blank lines in the targets file are ignored.

    Args:
        targets_path: UTF-8 file with one target word per line.
        vec_dir: Directory prefix of the aligned models and of the result
            file (joined by concatenation, so it must end with a separator).

    Returns:
        None.
    """

    # (base model, other model, result column)
    pairs = [
        ("c1", "c2", "presov-sov"),
        ("c2", "c3", "sov-postsov"),
        ("c1", "c3", "presov-postsov"),
    ]
    columns = [column for _, _, column in pairs]

    with open(targets_path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        targets = [word for word in stripped if word]

    # Store distances per named column instead of by list position, so a
    # missing value can neither shift later columns nor raise IndexError.
    missing = float("nan")
    dists = {target: {column: missing for column in columns} for target in targets}

    for base, other, column in pairs:
        # File naming must match what the alignment step writes.
        prefix = vec_dir + base + other[1] + "_based_"
        model1_path = prefix + base + "_aligned.vec"
        model2_path = prefix + other + "_aligned.vec"
        new_dists = compare_context_free_representations(targets_path, model1_path, model2_path)

        for item in new_dists:
            if item["word"] in dists:
                dists[item["word"]][column] = item["change"]

    results = [{"word": word, **changes} for word, changes in dists.items()]
    frame = pd.DataFrame(results, columns=["word"] + columns)
    frame.to_csv(vec_dir + "results.csv", sep='\t', index=False, header=False)

## Запуск

In [None]:
!mkdir /content/dataset_dir
!mkdir /content/experiment_dir

mkdir: cannot create directory ‘/content/dataset_dir’: File exists
mkdir: cannot create directory ‘/content/experiment_dir’: File exists


In [None]:
# Download the three corpora into the dataset directory. Note the renaming:
# the remote files corpus1/2/3.txt are stored locally as corpus19/20/21.txt.
!wget -O /content/dataset_dir/corpus19.txt https://raw.githubusercontent.com/Timofeidedov/NLP_Finalproj/main/corpus1.txt
!wget -O /content/dataset_dir/corpus20.txt https://raw.githubusercontent.com/Timofeidedov/NLP_Finalproj/main/corpus2.txt
!wget -O /content/dataset_dir/corpus21.txt https://raw.githubusercontent.com/Timofeidedov/NLP_Finalproj/main/corpus3.txt

--2024-03-24 19:05:47--  https://raw.githubusercontent.com/Timofeidedov/NLP_Finalproj/main/corpus1.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1887276 (1.8M) [text/plain]
Saving to: ‘/content/dataset_dir/corpus19.txt’


2024-03-24 19:05:48 (50.1 MB/s) - ‘/content/dataset_dir/corpus19.txt’ saved [1887276/1887276]

--2024-03-24 19:05:48--  https://raw.githubusercontent.com/Timofeidedov/NLP_Finalproj/main/corpus2.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1723561 (1.6M) [text/plain]
Saving to: ‘/content/dataset_dir/c

In [None]:
# Download the annotated test set of the rushifteval_public repository and
# store it as targets.tsv in the dataset directory.
!wget -O /content/dataset_dir/targets.tsv https://raw.githubusercontent.com/akutuzov/rushifteval_public/main/annotated_testset.tsv

--2024-03-24 19:05:49--  https://raw.githubusercontent.com/akutuzov/rushifteval_public/main/annotated_testset.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6509 (6.4K) [text/plain]
Saving to: ‘/content/dataset_dir/targets.tsv’


2024-03-24 19:05:49 (49.0 MB/s) - ‘/content/dataset_dir/targets.tsv’ saved [6509/6509]



In [None]:
# Read the header-less TSV, take its first column (the target words) and
# store the words one per line for the pipeline functions.
targets_fp = "/content/dataset_dir/targets.tsv"
target_words_fp = "/content/dataset_dir/target_words.txt"

targets_df = pd.read_csv(
    targets_fp,
    sep='\t',
    names=["word", "rate1", "rate2", "rate3"],
)
target_words = targets_df["word"].tolist()

with open(target_words_fp, 'w', encoding="utf-8") as f_target_words:
    f_target_words.write('\n'.join(target_words))

In [None]:
# Inputs of the pipeline: the three downloaded corpora and the plain list of
# target words (one per line).
c1_path = "/content/dataset_dir/corpus19.txt"
c2_path = "/content/dataset_dir/corpus20.txt"
c3_path = "/content/dataset_dir/corpus21.txt"
targets_path = "/content/dataset_dir/target_words.txt"
# Output locations. Both end with "/" because the pipeline functions build
# their sub-paths by plain string concatenation.
experiment_dir = "/content/experiment_dir/"
vec_dir = experiment_dir + "word_representations/"

In [None]:
# Step 1: filter the three corpora and write them to
# <experiment_dir>preprocessed_texts/c1.txt, c2.txt and c3.txt.
preprocess_texts(c1_path, c2_path, c3_path, targets_path, experiment_dir)

In [None]:
# Step 2: train one Word2Vec model per preprocessed corpus with the default
# settings (window 10, 300 dimensions) and save them below vec_dir.
train_word2vec(experiment_dir)

In [None]:
# Step 3: Procrustes-align the saved models pairwise (c1-c2, c2-c3, c1-c3).
align_embeddings(vec_dir)

In [None]:
# Step 4: compute the cosine distance of every target word for each aligned
# model pair and write them to <vec_dir>results.csv (tab-separated, no header).
compare_all_representations(targets_path, vec_dir)