4.1 (8 points) Evaluate the word vectors (EVALWS) corresponding to the three ways of computing vectors (counts, IDF, and PMI), three values of w (1, 3, and 6), and two context vocabularies
(vocab-15kws.txt and vocab-5k.txt). For all cases, use vocab-15kws.txt for V. Report the
results (there should be 36 correlations in all) and describe your findings. What happens as window
size changes for different methods of creating word vectors? What happens when context vocabulary
changes? Why do you think you observe the trends you see? Do you see the same trends for MEN and
SimLex or do they differ?

For Count method

In [None]:
from collections import defaultdict
import math
from scipy.stats import rankdata

def find_indexes(arr, words_set):
    """Locate every token of ``arr`` that is a member of ``words_set``.

    Args:
        arr: Sequence of tokens to scan.
        words_set: Container supporting ``in``; tokens found in it are kept.

    Returns:
        A list of ``(position, token)`` tuples, in the order the tokens
        occur in ``arr``.
    """
    hits = []
    for position, token in enumerate(arr):
        if token in words_set:
            hits.append((position, token))
    return hits

def word_vector_count(w, vocab, vocab_context, lines):
    """Count co-occurrences of target words with nearby context words.

    Each line is split on whitespace. For every token that belongs to
    ``vocab``, the up-to-``w`` tokens on either side (clipped at the line
    boundaries, centre token excluded) are inspected, and each one that
    belongs to ``vocab_context`` increments the count for the
    ``(target, context)`` pair.

    Args:
        w: Window radius, i.e. number of tokens considered on each side.
        vocab: Iterable of target words.
        vocab_context: Iterable of context words.
        lines: Iterable of text lines.

    Returns:
        A ``defaultdict(int)`` mapping ``(target, context)`` to its count.
    """
    targets = set(vocab)
    contexts = set(vocab_context)
    pair_counts = defaultdict(int)

    for sentence in lines:
        tokens = sentence.split()
        n_tokens = len(tokens)

        for centre, target in find_indexes(tokens, targets):
            # Slices on both sides of the centre token, clipped to the line.
            left = tokens[max(0, centre - w):centre]
            right = tokens[centre + 1:min(n_tokens, centre + w + 1)]

            for neighbour in left + right:
                if neighbour not in contexts:
                    continue
                pair_counts[(target, neighbour)] += 1

    return pair_counts

In [None]:
from collections import defaultdict
import math
from scipy.stats import rankdata
def make_word_pairs(path):
    """Read a word-similarity dataset into a pair list and a score lookup.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must hold exactly three whitespace-separated
    fields: ``word1 word2 score``. Blank lines (for example a trailing
    newline at the end of the file) are ignored.

    Args:
        path: Path of the dataset file.

    Returns:
        A tuple ``(word_pairs, word_pair_scores)``: the list of
        ``(word1, word2)`` tuples in file order, and a dict mapping each
        pair to its score as a float.

    Raises:
        ValueError: If a non-blank data line does not have exactly three
            fields, or its score is not a valid float.
    """
    word_pairs = []
    word_pair_scores = {}

    with open(path, encoding="utf-8") as f3:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f3, None)
        for line in f3:
            fields = line.split()
            if not fields:
                continue
            word1, word2, score = fields
            pair = (word1, word2)
            word_pairs.append(pair)
            word_pair_scores[pair] = float(score)

    return word_pairs, word_pair_scores

In [None]:
from collections import defaultdict
import math
from scipy.stats import rankdata

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two sparse vectors.

    Args:
        vec1: Dict mapping a dimension key to its numeric weight.
        vec2: Dict of the same form; keys absent from it count as 0.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (including empty vectors).
    """
    overlap = sum(weight * vec2.get(key, 0) for key, weight in vec1.items())
    lengths = [
        math.sqrt(sum(weight ** 2 for weight in vec.values()))
        for vec in (vec1, vec2)
    ]

    # A zero-length vector has no direction; report no similarity.
    if not all(lengths):
        return 0.0

    return overlap / (lengths[0] * lengths[1])

In [None]:
from collections import defaultdict
import math
from scipy.stats import rankdata


def spearman_rank_correlation(predicted_similarities, actual_scores):
    """Compute Spearman's rank correlation between predicted and gold scores.

    Only pairs present in both mappings are used. Values are converted to
    ranks with ``rankdata`` (tied values share their average rank) and the
    result is the Pearson correlation of the two rank sequences. Unlike the
    shortcut ``1 - 6 * sum(d**2) / (n * (n**2 - 1))``, this stays correct
    when ranks are tied; without ties the two formulas agree.

    Args:
        predicted_similarities: Dict mapping a pair to its predicted score.
        actual_scores: Dict mapping a pair to its reference score.

    Returns:
        ``None`` if the mappings share no pair; ``nan`` if the correlation
        is undefined (fewer than two shared pairs, or one side is constant);
        otherwise the correlation as a float in ``[-1, 1]``.
    """
    shared = [pair for pair in actual_scores if pair in predicted_similarities]
    if not shared:
        return None

    predicted_ranks = rankdata([predicted_similarities[pair] for pair in shared])
    actual_ranks = rankdata([actual_scores[pair] for pair in shared])

    n = len(shared)
    # Average ranks always sum to n * (n + 1) / 2, so both rank sequences
    # have this same mean regardless of ties.
    mean_rank = (n + 1) / 2
    predicted_dev = [float(rank) - mean_rank for rank in predicted_ranks]
    actual_dev = [float(rank) - mean_rank for rank in actual_ranks]

    covariance = sum(p * a for p, a in zip(predicted_dev, actual_dev))
    predicted_ss = sum(p * p for p in predicted_dev)
    actual_ss = sum(a * a for a in actual_dev)

    # A constant sequence has no rank variation; correlation is undefined.
    if predicted_ss == 0 or actual_ss == 0:
        return float("nan")

    return covariance / math.sqrt(predicted_ss * actual_ss)

In [None]:
from collections import defaultdict
import math
from scipy.stats import rankdata

def load_file(path, encoding="utf-8"):
    """Read a text file and return its lines without line terminators.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        A list of strings, one per line; an empty file yields ``[]``.
    """
    with open(path, "r", encoding=encoding) as file:
        # NOTE: str.splitlines() also breaks on other line boundaries
        # (e.g. form feed, U+2028), not only "\n".
        return file.read().splitlines()

def count_lines(Vc, lines):
    """Count the lines and, per context word, the lines that contain it.

    Args:
        Vc: Container of context words (membership is tested with ``in``).
        lines: Iterable of text lines; each is split on whitespace.

    Returns:
        A tuple ``(total_lines, context_word_line_counts)``. The second item
        is a ``defaultdict(int)`` mapping each context word to the number of
        lines in which it occurs at least once.
    """
    lines_per_word = defaultdict(int)
    seen = 0

    for seen, text in enumerate(lines, start=1):
        # A set ensures a word repeated within one line is counted once.
        for token in set(text.split()):
            if token in Vc:
                lines_per_word[token] += 1

    return seen, lines_per_word

# Experiment grid: window sizes, evaluation datasets, context vocabularies
# and the names of the vector-weighting methods.
context_window_sizes = [1, 3, 6]
word_pairs_paths = ["/men.txt", "/simlex-999.txt"]
context_vocab_paths = ["/vocab-15kws.txt", "/vocab-5k.txt"]
methods = {"count", "tf-idf", "pmi"}

wiki_lines = load_file("/wiki-1percent.txt")

# The target vocabulary V is the same for every configuration, so it is read
# once here instead of once per (dataset, context vocabulary, window) run.
vocab = load_file("/vocab-15kws.txt")

# Count method: each vector entry is the raw co-occurrence count.
for path in word_pairs_paths:
    word_pairs, word_pair_scores = make_word_pairs(path)

    for context_vocab in context_vocab_paths:
        vocab_context = load_file(context_vocab)

        for w in context_window_sizes:
            # Regroup the flat (word, context_word) -> count mapping into
            # one sparse vector (dict) per target word.
            word_vectors = defaultdict(dict)
            my_dict = word_vector_count(w, vocab, vocab_context, wiki_lines)
            for (word, context_word), count in my_dict.items():
                word_vectors[word][context_word] = count

            # A word without a vector is compared as an empty vector, which
            # cosine_similarity scores as 0.0.
            cosine_similarities = {pair: cosine_similarity(word_vectors.get(pair[0], {}), word_vectors.get(pair[1], {}))
                                   for pair in word_pairs}

            spearman_rho = spearman_rank_correlation(cosine_similarities, word_pair_scores)
            print(f"Spearman's ρ for path = {path}, context_vocab = {context_vocab}, w = {w}, method = count: {spearman_rho}")


Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 1, method = count: 0.2066309206256578
Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 3, method = count: 0.22100748000083115
Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 6, method = count: 0.2371355346817261
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 1, method = count: 0.20932403959155998
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 3, method = count: 0.22536738248526467
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 6, method = count: 0.2412897444766383
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 1, method = count: 0.07002953354155761
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 3, method = count: 0.05715835575054007
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 6, method = count: 0.0406700096

4.1) For TF-IDF Method

In [None]:
# TF-IDF method: each co-occurrence count is multiplied by the context word's
# IDF, computed here as the plain ratio
# (number of corpus lines) / (number of lines containing the context word).

# The target vocabulary V is the same for every configuration; read it once.
vocab = load_file("/vocab-15kws.txt")

for path in word_pairs_paths:
    word_pairs, word_pair_scores = make_word_pairs(path)

    for context_vocab in context_vocab_paths:
        vocab_context = load_file(context_vocab)
        # Line frequencies depend only on the context vocabulary, not on the
        # window size, so they are computed once per context vocabulary.
        total_lines, count_context_word_lines = count_lines(set(vocab_context), wiki_lines)

        for w in context_window_sizes:
            word_vectors = defaultdict(dict)
            my_dict = word_vector_count(w, vocab, vocab_context, wiki_lines)
            for (word, context_word), count in my_dict.items():
                lines_with_word = count_context_word_lines[context_word]
                # Guard against division by zero for a context word with no
                # recorded line count.
                idf = (total_lines / lines_with_word) if lines_with_word > 0 else 0
                word_vectors[word][context_word] = count * idf

            # A word without a vector is compared as an empty vector, which
            # cosine_similarity scores as 0.0.
            cosine_similarities = {pair: cosine_similarity(word_vectors.get(pair[0], {}), word_vectors.get(pair[1], {}))
                                   for pair in word_pairs}

            spearman_rho = spearman_rank_correlation(cosine_similarities, word_pair_scores)
            print(f"Spearman's ρ for path = {path}, context_vocab = {context_vocab}, w = {w}, method = tf-idf: {spearman_rho}")

Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 1, method = tf-idf: 0.3663543634838181
Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 3, method = tf-idf: 0.48110488290054254
Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 6, method = tf-idf: 0.5252486262498474
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 1, method = tf-idf: 0.3477510394167822
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 3, method = tf-idf: 0.47297401866377986
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 6, method = tf-idf: 0.5325364850596095
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 1, method = tf-idf: 0.18721373377385397
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 3, method = tf-idf: 0.14786738341547956
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 6, method = tf-idf: 0.10

4.1) For PMI method

In [None]:
# PMI method: each vector entry is log2( p(x, y) / (p(x) * p(y)) ), where the
# probabilities are estimated from the windowed co-occurrence counts.

# The target vocabulary V is the same for every configuration; read it once.
vocab = load_file("/vocab-15kws.txt")

for path in word_pairs_paths:
    word_pairs, word_pair_scores = make_word_pairs(path)

    for context_vocab in context_vocab_paths:
        vocab_context = load_file(context_vocab)

        for w in context_window_sizes:
            word_vectors = defaultdict(dict)
            my_dict = word_vector_count(w, vocab, vocab_context, wiki_lines)
            total_count_N = sum(my_dict.values())
            joint_probabilities = {pair: count / total_count_N for pair, count in my_dict.items()}

            # Marginal counts in a single pass over the observed pairs.
            # Every key of my_dict is a (target, context) pair drawn from
            # V x Vc, so this gives the same marginals as summing over the
            # full vocabularies without probing every |V| x |Vc| cell.
            word_totals = defaultdict(int)
            context_totals = defaultdict(int)
            for (word, context_word), count in my_dict.items():
                word_totals[word] += count
                context_totals[context_word] += count

            partial_prob_x = {word: total / total_count_N for word, total in word_totals.items()}
            partial_prob_y = {cw: total / total_count_N for cw, total in context_totals.items()}

            for (word, context_word), joint_prob in joint_probabilities.items():
                pmi = math.log2(joint_prob / (partial_prob_x[word] * partial_prob_y[context_word])) if joint_prob > 0 else 0
                word_vectors[word][context_word] = pmi

            # A word without a vector is compared as an empty vector, which
            # cosine_similarity scores as 0.0.
            cosine_similarities = {pair: cosine_similarity(word_vectors.get(pair[0], {}), word_vectors.get(pair[1], {}))
                                   for pair in word_pairs}

            spearman_rho = spearman_rank_correlation(cosine_similarities, word_pair_scores)
            print(f"Spearman's ρ for path = {path}, context_vocab = {context_vocab}, w = {w}, method = pmi: {spearman_rho}")

Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 1, method = pmi: 0.4703925852658428
Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 3, method = pmi: 0.519534365170485
Spearman's ρ for path = /men.txt, context_vocab = /vocab-15kws.txt, w = 6, method = pmi: 0.5275549925061103
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 1, method = pmi: 0.43376961586329066
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 3, method = pmi: 0.46578947742105303
Spearman's ρ for path = /men.txt, context_vocab = /vocab-5k.txt, w = 6, method = pmi: 0.4725634710626079
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 1, method = pmi: 0.26807761167981614
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 3, method = pmi: 0.2123053103203404
Spearman's ρ for path = /simlex-999.txt, context_vocab = /vocab-15kws.txt, w = 6, method = pmi: 0.16092585471242782
Spearman's ρ 