3.1 (8 points) Implement the capability of computing PMIs. Use your code to calculate PMIs for w = 3 when using vocab-15kws.txt to populate V and vocab-5k.txt to populate VC. Note that since we are using different vocabularies for center words and context words, pmi(a, b) will not necessarily equal pmi(b, a) (though they will be similar). (If there is a word in V that has no counts, the numerator and denominator for all of its PMI values will be zero, so you can just define all such PMIs to be zero.) For center word x = “coffee”, print the 10 context words with the largest PMIs and the 10 context words with the smallest PMIs. Print both the words and the PMI values

In [1]:
from scipy.stats import rankdata

# function to compute Spearman correlation.
def spearman_rank_correlation(predicted_similarities, actual_scores):

    predicted = [predicted_similarities[pair] for pair in actual_scores if pair in predicted_similarities]
    actual = [actual_scores[pair] for pair in actual_scores if pair in predicted_similarities]

    predicted_ranks = rankdata(predicted)
    actual_ranks = rankdata(actual)

    n = len(predicted_ranks)
    if n == 0:
        return None

    rank_differences_squared = [(pred_r - act_r) ** 2 for pred_r, act_r in zip(predicted_ranks, actual_ranks)]
    spearman_rho = 1 - (6 * sum(rank_differences_squared)) / (n * (n**2 - 1))

    return spearman_rho

In [3]:
from collections import defaultdict
import math

def find_indexes(arr, words_set):
    return [(i, w) for i, w in enumerate(arr) if w in words_set]


with open("/vocab-15kws.txt") as f:
    vocab = f.read().split()

with open("/vocab-5k.txt") as f1:
    vocab_context = f1.read().split()

V = set(vocab)
Vc = set(vocab_context)


my_dict = defaultdict(int)


with open("/wiki-1percent.txt", "r") as file:
    for line in file:

        words = line.split()
        length = len(words)

        word_indexes = find_indexes(words, V)


        for idx, word in word_indexes:
            ind_l = max(0, idx - 3)
            ind_H = min(length, idx + 4)
            context_window = words[ind_l:idx] + words[idx + 1:ind_H]
            for context_word in context_window:
                if context_word in Vc:
                    key = (word, context_word)
                    my_dict[key] += 1

# To add #(x,y) for all x and y.
total_count_N = sum(my_dict.values())

# To compute joint probabilities of (x,y). 
joint_probabilities = defaultdict(float)
for (word, context_word), count in my_dict.items():
    joint_probabilities[(word, context_word)] = count / total_count_N

# To compute partial probability of x.
partial_probability_x = defaultdict(float)
for word in V:
    partial_probability_x[word] = sum(joint_probabilities[(word, context_word)] for context_word in Vc)

# To compute partial probability of x.
partial_probability_y = defaultdict(float)
for context_word in Vc:
    partial_probability_y[context_word] = sum(joint_probabilities[(word, context_word)] for word in V)

# To calculate PMI.
pointwise_mutual_information = defaultdict(float)
for (word, context_word), joint_prob in joint_probabilities.items():
    partial_prob_x = partial_probability_x[word]
    partial_prob_y = partial_probability_y[context_word]

    if joint_prob > 0 and partial_prob_x > 0 and partial_prob_y > 0:
        pointwise_mutual_information[(word, context_word)] = math.log2(joint_prob / (partial_prob_x * partial_prob_y))
    else:
        pointwise_mutual_information[(word, context_word)] = 0

coffee_pmi = {context_word: pmi for (word, context_word), pmi in pointwise_mutual_information.items() if word == 'coffee'}

sorted_pmi = sorted(coffee_pmi.items(), key=lambda item: item[1], reverse=True)

print("Highest PMI context words for 'coffee':-")
for word, pmi in sorted_pmi[:10]:
    print(f"{word}:-> {pmi}")

print("\nLowest PMI context words for 'coffee':-")
for word, pmi in sorted_pmi[-10:]:
    print(f"{word}:-> {pmi}")


Highest PMI context words for 'coffee':-
tea:-> 8.166001262432944
drinking:-> 7.5879786587319416
shop:-> 7.411693771493206
costa:-> 7.350256393786174
shops:-> 7.2607518734184815
sugar:-> 6.533949521544224
coffee:-> 6.50197713180594
mix:-> 6.131195903101994
seattle:-> 5.950816325067406
houses:-> 5.868161497268194

Lowest PMI context words for 'coffee':-
page:-> -1.2805627423999117
when:-> -1.4043486976804662
more:-> -1.478525792288141
after:-> -1.598505205572077
its:-> -1.839457915441183
not:-> -1.9115928402013347
this:-> -1.9795498179341677
had:-> -1.9875291676196636
be:-> -2.1509730526874753
he:-> -2.260338264952694


3.2 (6 points) Now, define word vectors using PMI. That is, the word vector for a word x ∈ V has an
entry for each word y ∈ VC with value given by pmi(x, y). As above, use w = 3, vocab-15kws.txt
to populate V , and vocab-5k.txt to populate VC. Evaluate (EVALWS) your PMI-based word vectors
and report your results.


In [4]:
from collections import defaultdict
import math

word_vectors = defaultdict(dict)

# Load word vectors with PMI value.
for (word, context_word), pmi in pointwise_mutual_information.items():
    word_vectors[word][context_word] = pmi

For men dataset

In [5]:
from collections import defaultdict
import math

word_pairs = []

word_pair_scores = {}

# To find word pairs and scores from given dataset.
with open("/men.txt") as f3:
    next(f3)

    for line in f3:
        word1, word2, score = line.split()
        pair = (word1, word2)
        word_pairs.append(pair)
        word_pair_scores[pair] = float(score)

In [6]:
import math
from collections import defaultdict

# Function to compute cosine similarity.
def cosine_similarity(vec1, vec2):

    dot_product = sum(vec1[word] * vec2[word] for word in vec1 if word in vec2)

    norm_vec1 = math.sqrt(sum(value ** 2 for value in vec1.values()))

    norm_vec2 = math.sqrt(sum(value ** 2 for value in vec2.values()))

    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0

    return dot_product / (norm_vec1 * norm_vec2)

cosine_similarities_men = {}

for (word1, word2) in word_pairs:
    if word1 in word_vectors and word2 in word_vectors:

        vec1 = word_vectors[word1]
        vec2 = word_vectors[word2]

        similarity = cosine_similarity(vec1, vec2)
        cosine_similarities_men[(word1, word2)] = similarity

In [7]:
spearman_rho = spearman_rank_correlation(cosine_similarities_men, word_pair_scores)
print(f"Spearman's ρ for MEN dataset for w = 3: {spearman_rho}")

Spearman's ρ for MEN dataset for w = 3: 0.46578947742105303


For simlex-999 dataset

In [8]:
word_pairs = []
word_pair_scores = {}

with open("/simlex-999.txt") as f4:
    next(f4)
    for line in f4:
        word1, word2, simlex999 = line.split()
        pair = (word1, word2)
        word_pairs.append(pair)
        word_pair_scores[pair] = float(simlex999)

In [10]:
import math
from collections import defaultdict

cosine_similarities_simlex999 = {}

for (word1, word2) in word_pairs:
    if word1 in word_vectors and word2 in word_vectors:
        vec1 = word_vectors[word1]
        vec2 = word_vectors[word2]

        similarity = cosine_similarity(vec1, vec2)

        cosine_similarities_simlex999[(word1, word2)] = similarity

In [11]:
spearman_rho = spearman_rank_correlation(cosine_similarities_simlex999, word_pair_scores)
print(f"Spearman's ρ for simlex-999 dataset for w = 3: {spearman_rho}")

Spearman's ρ for simlex-999 dataset for w = 3: 0.18644573531447284
