# Import required libraries

In [None]:
!pip install wget # to download data
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install gensim
!pip install nltk
!pip install scikit-learn

In [None]:
%matplotlib inline
import numpy as np
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
import wget
import spacy
from spacy.tokenizer import Tokenizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
import scipy.stats
import tqdm

from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import re
import os
from collections import defaultdict
import jsonlines

import zipfile

%matplotlib widget
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Implicit representations

1. Create an embedding for the words using the (cleaned) Mosaico dataset by fitting a model.
2. Replace each annotated word with its WordNet sense key and then create embeddings as before.

We can use different models to create word/sense embeddings and log which one performs better.

# Preprocessing experiments

In [None]:
# Read every record of the annotated-sentence sample into `lines`, echoing
# the first few so the record layout can be inspected
lines = []
with jsonlines.open('data/sample_annotated_sentences/500000.jsonl') as reader:

    # Number of leading records to echo; every record is still collected below
    limit = 10
    for i, line in enumerate(reader):

        if i < limit:
            print(f'{i:3d} | {line}')

        lines.append(line)

In [None]:
def get_num_digits(number):
    """Return how many characters the absolute value of ``number`` takes when printed."""
    magnitude = abs(number)
    printed = str(magnitude)
    return len(printed)

def get_token_and_index_strings(text):
    """
    Build two space-joined rows that line up column by column: the
    whitespace-split tokens of ``text`` and the position of each token.

    Each column is as wide as the longer of the token and its index; the
    shorter of the two is right-aligned with leading spaces.

    Returns a ``(tokens_row, indexes_row)`` tuple of strings.
    """
    words = text.split()
    padded_words = []
    padded_positions = []
    for position, word in enumerate(words):
        # rjust never truncates, so a word wider than its index is kept as is
        padded_words.append(word.rjust(get_num_digits(position)))
        # ...and likewise an index wider than its word is kept as is
        padded_positions.append(str(position).rjust(len(word)))
    return ' '.join(padded_words), ' '.join(padded_positions)

def pretty_print(line, print_indexes=False):
    """
    Print a record in a readable layout.

    The text is printed after a ``text: `` label; when ``print_indexes`` is
    true a second row with the token positions is printed underneath it.
    The annotations follow, one per row, with only the first row carrying
    the ``annotations: `` label and the others indented to match.

    ``line`` must provide the ``'text'`` and ``'annotations'`` keys.
    """
    tokens_row, indexes_row = get_token_and_index_strings(line['text'])
    print(f'text: {tokens_row}')
    if print_indexes:
        print(f'      {indexes_row}')

    label = 'annotations: '
    continuation = ' ' * len(label)
    for position, annotation in enumerate(line['annotations']):
        prefix = label if position == 0 else continuation
        print(f'{prefix}{annotation}')


In [None]:
# Hand-written sample records with the same layout as the entries of `lines`
# ('text' plus a list of {'token_span', 'label'} annotations). The
# commented-out assignment above each one names the `lines` entry it appears
# to have been copied from.

# sample_line_1 = lines[0].copy()
sample_line_1 = {
    'text': "In Hinduism , the 60th birthday of a man is called Sashti poorthi .",
    'annotations': [
        {'token_span': [5, 6], 'label': 'birthday%1:28:00::'},
        {'token_span': [8, 9], 'label': 'man%1:18:00::'}
    ]
}

# sample_line_2 = lines[5].copy()
# Here the spans start one position before the annotated word
# ('world' is token 2 of the whitespace split, but its span is [1, 2])
sample_line_2 = {
    'text': "The new world of English words came out in 1658 and a dictionary of 40,000 words had been prepared in 1721 by Nathan Bailey , though none was as comprehensive in breadth or style as Johnson's .",
    'annotations': [
        {'token_span': [1, 2], 'label': 'world%1:14:01::'},
        {'token_span': [4, 5], 'label': 'word%1:10:00::'},
        {'token_span': [11, 12], 'label': 'dictionary%1:10:00::'},
        {'token_span': [14, 15], 'label': 'word%1:10:00::'},
        {'token_span': [32, 33], 'label': 'style%1:10:00::'}
    ]
}

# sample_line_3 = lines[35].copy()
sample_line_3 = {
    'text': "Even without the availability of either co-receptor ( even CCR5 ) , the virus can still invade cells if gp41 were to go through an alteration ( including its cytoplasmic tail ) that resulted in the independence of CD4 without the need of CCR5 and / or CXCR4 as a doorway .",
    'annotations': [
        {'token_span': [17, 18], 'label': 'cell%1:03:00::'},
        {'token_span': [30, 31], 'label': 'tail%1:05:00::'},
        {'token_span': [50, 51], 'label': 'doorway%1:06:00::'}
    ]
}

# sample_line_4 = lines[15].copy()
sample_line_4 = {
    'text': "Typically , NATO inert munitions are painted entirely in light blue and / or have the word \" INERT \" stenciled on them in prominent locations .[ citation needed ] IED ( barrel bomb , nail bomb , pipe bomb , pressure cooker bomb , fertilizer bomb , molotov cocktail )", 
    'annotations': [
        {"token_span": [16, 17], "label": "word%1:10:00::"}, 
        {"token_span": [35, 36], "label": "nail%1:06:00::"}, 
        {"token_span": [48, 49], "label": "cocktail%1:13:00::"}
    ]
}

# sample_line_5 = lines[51].copy()
sample_line_5 = {
    'text': "Later on 9 January , Samuel scored a last - minute winner in the 4\u20133 win versus Siena by beating goalkeeper Gianluca Curci with a left - footed shot .", 
    'annotations': [
        {"token_span": [10, 11], "label": "minute%1:28:00::"}, 
        {"token_span": [11, 12], "label": "winner%1:18:00::"}, 
        {"token_span": [15, 16], "label": "win%1:11:00::"}, 
        {"token_span": [27, 28], "label": "foot%2:38:00::"}
    ]
}

# sample_line_6 = lines[502].copy()
sample_line_6 = {
    'text': "In a 1960 piano performance in Cologne , he played Chopin , threw himself on the piano and rushed into the audience , attacking Cage and pianist David Tudor by cutting their clothes with scissors and dumping shampoo on their heads .", 
    'annotations': [
        {"token_span": [24, 25], "label": "cage%1:18:00::"}, 
        {"token_span": [32, 33], "label": "clothes%1:06:00::"}, 
        {"token_span": [40, 41], "label": "head%1:08:00::"}
    ]
}


# Records used by the experiments in the following cells; samples 1 and 3 are
# currently switched off
sample_lines = [
    # sample_line_1,
    sample_line_2,
    # sample_line_3,
    sample_line_4,
    sample_line_5,
    sample_line_6
]

# Show each active sample with its token positions underneath the text
for sample_line in sample_lines:
    pretty_print(sample_line, print_indexes=True)
    print()

In [None]:
# Download the NLTK stopword list and keep the English entries as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the pretrained spaCy model
nlp = spacy.load("en_core_web_sm")

# Matches any run of characters other than ASCII letters and '-'. It is meant
# to be searched for inside a token: a hit means the token contains digits,
# punctuation or other special characters
regexp_alphbetic = re.compile('[^a-zA-Z-]+')

In [None]:
# Check how the spaCy model works: run it on one of the example sentences
# shipped with spaCy and print, for every token, its text, lemma,
# part-of-speech tag and dependency label

from spacy.lang.en.examples import sentences 
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)

Some sentences are tokenized differently by the spaCy model than by a simple split, so we can't use `token_span` directly if we tokenize with the spaCy model. In some cases this is due to hyphens (`co-authored`), in other cases to apostrophes (`Johnson's`) or to something like `.[`, which the spaCy tokenizer splits into `.` and `[` while a simple `string.split()` keeps it as a single token. We need the spaCy tokenizer to split the text in the same way as the split does.

In [None]:
def count_different_tokenization_indexes(lines, model, print_lines=False):
    """
    Count the records whose text yields a different number of tokens when
    tokenized by ``model`` than when split on whitespace.

    Args:
        lines: iterable of records exposing the 'text' and 'annotations' keys.
        model: callable taking the text and returning an iterable of tokens
            that expose a ``.text`` attribute.
        print_lines: when true, print each mismatching record (text, both
            token lists and its annotations) followed by an empty line.

    Returns:
        The number of mismatching records.
    """
    mismatches = 0
    for record in lines:
        sentence = record['text']
        record_annotations = record['annotations']

        whitespace_tokens = sentence.split()
        model_tokens = [piece.text for piece in model(sentence)]

        # Same token count: nothing to report for this record
        if len(whitespace_tokens) == len(model_tokens):
            continue

        mismatches += 1
        if not print_lines:
            continue

        print(sentence)
        print(f'{whitespace_tokens}')
        print(f'{model_tokens}')
        for record_annotation in record_annotations:
            print(record_annotation)
        print()

    return mismatches

# Only the first `test_limit` records are checked
test_limit = 1000
count = count_different_tokenization_indexes(lines[:test_limit], nlp, print_lines=True)
# NOTE(review): the denominator printed is test_limit itself, so the ratio is
# only accurate when `lines` holds at least that many records
print(f'{count}/{test_limit} sentences are tokenized differently with the standard tokenizer')

Hyphens (-) are used to join two or more words that act as a single term, forming a compound word. If we loop through the tokens produced by the model as it is, we end up with separate tokens for the parts of a hyphenated word; we can consider all of them, but the position of the senses in the annotations will differ from the original one. If we loop through the tokens produced by a split, we end up with a single token for each hyphenated word; this ensures that the annotation indexes are respected, but the single token won't be correctly lemmatized by the spaCy model unless we implement some complex logic to account for hyphens. We could split the text on both spaces and dashes, but even in that case the index of the annotated words will differ:

In [None]:
dashed_text = "laptop-cover co-authored well-known stay-at-home 40.000 L'hymnaire"

# A single space or a single dash acts as a separator
split_pattern = re.compile(r'[ -]')

# Split on the separators and drop the empty strings that consecutive
# separators would leave behind
result = list(filter(None, split_pattern.split(dashed_text)))

print(result)


We can define a [custom tokenizer](https://stackoverflow.com/questions/55241927/spacy-intra-word-hyphens-how-to-treat-them-one-word) for the model that treats each hyphenated word as a single token:

In [None]:
def custom_tokenizer(nlp):
    """
    Build a spaCy ``Tokenizer`` over ``nlp.vocab`` whose only configured rule
    is an infix matcher for ``word-word`` sequences.

    No prefix, suffix or special-case rules are passed to the tokenizer.

    NOTE(review): the pattern spans a single hyphen, so in a token such as
    'stay-at-home' it only covers 'stay-at'; confirm that multi-hyphen
    compounds still come out as one token with the installed spaCy version.
    """
    # Word characters, one hyphen, word characters
    hyphenated_pair = re.compile(r'(\w+)-(\w+)')
    tokenizer = Tokenizer(nlp.vocab, infix_finditer=hyphenated_pair.finditer)
    return tokenizer

# A second copy of the pipeline, so that `nlp` keeps its default tokenizer
# and the two can be compared side by side
custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.tokenizer = custom_tokenizer(custom_nlp)

# For every sample show the lemmas produced with the default tokenizer, the
# plain whitespace split, and the lemmas produced with the custom tokenizer
for line in sample_lines:
    
    custom_doc = custom_nlp(line['text'])
    doc = nlp(line['text'])

    pretty_print(line, print_indexes=True)
    print(f'Original tokenizer: {[t.lemma_ for t in doc]}')
    print(f'Split             : {[t for t in line["text"].split()]}')
    print(f'Custom tokenizer  : {[t.lemma_ for t in custom_doc]}')
    print()

In [None]:
# Same check as for the standard tokenizer, this time with the custom one
count_custom_nlp = count_different_tokenization_indexes(lines[:test_limit], custom_nlp, print_lines=True)
print(f'{count_custom_nlp}/{test_limit} sentences are tokenized differently')

Now we have another problem: sometimes the index in the annotations is off. We need to account for that, because in some cases (e.g. in the second sample line) the word we meant to replace would remain in the text, while its sense key would replace an unrelated token instead.

In [None]:
def jaccard_similarity(set1, set2):
    """
    Return the Jaccard similarity of two sets: the size of their intersection
    divided by the size of their union, or 0 when the union is empty.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both sets are empty: avoid dividing by zero
        return 0
    shared_size = len(set1.intersection(set2))
    return shared_size / union_size

def find_word_in_window(lemmatized_tokens, window_span, sense_key, use_jaccard_similarity=False):
    """
    Find the index of the token that a sense key refers to, looking only
    inside a small window of token positions.

    Args:
        lemmatized_tokens: sequence with the lemma of every token of the text.
        window_span: ``(start, end)`` positions to inspect; both ends are
            included, and the window is clamped to the bounds of
            ``lemmatized_tokens``.
        sense_key: string whose part before the first ``'%'`` is the lemma to
            look for (e.g. ``'word%1:10:00::'``).
        use_jaccard_similarity: when false a token matches only if its lemma
            equals the lemma of the sense key; when true it matches if the
            Jaccard similarity between the character sets of the two lemmas
            is at least 0.6.

    Returns:
        The index of the first matching token in the window.

    Raises:
        ValueError: if no token in the window matches, or if ``sense_key``
            contains no ``'%'``.
    """
    start_idx = window_span[0]
    end_idx = window_span[1]

    lemma_from_sense_key = sense_key[:sense_key.index('%')]

    # Clamp the window to the sentence: a span that touches the last token
    # would otherwise index past the end (IndexError instead of the
    # ValueError below) and a negative start would wrap around to the end
    first_idx = max(start_idx, 0)
    last_idx = min(end_idx, len(lemmatized_tokens) - 1)

    for index in range(first_idx, last_idx + 1):
        lemma = lemmatized_tokens[index]

        if use_jaccard_similarity:
            is_match = jaccard_similarity(set(lemma), set(lemma_from_sense_key)) >= 0.6
        else:
            is_match = lemma == lemma_from_sense_key

        if is_match:
            return index

    raise ValueError(f'No match in window for token {sense_key} in window {lemmatized_tokens[first_idx:last_idx + 1]}')

In [None]:
# For every annotation of every sample, show the window given by its
# token_span and the index at which the annotated word is actually found.
# The text is lowercased before lemmatization and the Jaccard-based match is
# used
for sample_line in sample_lines:
    pretty_print(sample_line, print_indexes=True)
    lemmatized_tokens = [token.lemma_ for token in custom_nlp(sample_line['text'].lower())]
    for annotation in sample_line['annotations']:
        window_start = annotation['token_span'][0]
        window_end = annotation['token_span'][1]
        # Raises ValueError when no token in the window matches
        idx = find_word_in_window(lemmatized_tokens, (window_start, window_end), annotation['label'], use_jaccard_similarity=True)
        print(f'{annotation["label"]} in range ({window_start} - {window_end}) -> true index = {idx}')
    print()

We can build the final preprocessing routine

In [None]:
def preprocess_line(line,
                    model,
                    stopwords=None,
                    punctuation=None,
                    lemmatize=True,
                    filtering_regex=None,
                    replace_with_sense_key=True,
                    use_jaccard_similarity=False,
                    produce_both=False
                    ):
    """
    Turn one annotated record into a list of cleaned tokens.

    The text is lowercased and tokenized with ``model``. Each annotation is
    resolved to a token index with ``find_word_in_window``; annotated tokens
    are always kept, every other token is dropped if it is a stopword, is
    punctuation, or matches ``filtering_regex``.

    Args:
        line: record with the 'text' and 'annotations' keys; every annotation
            has a 'token_span' pair and a 'label' (the sense key).
        model: callable taking the text and returning the tokens, each
            exposing ``.lemma_`` and ``.text``.
        stopwords: collection of tokens to drop (default: none).
        punctuation: collection of tokens to drop (default: none).
        lemmatize: use the lemma of each token instead of its text.
        filtering_regex: compiled regex; a token in which it finds a match is
            dropped (default: no filtering).
        replace_with_sense_key: when ``produce_both`` is false, whether an
            annotated token is emitted as its sense key or as its text.
        use_jaccard_similarity: forwarded to ``find_word_in_window``.
        produce_both: build both outputs at once: the first list with the
            plain tokens, the second with annotated tokens replaced by their
            sense key.

    Returns:
        ``(new_tokens, new_semantic_tokens)``. The second list is only
        filled when ``produce_both`` is true.

    Raises:
        Whatever ``find_word_in_window`` raises when an annotation cannot be
        resolved.
    """
    if stopwords is None:
        stopwords = []

    if punctuation is None:
        punctuation = []

    # Take text and annotations from the line
    text = line['text'].lower()
    annotations = line['annotations']

    tokens = model(text)

    # The lemmas do not depend on the annotation: compute them only once
    lemmas = [token.lemma_ for token in tokens]

    # {(span_start, span_end): sense_key}; a repeated span keeps its last label
    annotations_data = {
        (annotation['token_span'][0], annotation['token_span'][1]): annotation['label']
        for annotation in annotations
    }

    # {resolved token index: sense_key}
    annotation_indexes = {}
    for span, sense_key in annotations_data.items():
        resolved_index = find_word_in_window(
            lemmas,
            span,
            sense_key,
            use_jaccard_similarity=use_jaccard_similarity
        )
        annotation_indexes[resolved_index] = sense_key

    # These will be joined by the caller to create the preprocessed sentences
    new_tokens = []
    new_semantic_tokens = []

    for i, token in enumerate(tokens):

        token_text = token.lemma_ if lemmatize else token.text

        # Annotated tokens are never filtered out
        if i in annotation_indexes:
            sense_key = annotation_indexes[i]
            if produce_both:
                # One sentence for word embeddings, one for sense embeddings
                new_tokens.append(token_text)
                new_semantic_tokens.append(sense_key)
            elif replace_with_sense_key:
                new_tokens.append(sense_key)
            else:
                new_tokens.append(token_text)
            continue

        # Stopwords and punctuation are dropped
        if token_text in stopwords or token_text in punctuation:
            continue

        # A regex hit marks the token as not valid
        if filtering_regex is not None and filtering_regex.search(token_text):
            continue

        # The token is valid, we can retain it
        new_tokens.append(token_text)
        if produce_both:
            new_semantic_tokens.append(token_text)

    return new_tokens, new_semantic_tokens


In [None]:
# Compare exact lemma matching with the Jaccard-based matching on the samples
for sample_line in sample_lines:

    print([token for token in custom_nlp(sample_line['text'])])

    print('Data before: ')
    print(sample_line['text'])

    # Exact matching is expected to fail on some samples. Report why instead
    # of hiding the error, and let KeyboardInterrupt through (a bare `except:`
    # would swallow it)
    print('Preprocessed data:')
    try:
        print(" ".join(preprocess_line(sample_line, custom_nlp, stopwords=stop_words, filtering_regex=regexp_alphbetic)[0]))
    except Exception as error:
        print(f'Exception: {error}')

    print('Preprocessed data with jaccard similarity:')
    print(" ".join(preprocess_line(sample_line, custom_nlp, stopwords=stop_words, filtering_regex=regexp_alphbetic, use_jaccard_similarity=True)[0]))
    print()

# Preprocessing with STANZA

# Actual preprocessing

In [None]:
# Create the actual datasets: for every record keep the original text, the
# plain preprocessed tokens and the tokens with sense keys
original_sentences = []
non_semantic_tokens = []
semantic_tokens = []
discarded_count = 0
for line in tqdm(lines):

    # Without jaccard similarity, ~5% of the sentences get discarded
    # With jaccard similarity, ~2.4% of the sentences get discarded
    try:
        non_semantic, semantic = preprocess_line(
            line,
            custom_nlp,
            stopwords=stop_words,
            filtering_regex=regexp_alphbetic,
            produce_both=True,
            use_jaccard_similarity=True
        )
    # `Exception` rather than a bare `except:` so that interrupting the
    # kernel stops this long loop instead of being counted as a discard
    except Exception:
        print('Discarded: ')
        pretty_print(line, print_indexes=True)
        print()
        discarded_count += 1
        continue

    original_sentences.append(line['text'])
    non_semantic_tokens.append(non_semantic)
    semantic_tokens.append(semantic)

# Show the first surviving sentence, if there is one
if original_sentences:
    print(f'Original sentence                : {original_sentences[0]}')
    print(f'Preprocessed sentence            : {" ".join(non_semantic_tokens[0])}')
    print(f'Preprocessed sentence with senses: {" ".join(semantic_tokens[0])}')
# Report against the number of records actually processed
print(f'Discarded {discarded_count} out of {len(lines)}')

Now despite discarding some sentences at least we can rest assured that only senses for the specified tokens appear in the dataset (before we could've had "word", "word%1:10:00" and so on and this would've affected the similarity score)

We could filter out uncommon words that have a low frequency but we can do the same specifying the `min_count` parameter of the Word2Vec model

In [None]:
def filter_frequency(sentences, frequency):
    """
    Remove the tokens that are too rare across the whole corpus.

    Args:
        sentences: iterable of sentences; each sentence is either a list of
            tokens or a string, which is split on whitespace.
        frequency: a token is kept only if it occurs strictly more than
            ``frequency`` times over all sentences (1 removes the tokens
            that appear only once).

    Returns:
        A list with one list of surviving tokens per sentence, in the
        original order.
    """
    # Normalise every sentence to a token list once, so that counting and
    # filtering look at exactly the same tokens
    tokenized = [
        sentence.split() if isinstance(sentence, str) else list(sentence)
        for sentence in sentences
    ]

    frequency_dict = defaultdict(int)
    for tokens in tqdm(tokenized):
        for token in tokens:
            frequency_dict[token] += 1

    texts = [[token for token in tokens if frequency_dict[token] > frequency]
             for tokens in tokenized]

    return texts

In [None]:
# ff_word_sentences = filter_frequency(word_sentences, 1)
# ff_sense_sentences = filter_frequency(sense_sentences, 1)

# for i in range(10):
#   print(ff_word_sentences[i])

Save the preprocessed dictionary to file

In [None]:
# Specify the output JSONLines file. Set it to None to skip saving the dataset
# output_file = 'preprocessed_dataset_new.jsonl'
output_file = None

if output_file is not None:

    # Open the JSONLines file for writing (an existing file is overwritten)
    with jsonlines.open(output_file, mode='w') as writer:

        # The three lists are filled together, one entry per kept sentence,
        # so zipping them yields one (text, tokens, sense tokens) triple each
        for original_sentence, non_semantic_tokens_sentence, semantic_tokens_sentence in tqdm(zip(original_sentences, non_semantic_tokens, semantic_tokens)):

            # One JSON object per sentence; the token fields hold lists
            data = {
                'original': original_sentence,
                'non_semantic': non_semantic_tokens_sentence,
                'semantic': semantic_tokens_sentence
            }
            
            # Write the dictionary to the JSONLines file
            writer.write(data)

# Load the preprocessed dictionary from file

In [None]:
# Read the preprocessed dataset back from file. Set input_file to None to
# keep the datasets computed in this session instead
input_file = 'preprocessed_dataset_new.jsonl'
# input_file = None

read_non_semantic_tokens = []
read_semantic_tokens = []

if input_file is not None:

    with jsonlines.open(input_file, 'r') as reader:
        for line in tqdm(reader):
            # Each record holds the token lists of one sentence; a missing
            # field is read as an empty sentence
            read_non_semantic_tokens.append(line.get("non_semantic", []))
            read_semantic_tokens.append(line.get("semantic", []))

    # Replace the datasets only once the whole file has been read, so that a
    # failed or interrupted read never leaves them holding a single sentence
    non_semantic_tokens = read_non_semantic_tokens
    semantic_tokens = read_semantic_tokens

# Simlex

In [None]:
def load_simlex(simlex_zip_path, simlex_dataset_path):
    """
    Read the SimLex-999 pairs from the original zip archive and write a
    simplified copy of them.

    The member 'SimLex-999/SimLex-999.txt' of the archive is read, its first
    line (the header) is skipped, and from every other line the first,
    second and fourth whitespace-separated fields are taken as the two words
    and their score.

    Args:
        simlex_zip_path: path of the SimLex-999 zip archive.
        simlex_dataset_path: path of the file to write, with one
            ``w1<TAB>w2<TAB>score`` line per pair and no header. Missing
            parent directories are created; an existing file is overwritten.

    Returns:
        A dictionary ``{(w1, w2): score}`` with the score as a float.
    """
    # The output usually lives in a data folder that may not exist yet
    output_dir = os.path.dirname(simlex_dataset_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    simlex_pairs = dict()
    with zipfile.ZipFile(simlex_zip_path, 'r') as zipf, open(simlex_dataset_path, "wb") as fw:
        with zipf.open('SimLex-999/SimLex-999.txt') as myfile:
            next(myfile)  # skip the header
            for line in myfile:
                # The archive member is read as bytes
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. at the end of the file)
                    continue
                w1, w2, _pos, score, *_ = fields
                w1 = w1.decode('utf-8')
                w2 = w2.decode('utf-8')
                score = float(score)
                simlex_pairs[(w1, w2)] = score
                fw.write(f'{w1}\t{w2}\t{score}\n'.encode('utf-8'))
    return simlex_pairs


# Archive with the original SimLex-999 data and path of the simplified copy
# written by load_simlex
simlex_data = 'SimLex-999.zip'
simple_simlex_path = "data/simlex999/simlex999.tsv"

# Download the archive only when it is not already in the working directory
if not os.path.exists(simlex_data):
    simlex_data = wget.download("https://fh295.github.io/SimLex-999.zip")

# {(w1, w2): score}; the bare name on the last line displays it
simlex_pairs = load_simlex(simlex_data, simple_simlex_path)
simlex_pairs

In [None]:
# The returned dictionary should be similar to previous word_pair2score 
# but instead of words we consider the senses from the dataset 
# associated with this words
def load_semantic_simplex(path):
    """
    Load the sense-annotated SimLex file at ``path``.

    The first line is skipped as a header. Every other line is split on
    whitespace: field 3 is the score, fields 10 and 11 are the
    comma-separated senses of the first and of the second word.

    Returns a dictionary ``{(senses_of_w1, senses_of_w2): score}`` whose keys
    are pairs of tuples of strings and whose values are floats.
    """
    senses2score = {}
    with open(path) as handle:
        next(handle)
        for fields in (row.strip().split() for row in handle):
            gold_score = float(fields[3])
            first_senses = tuple(fields[10].split(","))
            second_senses = tuple(fields[11].split(","))
            senses2score[(first_senses, second_senses)] = gold_score

    return senses2score

# {(senses of w1, senses of w2): score}; the bare name displays it
senses2score = load_semantic_simplex(r'data/simlex999/semantic_simlex_v0.1.tsv')
senses2score

In [None]:
# Check if the two are equals for the non semantic case
def check_equals_non_semantic(semantic_pairs, non_semantic_path):
    """
    Compare the pairs stored in a file against a ``{(w1, w2): score}``
    dictionary and print every discrepancy.

    The first line of the file is skipped as a header; on every other line
    the first two whitespace-separated fields are the words and the fourth
    is the score. A message is printed for each pair that is missing from
    ``semantic_pairs`` and for each pair whose score differs. Nothing is
    returned.
    """
    with open(non_semantic_path) as handle:
        next(handle)
        for row in handle:
            fields = row.strip().split()
            first_word, second_word = fields[0], fields[1]
            file_score = float(fields[3])

            if (first_word, second_word) in semantic_pairs:
                known_score = semantic_pairs[(first_word, second_word)]
                if file_score != known_score:
                    print(f'({first_word}, {second_word}) have different scores ({file_score} vs {known_score})')
            else:
                print(f'({first_word}, {second_word}) not in simlex999')

# check_equals_non_semantic(simlex_pairs, r'data/simlex999/semantic_simlex_v0.1.tsv')

# Evaluation metrics

In [None]:
def compute_correlation_score(model, word_pair2score, print_warning=True):
    """
    Correlate the similarities given by an embedding model with human scores.

    Args:
        model: object supporting ``word in model`` and
            ``model.similarity(w1, w2)``.
        word_pair2score: dictionary ``{(w1, w2): human_score}``.
        print_warning: print a numbered warning for every pair with a word
            that is missing from the model.

    A pair with a missing word is kept in the evaluation with a system
    score of -1.

    Returns:
        ``(pearson_r, spearman_rho)`` between human and system scores.
    """
    human_scores, system_scores = [], []
    warnings_printed = 0
    for pair, gold_score in word_pair2score.items():
        first_word, second_word = pair
        human_scores.append(gold_score)

        if first_word in model and second_word in model:
            system_scores.append(model.similarity(first_word, second_word))
            continue

        # At least one of the two words is unknown to the model
        system_scores.append(-1)
        if print_warning:
            print(f"({warnings_printed:6d}) | WARNING ({first_word} and {second_word}) are not present in the embedding model!!")
            warnings_printed += 1

    human_scores = np.array(human_scores)
    system_scores = np.array(system_scores)
    pearson_r, _p_value = scipy.stats.pearsonr(human_scores, system_scores)
    spearman_rho = scipy.stats.spearmanr(human_scores, system_scores).statistic

    return pearson_r, spearman_rho



def compute_semantic_correlation_score(model, senses2score,  print_warning=True):
    """
    Correlate the similarities given by a sense-embedding model with human
    scores.

    Args:
        model: object supporting ``sense in model`` and
            ``model.similarity(s1, s2)``.
        senses2score: dictionary ``{(senses_1, senses_2): human_score}``
            whose keys are pairs of tuples of sense keys.
        print_warning: print a warning for every pair for which one of the
            two words has no sense in the model.

    The system score of a pair is the mean similarity over all combinations
    of the senses of the two words that are present in the model. When one
    of the two words has no sense in the model the pair is kept in the
    evaluation with a system score of -1.

    Returns:
        ``(pearson_r, spearman_rho)`` between human and system scores.
    """
    human_scores = []
    system_scores = []
    for (senses_1, senses_2), score in senses2score.items():
        senses_1_in_model = [s for s in senses_1 if s in model]
        senses_2_in_model = [s for s in senses_2 if s in model]

        if len(senses_1_in_model) == 0 or len(senses_2_in_model) == 0:
            # No sense of one of the two words is present in the model
            if print_warning:
                s1_str = " ".join(senses_1)
                s2_str = " ".join(senses_2)
                print(f"WARNING ({s1_str} and {s2_str}) are not present in the embedding model!!")
            # Record the human score together with the penalty: the two
            # lists must stay aligned or the correlations cannot be computed
            human_scores.append(score)
            system_scores.append(-1)
            continue

        # Similarities between all pairs of senses known to the model
        all_similarities = [
            model.similarity(s1, s2)
            for s1 in senses_1_in_model
            for s2 in senses_2_in_model
        ]

        system_similarity = sum(all_similarities) / len(all_similarities)
        human_scores.append(score)
        system_scores.append(system_similarity)

    human_scores = np.array(human_scores)
    system_scores = np.array(system_scores)
    pearson_r, _ = scipy.stats.pearsonr(human_scores, system_scores)    # Pearson's r
    spearman_rho = scipy.stats.spearmanr(human_scores, system_scores).statistic   # Spearman's rho
    return pearson_r, spearman_rho

# Word embeddings

1. `Word2Vec` is a word embedding technique that learns vector representations of words, capturing semantic relationships through context.
2. `GloVe` combines global and local word context through matrix factorization to create word embeddings.
3. `FastText` represents words as character n-grams, handling morphological details and out-of-vocabulary words.
4. `Doc2Vec` extends Word2Vec to learn document-level embeddings by treating each document as a unique word.
5. `BERT`, a transformer-based model, learns contextual word embeddings by considering both left and right sentence context.
6. `ELMo` uses bi-directional LSTM to create word embeddings based on entire sentence context, enhancing syntactic and semantic understanding.
7. `USE` is a universal sentence encoder for generating fixed-length embeddings, applicable to various NLP tasks.

More [here](https://medium.com/@vaibhav1403/embedding-techniques-in-natural-language-processing-nlp-29e424ab0cd9).

# Word2Vec

In [None]:
# Peek at the first three sentences of the word-level training data
non_semantic_tokens[:3]

In [None]:
"""
vector_size = 10
window = 2
min_count = 1 

non_semantic_embeddings_model = gensim.models.Word2Vec(non_semantic_tokens, vector_size=vector_size, window=window, min_count=min_count)
non_semantic_embeddings_model_score = compute_correlation_score(non_semantic_embeddings_model.wv, simlex_pairs, print_warning=True)
print(f'Pearson and Spearman scores: {non_semantic_embeddings_model_score}')
"""

We can search for the best hyperparameters.
- The `vector_size` determines the dimensionality of the word vectors or embeddings.
- The `window` parameter defines the size of the context window for the Word2Vec model.
- The `min_count` parameter is used to control the minimum count of a word in the corpus for it to be considered during the training of the Word2Vec model.

In [None]:
def find_best_params(model, word_vector_retrieval_function, correlation_score_function, train_dataset, evaluation_dataset, param_grid):
    """
    Grid search over ``param_grid``: train one model per combination and keep
    the combination with the highest combined correlation score.

    Args:
        model: callable ``model(train_dataset, **params)`` returning a
            trained model.
        word_vector_retrieval_function: callable extracting from the trained
            model the object handed to the scoring function.
        correlation_score_function: callable
            ``f(vectors, evaluation_dataset, print_warning=False)`` returning
            ``(pearson, spearman)``.
        train_dataset: data passed to ``model``.
        evaluation_dataset: data passed to the scoring function.
        param_grid: grid passed to ``ParameterGrid``; every combination must
            provide 'vector_size', 'window' and 'min_count'.

    Both correlations are mapped from [-1, 1] to [0, 1] and averaged with
    equal weights.

    Returns:
        ``(best_params, best_score, score_dict)``, where ``score_dict`` maps
        ``(vector_size, window, min_count)`` to the combined score.
        ``best_params`` is None when no combination scores above 0.
    """
    # Weight of the Pearson score in the combined score
    alpha = 0.5

    best_params, best_score = None, 0
    score_dict = {}

    pbar = tqdm(ParameterGrid(param_grid))
    for params in pbar:

        # Train and evaluate a model with the current combination
        trained_model = model(train_dataset, **params)
        vectors = word_vector_retrieval_function(trained_model)
        pearson_score, spearman_score = correlation_score_function(vectors, evaluation_dataset, print_warning=False)

        # Normalize the scores [-1, 1] -> [0, 1]
        pearson_score_norm = (pearson_score + 1) / 2
        spearman_score_norm = (spearman_score + 1) / 2

        combined_score = alpha * pearson_score_norm + (1 - alpha) * spearman_score_norm

        grid_key = (params['vector_size'], params['window'], params['min_count'])
        score_dict[grid_key] = combined_score
        pbar.set_postfix(pearson=pearson_score_norm, spearman=spearman_score_norm, refresh=False)

        # Remember the best combination seen so far
        if combined_score > best_score:
            best_params, best_score = params, combined_score

    return best_params, best_score, score_dict

In [None]:
"""
# Set up your grid of hyperparameters
param_grid = {
    'vector_size': [16, 64, 128, 256, 512],
    'window': [2, 4, 6],
    'min_count': [1, 3, 6]
}
# This gives the best params of window_size = 2, min_count = 6, vector_size = 512 
"""

# Reduced grid: only the vector size varies, window and min_count are fixed
param_grid = {
    'vector_size': [128, 256, 512, 1024],
    'window': [2],
    'min_count': [6]
}

# Search the grid for word embeddings (one Word2Vec training per
# combination, scored against the SimLex pairs), then print the winner
best_params, best_score, score_dict = find_best_params(Word2Vec, lambda model: model.wv, compute_correlation_score, non_semantic_tokens, simlex_pairs, param_grid)
print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

In [None]:
# Train the final word-embedding model with the best parameters found above
# and report its correlations, this time printing the missing pairs.
# NOTE(review): best_params is None when no combination scored above 0, in
# which case the ** unpacking fails
non_semantic_embeddings_model = gensim.models.Word2Vec(non_semantic_tokens, **best_params)
non_semantic_embeddings_model_score = compute_correlation_score(non_semantic_embeddings_model.wv, simlex_pairs, print_warning=True)
print(f'Pearson and Spearman scores: {non_semantic_embeddings_model_score}')

In [None]:
"""
# Extract hyperparameter values and loss
min_count_values = [key[0] for key in score_dict.keys()]
vector_size_values = [key[1] for key in score_dict.keys()]
window_size_values = [key[2] for key in score_dict.keys()]
score_values = list(score_dict.values())

# Create a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(window_size_values, min_count_values, vector_size_values, c=score_values, cmap='viridis', marker='o')

# Add a color bar
cbar = fig.colorbar(scatter, ax=ax, pad=0.1, fraction=0.05)

# Label axes
ax.set_xlabel('Min Count')
ax.set_ylabel('Vector Size')
ax.set_zlabel('Window Size')
ax.set_title('Score')

plt.show()
"""


In [None]:
# Export the similarity predicted by the word-embedding model for every
# SimLex pair, one `w1<TAB>w2<TAB>similarity` line per pair.
# The target directory must already exist
tsv_file_path = 'data/student_predictions/non_semantic_word2vec.tsv'

with open(tsv_file_path, 'w') as tsv_file:

    # The file is written without a header (the header line is disabled)
    # tsv_file.write('w1\tw2\tscore\n')

    for (w1, w2), score in simlex_pairs.items():
        if (w1 not in non_semantic_embeddings_model.wv) or (w2 not in non_semantic_embeddings_model.wv):
            # NOTE(review): missing pairs are exported as nan here, while
            # compute_correlation_score scores them -1 - confirm which one
            # the consumer of this file expects
            system_similarity = np.nan
        else:
            # The cosine similarity ranges from -1 (completely dissimilar) to 1 (completely similar).
            # A value of 0 means that the vectors are orthogonal or uncorrelated.
            system_similarity = non_semantic_embeddings_model.wv.similarity(w1, w2)

        tsv_file.write(f'{w1}\t{w2}\t{system_similarity}\n')


# GloVe comparison

In [None]:
# Optional comparison with pretrained GloVe vectors, scored with the same
# function used for the trained word embeddings. Disabled by default
glove_comparison = False
if glove_comparison:
    glove_pretrained = api.load("glove-wiki-gigaword-50")
    # Copies every vector into a plain dictionary; it is not used by the
    # scoring call below
    glove_word_vectors = {word: glove_pretrained[word] for word in glove_pretrained.index_to_key}    
    non_semantic_embeddings_glove_score = compute_correlation_score(glove_pretrained, simlex_pairs, print_warning=True)
    print(f'Pearson and Spearman scores for GloVe: {non_semantic_embeddings_glove_score}')

# BERT

# Sense embeddings

In [None]:
# Peek at the first three sentences of the sense-level training data
semantic_tokens[:3]

In [None]:
"""
vector_size = 10
window = 2
min_count = 1 

semantic_embeddings_model = gensim.models.Word2Vec(semantic_tokens, vector_size=vector_size, window=window, min_count=min_count)
semantic_embeddings_model
"""

In [None]:
"""
# Set up your grid of hyperparameters
param_grid = {
    'vector_size': [16, 64, 128, 256, 512],
    'window': [2, 4, 6],
    'min_count': [1, 3, 6]
}
"""

# Same reduced grid used for the word embeddings: only the vector size varies
param_grid = {
    'vector_size': [128, 256, 512, 1024],
    'window': [2],
    'min_count': [6]
}

# Search the grid for sense embeddings: trained on the sense-annotated
# tokens and scored against the sense-level SimLex pairs
best_semantic_params, best_semantic_score, _ = find_best_params(Word2Vec, lambda model: model.wv, compute_semantic_correlation_score, semantic_tokens, senses2score, param_grid)
print(f'Best parameters: {best_semantic_params}')
print(f'Best score: {best_semantic_score}')

In [None]:
# Train the final sense-embedding model with the best parameters found above
semantic_embeddings_model = Word2Vec(semantic_tokens, **best_semantic_params)

In [34]:
# Export the similarity predicted by the sense-embedding model for every
# SimLex pair, one `w1<TAB>w2<TAB>similarity` line per pair and no header.
# The target directory must already exist
tsv_file_path = 'data/student_predictions/semantic_word2vec.tsv'

with open(tsv_file_path, 'w') as tsv_file:

    for (senses_1, senses_2), score in senses2score.items():
        # Keep only the senses the model has a vector for
        senses_1_in_model = [s for s in senses_1 if s in semantic_embeddings_model.wv]
        senses_2_in_model = [s for s in senses_2 if s in semantic_embeddings_model.wv]
        
        all_similarities = []
        if len(senses_1_in_model) == 0 or len(senses_2_in_model) == 0:
            # Sense is not present in the model: the pair is exported as -1
            s1_str = " ".join(senses_1)
            s2_str = " ".join(senses_2)
            print(f"WARNING ({s1_str} and {s2_str}) are not present in the embedding model!!" )
            all_similarities.append(-1)
        # Calculate semantic similarities between all pairs of senses
        else:
            for s1 in senses_1_in_model:
                for s2 in senses_2_in_model:
                    all_similarities.append(semantic_embeddings_model.wv.similarity(s1, s2))
        # The word is the part of its first sense key before the '%'
        w1 = senses_1[0].split('%')[0]
        w2 = senses_2[0].split('%')[0]
        # NOTE(review): the best (max) similarity is exported here, while
        # compute_semantic_correlation_score evaluates the mean over the
        # sense pairs - confirm which aggregation is intended
        tsv_file.write(f'{w1}\t{w2}\t{max(all_similarities)}\n')  
