## **Import the necessary libraries**

In [None]:
import nltk
import math
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
from itertools import combinations
wnl = nltk.stem.WordNetLemmatizer()
nltk.download('wordnet_ic')
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.


## **Declare the pairs and pos mapping to Wordnet**



In [None]:
# Input sentence as (lemma, category) tuples, kept in sentence order
pairs = [
    ('the', 'DT'),
    ('man', 'NN'),
    ('swim', 'VB'),
    ('with', 'PR'),
    ('a', 'DT'),
    ('girl', 'NN'),
    ('and', 'CC'),
    ('a', 'DT'),
    ('boy', 'NN'),
    ('whilst', 'PR'),
    ('the', 'DT'),
    ('woman', 'NN'),
    ('walk', 'VB'),
]

# Custom POS tag -> WordNet POS constant. Only the four open classes are
# listed; any tag missing from this table (DT, PR, CC, ...) has no entry.
pos_mapping = dict(
    NN=wn.NOUN,  # nouns
    VB=wn.VERB,  # verbs (base form)
    JJ=wn.ADJ,   # adjectives
    RB=wn.ADV,   # adverbs
)

## **Filtering out closed-class words**


In [None]:
def remove_closed_words(list_of_lemmas, allowed_lemmas):
    """Keep only the (lemma, category) pairs whose category is allowed.

    Args:
        list_of_lemmas: Iterable of ``(lemma, category)`` pairs.
        allowed_lemmas: Container of allowed categories (POS tags). Despite
            its name it holds tags, not lemmas. A dict is tested by key; any
            other container supporting ``in`` (set, list, tuple) works too.

    Returns:
        A new list with the pairs whose category is in ``allowed_lemmas``,
        in their original order.
    """
    # Membership on the container itself: equivalent to ``.keys()`` for a
    # dict, and additionally accepts plain sets/lists of tags.
    return [
        (lemma, category)
        for lemma, category in list_of_lemmas
        if category in allowed_lemmas
    ]

# Drop every pair whose tag has no entry in pos_mapping, then show what is left
filtered_pairs = remove_closed_words(pairs, pos_mapping)
print("Filtered Pairs:", filtered_pairs, sep="\n")

Filtered Pairs:
[('man', 'NN'), ('swim', 'VB'), ('girl', 'NN'), ('boy', 'NN'), ('woman', 'NN'), ('walk', 'VB')]


## **Get most frequent synset**

In [None]:
def get_most_frequent_synset(word, pos):
    """Return the first synset WordNet lists for ``word`` under ``pos``.

    The first entry returned by ``wn.synsets`` is taken to be the most
    frequent sense. Returns ``None`` when the lookup yields no synsets.
    """
    candidates = wn.synsets(word, pos=pos)
    if not candidates:
        # Nothing found for this word with the given POS
        return None
    return candidates[0]


## **Find the absolute max depth for each different POS in WordNet**

In [None]:
def find_max_depth(pos):
    """Return the length of the longest hypernym path among all synsets of ``pos``.

    The length is the number of synsets on the path (``len(path)``).
    Returns 0 when no hypernym path is produced at all.
    """
    path_lengths = (
        len(path)
        for synset in wn.all_synsets(pos)
        for path in synset.hypernym_paths()
    )
    return max(path_lengths, default=0)

# Longest hypernym path for each WordNet POS we care about
# (computed in the order verbs, nouns, adjectives, adverbs)
max_depth_verbs, max_depth_nouns, max_depth_adjs, max_depth_advs = map(
    find_max_depth, (wn.VERB, wn.NOUN, wn.ADJ, wn.ADV)
)

# Lookup table: WordNet POS -> max depth, used when normalizing LCH similarity
max_depth_mapping = dict(
    zip(
        (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV),
        (max_depth_nouns, max_depth_verbs, max_depth_adjs, max_depth_advs),
    )
)

## **Compute the similarities**

In [None]:
# Function to compute similarities between two synsets
def compute_similarities(syn1, syn2):
    """Compute the lowest common hypernyms and four similarity scores.

    Args:
        syn1: First WordNet synset, or ``None`` when no synset was found.
        syn2: Second WordNet synset, or ``None`` when no synset was found.

    Returns:
        A ``(similarities, lcs)`` tuple. ``similarities`` is a dict with the
        keys ``'path_similarity'``, ``'lch_similarity'`` (divided by
        ``log(2 * max_depth)`` for the synsets' POS), ``'wup_similarity'`` and
        ``'lin_similarity'``; a value is ``None`` when that measure could not
        be computed. ``lcs`` is the list returned by
        ``syn1.lowest_common_hypernyms(syn2)``, or an empty list when either
        synset is ``None``.
    """
    similarities = {
        'path_similarity': None,
        'lch_similarity': None,
        'wup_similarity': None,
        'lin_similarity': None,
    }

    # A word without any synset cannot be compared: report "nothing computed"
    # instead of failing with AttributeError on None.
    if syn1 is None or syn2 is None:
        return similarities, []

    # Local import: the file-level imports do not bring this exception class
    # into scope.
    from nltk.corpus.reader.wordnet import WordNetError

    # Least Common Subsumer
    lcs = syn1.lowest_common_hypernyms(syn2)

    # Path Similarity
    similarities['path_similarity'] = syn1.path_similarity(syn2)

    # Leacock-Chodorow Similarity, only attempted for synsets of the same POS.
    # ``.get`` avoids a KeyError for a POS that has no entry in
    # max_depth_mapping; a missing or zero depth leaves the score as None.
    # NOTE(review): the notebook's own notes report identical nouns scoring
    # ~0.99 rather than 1.0 with this constant - confirm whether the depth
    # should be counted differently.
    pos = syn1.pos()
    max_depth = max_depth_mapping.get(pos) if pos == syn2.pos() else None
    if max_depth:
        max_lch = math.log(2 * max_depth)
        lch_sim = syn1.lch_similarity(syn2)
        # Min-max normalization with min = 0 and max = max_lch
        if lch_sim is not None and max_lch > 0:
            similarities['lch_similarity'] = lch_sim / max_lch

    # Wu-Palmer Similarity
    similarities['wup_similarity'] = syn1.wup_similarity(syn2)

    # Lin Similarity: NLTK reports mismatched POS or missing information
    # content via WordNetError; a zero denominator is treated the same way.
    try:
        similarities['lin_similarity'] = syn1.lin_similarity(syn2, brown_ic)
    except (WordNetError, ZeroDivisionError):
        similarities['lin_similarity'] = None

    return similarities, lcs

## **Analyze similarities using the most frequent synsets**

In [None]:
# Function to analyze similarities using the most frequent synsets
def analyze_similarities_mfs(pairs, pos_mapping):
    """Compare every unordered pair of words via their most frequent synsets.

    Args:
        pairs: Sequence of ``(word, pos_tag)`` tuples.
        pos_mapping: Dict translating the custom POS tags to WordNet POS
            constants. A tag without an entry is looked up with ``None``
            as its POS.

    Returns:
        A list with one dict per word combination, with the keys ``'word1'``,
        ``'synset1'``, ``'word2'``, ``'synset2'``, ``'lcs'``,
        ``'similarities'`` and ``'message'``. When similarities could be
        computed, ``'lcs'`` and ``'similarities'`` are filled in and
        ``'message'`` is ``None``; otherwise both are ``None`` and
        ``'message'`` explains why.
    """
    results = []
    # Generate all unique combinations of word pairs
    for (word1, pos1), (word2, pos2) in combinations(pairs, 2):
        syn1 = get_most_frequent_synset(word1, pos_mapping.get(pos1))
        syn2 = get_most_frequent_synset(word2, pos_mapping.get(pos2))

        entry = {
            'word1': word1,
            'synset1': syn1,
            'word2': word2,
            'synset2': syn2,
            'lcs': None,
            'similarities': None,
            'message': None,
        }

        # Words without any synset cannot be compared; record that instead of
        # passing None on to the similarity computation.
        missing = [word for word, syn in ((word1, syn1), (word2, syn2)) if syn is None]
        if missing:
            quoted = " and ".join(f"'{word}'" for word in missing)
            entry['message'] = f"Cannot compute similarities because no WordNet synset was found for {quoted}."
        else:
            similarities, lcs = compute_similarities(syn1, syn2)
            if lcs:
                entry['lcs'] = lcs
                entry['similarities'] = similarities
            else:
                # No LCS found: keep the scores out of the result and explain why
                entry['message'] = f"Cannot compute similarities because the synsets for '{word1}' and '{word2}' do not have the same POS_tag and a least common subsumer (LCS) in WordNet."

        results.append(entry)
    return results

## **Show the results**

In [None]:
def display_results(results):
    """Print a human-readable report for a list of similarity results.

    Args:
        results: Iterable of dicts with the keys ``'word1'``, ``'synset1'``,
            ``'word2'``, ``'synset2'``, ``'lcs'``, ``'similarities'`` and
            ``'message'``. A synset may be ``None`` (shown as ``no synset``).

    For each entry the word pair is printed, followed either by the first
    LCS and the four similarity scores, or - when ``'lcs'`` is empty or
    ``None`` - by the entry's message.
    """
    separator = "-------------------------------------------------"

    def synset_label(synset):
        # A word may have no synset at all; do not call .name() on None
        return synset.name() if synset is not None else "no synset"

    for res in results:
        word1 = res['word1']
        word2 = res['word2']
        lcs = res['lcs']
        sims = res['similarities']

        print(f"\nWord Pair: '{word1}' ({synset_label(res['synset1'])}) - '{word2}' ({synset_label(res['synset2'])})")

        if lcs:
            print(f"Least Common Subsumer (LCS): {lcs[0].name()}")
            print("Similarity Measures:")
            print(f"  Path Similarity: {sims['path_similarity']}")
            print(f"  Leacock-Chodorow Similarity (Normalized): {sims['lch_similarity']}")
            print(f"  Wu-Palmer Similarity: {sims['wup_similarity']}")
            print(f"  Lin Similarity: {sims['lin_similarity']}")
        else:
            # No LCS available: print the explanatory message instead
            print("Least Common Subsumer (LCS): None")
            print(res['message'])
        print(separator)

# Compare the filtered pairs through their most frequent synsets and print the report
similarity_results_mfs = analyze_similarities_mfs(
    pairs=filtered_pairs,
    pos_mapping=pos_mapping,
)
display_results(results=similarity_results_mfs)


Word Pair: 'man' (man.n.01) - 'swim' (swim.v.01)
Least Common Subsumer (LCS): None
Cannot compute similarities because the synsets for 'man' and 'swim' do not have the same POS_tag and a least common subsumer (LCS) in WordNet.
-------------------------------------------------

Word Pair: 'man' (man.n.01) - 'girl' (girl.n.01)
Least Common Subsumer (LCS): adult.n.01
Similarity Measures:
  Path Similarity: 0.25
  Leacock-Chodorow Similarity (Normalized): 0.6102915062989643
  Wu-Palmer Similarity: 0.631578947368421
  Lin Similarity: 0.7135111237276783
-------------------------------------------------

Word Pair: 'man' (man.n.01) - 'boy' (male_child.n.01)
Least Common Subsumer (LCS): male.n.02
Similarity Measures:
  Path Similarity: 0.3333333333333333
  Leacock-Chodorow Similarity (Normalized): 0.6882778097361639
  Wu-Palmer Similarity: 0.6666666666666666
  Lin Similarity: 0.7294717876200584
-------------------------------------------------

Word Pair: 'man' (man.n.01) - 'woman' (woman.n.0

## **Analysing and Conclusions**

* The given pairs contain different lemmas together with their POS tags, but
since WordNet only covers the open classes (nouns, verbs, adjectives and adverbs) we do not have to consider all of them. So we filter the pairs into a new list containing only the lemmas whose POS tags are supported by WordNet.


* We need to find the most frequent synset for each lemma because words in natural language often have multiple meanings, and each distinct meaning is represented by a different synset in WordNet. In semantic similarity calculations (such as computing the Least Common Subsumer or other similarity metrics), the most frequent synset is generally assumed to represent the most common or likely meaning of the word in everyday usage. From the documentation of WordNet it is stated that the synsets are sorted based on their frequency, so we used the first one (index 0).

* Before computing similarities, we know that Leacock-Chodorow Similarity needs normalization. So to normalize it we have used a Min-Max approach that in our case would involve finding the minimum and maximum possible values of the LCH similarity, then scaling the actual LCH similarity to the [0, 1] range.
 1. The minimum possible LCH similarity occurs when the two synsets are maximally distant in the hierarchy, so we take $\mathrm{LCH}_{\min} = 0$ (`min_lch`).
    * In that case the shortest path length is $2 \cdot \mathrm{MaxDepth}$, and $-\log\left(\frac{2 \cdot \mathrm{MaxDepth}}{2 \cdot \mathrm{MaxDepth}}\right) = 0$.
 2. The maximum LCH similarity occurs when the two synsets are the same. In that case the shortest path length is 1, so the formula becomes:

    * $\mathrm{LCH}(s, s) = -\log\left(\frac{1}{2 \cdot \mathrm{MaxDepth}}\right) = \log(2 \cdot \mathrm{MaxDepth})$
 3. Normalize the Leacock-Chodorow similarity with the min-max formula:
    * $\mathrm{NormalizedLCH}(s_1, s_2) = \frac{\mathrm{LCH}(s_1, s_2) - \mathrm{LCH}_{\min}}{\mathrm{LCH}_{\max} - \mathrm{LCH}_{\min}}$,
      where, as stated above, $\mathrm{LCH}_{\min} = 0$ (`min_lch`) and $\mathrm{LCH}_{\max} = \log(2 \cdot \mathrm{MaxDepth})$ (`max_lch`).

  4. In order to find the absolute max depth we created the function above, which finds the max depth for each POS tag, because the max depth is different for different POS tags. For example, in our case the max depth was 20 for nouns and 13 for verbs, so we have to rely on these different values for the different POS tags. Since we are using an approximation for nouns, the normalized similarity of a noun with itself will not be exactly 1.0, but it is close to 1 (0.99).


* The next step is to compute the Least Common Subsumer (LCS) and all the similarities. Let's briefly explain what each of them does and try to find the best one for our case:

    1. ***LCS*** in WordNet is the most specific ancestor synset that two synsets share in the hypernym hierarchy. So we can say it is the most specific concept that is a hypernym of both synsets.
    2. ***Path Similarity*** measures the similarity between two synsets based on the shortest path connecting them in the WordNet hierarchy. The shorter the path, the more similar the synsets are assumed to be. Its limitation is that it treats all edges in the hierarchy equally, so a path of a given length between very specific synsets gets the same score as an equally long path between very general synsets.
    3. ***Leacock-Chodorow Similarity*** measures the similarity between two synsets based on the shortest path between them and the maximum depth of the taxonomy. It takes into account the overall size of the taxonomy and normalizes the path length based on it. It is better than path similarity at handling hierarchies of different sizes and complexity.
    4. ***Wu-Palmer*** calculates the similarity between two synsets based on the depth of LCS and the depths of the individual synsets, so two synsets are more similar if they share a common ancestor that is closer to them in the hierarchy.It focuses on the structural similarity of two synsets based on how far they are from their common ancestor, making it effective in capturing hierarchical relations.
    5. ***Lin similarity*** measures the similarity between two synsets based on their information content. The IC of a synset is derived from the probability of encountering that synset in a large corpus of text, where more specific synsets have higher information content. Lin similarity compares how much information is shared by the two synsets relative to their total information content. It takes into consideration the hierarchical structure of WordNet and statistical information from large corpora, making it sensitive to how commonly or rarely concepts are used in the language.
            * Lemmas that do not have the same POS tag also do not have a least common subsumer (LCS)
            in WordNet, so we cannot compute their similarities.


**Conclusion**

We think that Path similarity tends to give lower scores for very close synsets (for example, for girl and woman it gives only 0.5, which is low compared to the other measures). In contrast, Leacock-Chodorow similarity can be considered a better option compared to Path similarity, but since we also need to normalize it, it can take longer to compute.

We think that Wu-Palmer and Lin similarities tend to perform better on the given lemma pairs. Since Lin similarity is based on information content (IC), it may be the better choice when we are looking for a context-aware measure of how words are used in real-world texts. On the other hand, Wu-Palmer considers the hierarchical structure and the closeness in the taxonomy, hence it may be a good option when working with WordNet's structure.


