## Use Sense Vectors to identify Word sense

#### Library Imports

In [1]:
# Import Cell
import os
import sys
import time
import math
import re
from collections import defaultdict
import ast
from tqdm import tqdm
import multiprocessing

from gensim.models import KeyedVectors
import spacy
import pandas as pd
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 
import plotly.graph_objects as go
from plotly.io import write_image, write_html
import igraph
from igraph import Graph, EdgeSeq

Append the github repo to system path, and import modules from it

In [2]:
# Import Cell 2
sys.path.append("sensegram_package/")
import sensegram
from wsd import WSD

#### Constants

In [3]:
# Constants
load_vectors = True # Flag to decide to load sense and word vectors, to word with WSD module
load_data = True # Flag to decide to load raw data
generate_sense_embeddings = False # Flag to decide to use vectors to generate sense embeddings for each noun in corpus
generate_hypernymy_flag = True # Flag to decide to use wordnet to generate hypernymy map for each noun in the data

---------

#### Load data

In [4]:
# Data file paths
data_directory = os.path.join(os.getcwd(), "data")
outputs_directory = os.path.join(os.getcwd(), "output")

corpus_fpath = os.path.join(data_directory, "0.txt")
sense_vectors_fpath = os.path.join(data_directory, "model", "0.sense_vectors")
# word_vectors_fpath = os.path.join(data_directory, "model", "word2vec_twitter_tokens.word_vectors.cbow1-size300-window5-iter5-mincount10-bigramsFalse.word_vectors")
word_vectors_fpath = os.path.join(data_directory, "model", "0.word_vectors")
corpus_data_sense_mappings_file = os.path.join(data_directory, "corpus_sense_mapping.csv")
hhm_mappings_file = os.path.join(data_directory, "hhm_mappings.csv")

Load the sense and word vector files. This may take some time, owing to the large file size of the vector files

In [5]:
# Vector & corpus loader
if load_vectors:
    s = time.time()

    if os.path.exists(sense_vectors_fpath) and os.path.exists(word_vectors_fpath):
        sense_vectors = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath, binary=False)
        word_vectors = KeyedVectors.load_word2vec_format(word_vectors_fpath, binary=False, unicode_errors="ignore")
        print(f"Took {time.time()-s}seconds to load vector files")
    else:
        print("Could not find vector files. Check file paths and ensure the right files exists")
    del s

if load_data:
    print("Reading the corpus now!")
    with open(corpus_fpath, "r") as f:
        corpus_data = f.read()

Took 10.56437087059021seconds to load vector files
Reading the corpus now!


#### Get all senses of a word

Using the sense vectors, load all possible senses for the given word.  
The output prints the sense *Word#&lt;sense-number&gt;* followed by the probabilities of that word matching other words with similar sense. This table can help us provide logical names of different sense groups. For example, running the code for the word "**table**" gives the following output -  
```
Probabilities of the senses:
[('Table#1', 1.0), ('Table#2', 1.0), ('Table#3', 1.0), ('Table#4', 1.0), ('table#1', 1.0), ('table#2', 1.0), ('table#3', 1.0), ('table#4', 1.0)]


Table#1
====================
table#1 0.996316
TABLE#1 0.993647
PAGE#2 0.989991
page#2 0.989991
WINDOW#2 0.989900
Window#3 0.989900
window#2 0.989900
Scale#2 0.989745
scale#2 0.989745
SCALE#2 0.989745


Table#2
====================
TABLE#2 1.000000
Row#3 0.869726
row#3 0.869726
ROW#3 0.856643
Stack#3 0.829349
Box#3 0.826571
BOX#2 0.826571
stack#3 0.825068
STACK#3 0.824239
BOWL#3 0.813412


Table#3
====================
TABLE#3 0.939938
table#3 0.934190
Boundary_Markers#5 0.845184
Catchment_Basins#2 0.826906
contents#2 0.825448
CONTENTS#2 0.825448
Contents#2 0.824324
tables#1 0.806271
NUMBERS#3 0.804637
Tables#1 0.796628
....

```
Few things we can see from the output - 
* Since we have set *ignore_case=True*, the output shows 4 senses for *Table*, and 4 for *table*.
* Looking at the related words for each sense, we can attribute the following logical groups to few of the senses - 
    - Table#2 - Data table.  
    - Table#3 - Table of contents.
    - table#4 - Hotel/Furniture.
    


In [6]:
# Sense extraction helper function
def print_word_senses(word, corpus_data, sense_vectors, ignore_case=False, only_in_corpus=False):
    senses = sense_vectors.get_senses(word, ignore_case=ignore_case)
    
    print("Probabilities of the senses:\n{}\n\n".format(senses))
    for sense_id, prob in senses:
        print(sense_id)
        print("="*20)
        for rsense_id, sim in sense_vectors.wv.most_similar(sense_id):
            sense_word = rsense_id.split("#")[0]
            sense_word = sense_word if ignore_case else sense_word.lower()
            count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(sense_word), corpus_data))
            if only_in_corpus and count==0:
                continue
            print(f"{rsense_id} {round(sim,4)} [{count}]")
        print("\n")

In [7]:
# Display senses of word
if load_vectors:
    test_word = "chat"
    print_word_senses(test_word, corpus_data, sense_vectors, ignore_case=True, only_in_corpus=True)

Probabilities of the senses:
[('chat#1', 1.0), ('chat#2', 1.0), ('chat#3', 1.0), ('chat#4', 1.0)]


chat#1
lesson#1 0.873 [123]
session#1 0.8582 [341]
presentation#1 0.846 [190]
lecture#1 0.8419 [105]
meeting#1 0.8344 [863]
workshop#1 0.8329 [113]
event#1 0.8145 [533]
discussion#1 0.8101 [172]
interview#1 0.8054 [658]
conversation#1 0.7903 [271]


chat#2
link#1 0.9341 [1197]
message#1 0.9073 [459]
thanks#2 0.907 [11141]
reply#1 0.9064 [459]
me#2 0.8931 [23831]
emails#1 0.8743 [308]
email#1 0.8723 [1610]
tweet#1 0.8705 [2553]
comment#1 0.8681 [584]
someone#4 0.8618 [2245]


chat#3
begin#1 0.9075 [156]
talk#1 0.8921 [1253]
meet#1 0.8913 [999]
forward#2 0.8897 [1527]
deal#2 0.8754 [496]
listen#1 0.8742 [747]
start#1 0.8595 [1684]
celebrate#2 0.8588 [166]
visit#1 0.8474 [538]
see#1 0.8465 [8681]


chat#4
talk#2 0.9712 [1253]
talked#2 0.9453 [114]
ranting#1 0.9271 [37]
rant#1 0.9125 [94]
talking#1 0.8902 [1129]
hearing#1 0.8849 [262]
moan#2 0.8848 [44]
fuss#1 0.8806 [35]
complained#1 0.856 

#### Get disambiguated sense of the word, using corpus as context

##### Input
To understand the word's sense in a given context, we use the *WSD* class from the sensegram library.  
The WSD model takes the following key parameters to decide word sense based on corpus context - 
* vectors - Both sense and word vector models loaded earlier.  
  
  
* method - To calculate the sense of the word, the library averages the sense scores of all the surrounding context words and compares it with different senses of the target word. For this comparison, there are two available metrics - 
 - sim: Uses cosine distance
 - prob: Use log probability score  
  
  
* window - This is the window(±) that the model looks into, to decide the word context.   
For example, if our target word is *table*,   
with the context of *"we load the our data into a data-frame table object and count the number of rows/columns using the .shape method"*  
 1. a window of 3 would consider the following 6(3 on the left, and 3 on the right) words around our context word to find the sense of the word - *into, a, data-frame, object, and, count*  
 2. a window of 5 would use the following context - *our, data, into, a, data-frame, table, object, and, count, the, number*  
  
  
* verbose - Allows to print intermediate outputs while running the disambiguation code

<hr>  
     
Some food-for-thought regarding the usage of WSD module - 
 - Do note that while stopwords like *and* and *the* are considered in the context of the the target word, they are dropped while disambiguating the sense of our target.
 - While it may seem ideal to choose a high value of window for getting the sense of the target word, it may happen that the wider window results in an less accurate output, as it averages across all possible senses.
 - The library considers, and disambiguates, only the first occurance of the target word in the context. For a large corpus, it would be ideal to first split the corpus and generate contexts using an external helper function, and then iteratively get the sense for the target word across all occurances in the corpus.

In [8]:
# Loading Word Sense Disambiguation model
if load_vectors:
    wsd_model = WSD(sense_vectors, word_vectors, window=15, method='prob', verbose=True)
else:
    print("Load vectors to initialize and work with WSD model")

In [9]:
# Extracting disambiguates senses
if load_vectors:
    print(wsd_model.disambiguate(corpus_data, "morning"))
else:
    print("Load vectors to initialize and work with WSD model")

Extracted context words:
['sell', 'yes', 'Im', 'bath', 'tell', 'haha', 'amazing', 'please', 'shes', 'list', 'captured', 'pants', 'uk', 'cctv']
Senses of a target word:
[('morning#1', 1.0)]
Significance scores of context words:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Context words:
sell	0.000
yes	0.000
Im	0.000
('morning#1', [0.6330168541350161])


<h5> Output </h5>  

Running the Sense disambiguation code generates following lines of output -  
1. Prints the context words extracted from the corpus.
- Prints possible senses of the word, with their respective probabilities(without considering the context)
- Prints the significance score of each context word.
- Prints the most significant context words.
- **Returns** a tuple of the sense of the word as derived from the context, and match scores(log-probability or cosine-similarity depending on the *method* chosen) of various senses of the target word.  
For instance, the output *('table#2', [0.2706353009709064, 0.9591583572384959, 0.40617065436041355, 0.6940131864117054])* indicates the following things regarding our target word -  
    - The closest sense of our target word is with *table#2*, with a match score of 0.959(second in the list)
    - For the other senses, the match score can be read as follows - 
    - table#1 - 0.2706
    - table#2 - 0.959
    - table#3 - 0.406
    - table#4 - 0.694

Since we had not defined the *ignore_case* argument while initializing the WSD model, it resorts to the default of True, and the output return scores for the 4 senses of the word *table*.  
If we chose to ignore case, the output would have match for 8 senses(4-Table; 4-table)
<hr> 

In [10]:
corpus_data.count("morning")

3086

#### Generate sense embeddings from corpus

In [11]:
def prepare_corpus_data_with_context(corpus_filepath, window=5, force_update=False):
    nlp = spacy.load('en')
    with open(corpus_filepath, "r") as f:
        corpus_data = f.readlines()
    out_file = os.path.join(os.path.dirname(corpus_filepath), f"window_{window}_context_corpus.txt")
    contextual_data = []
    if os.path.exists(out_file) and (not force_update):
        print("Found preexisting file. Loading context data from file")
        with open(out_file, "r") as f:
            contextual_data = f.readlines()
    else:
        print("No pre created corpus found, or force update flag is true. Generating contextual data from corpus")
        with open(out_file, "w") as f:
            for txt_line in corpus_data:
                nlp_data = nlp(txt_line.replace("\n", "").replace("-", ""))
                max_tokens = len(nlp_data)
                for i, tok in enumerate(nlp_data):
                    if "NN" in tok.tag_:
                        start = max(0, i-window)
                        end = min(i+window, max_tokens)
                        left_context = [t.text for t in nlp_data[start:i]] + [f"<{tok.text}>"]
                        right_context = [t.text for t in nlp_data[i+1:end]]
                        noun_in_context = f"<{tok.text}> - {' '.join(left_context + right_context)}"
                        contextual_data.append(noun_in_context)
                        f.write(noun_in_context + "\n")
    return contextual_data


In [12]:
def get_sense_group_from_corpus(context_data, wsd_model, sense_vectors):
    output = {
        "word": [],
        "context": [],
        "sense_id": [],
        "sense_group_name": [],
        "sense_group_num": [],
        "sense_probability": [],
        "related_senses": []
    }
    
    for row in tqdm(context_data, total=len(context_data)):
        word, ctx = row.split(' - ')
        sense_id, sense_probs = wsd_model.disambiguate(ctx, word)
        sense_probability = max(sense_probs)
        sense_group_name, sense_group_num = sense_id.split("#")
        try:
            related_senses = [r_senseid for r_senseid,_ in sense_vectors.wv.most_similar(sense_id)]
            related_senses_l2 = [r_senseid for related_sense in related_senses for r_senseid,_ in sense_vectors.wv.most_similar(related_sense)]
        except KeyError as e:
            print(f"Could not get related senses for {sense_id}")
            related_senses = []
            related_senses_l2 = []
        output["word"].append(word)
        output["context"].append(ctx)
        output["sense_id"].append(sense_id)
        output["sense_group_name"].append(sense_group_name)
        output["sense_group_num"].append(sense_group_num)
        output["sense_probability"].append(sense_probability)
        output["related_senses"].append(related_senses+related_senses_l2)
    
    output_df = pd.DataFrame(output)
    
    return output_df

In [1]:
def worker_func(p_name, tasks, results):
    while True:
        indices = tasks.get()
        if indices == None:
            print('[%s] evaluation routine quits' % p_name)

            # Indicate finished
            results.put("Done")
            break
        else:
            # Compute result and mimic a long-running task
            batch_df = get_sense_group_from_corpus(contextual_data[indices[0]:indices[1]], wsd_model2, sense_vectors)

            # Output which process received the value
            # and the calculation result
            print(f"{p_name} recieved computation of batch {indices}")

            # Add result to the queue
            results.put(batch_df)

In [13]:
window = 15

In [14]:
# Use WSD model to create file with Sense mappings
if (not os.path.exists(corpus_data_sense_mappings_file)) or generate_sense_embeddings:
    manager = multiprocessing.Manager()
    tasks = manager.Queue()
    results = manager.Queue()
    num_processes = os.cpu_count()
    pool = multiprocessing.Pool(processes=num_processes)
    processes = []
    
    base_df = pd.DataFrame({
        "word": [],
        "context": [],
        "sense_id": [],
        "sense_group_name": [],
        "sense_group_num": [],
        "sense_probability": [],
        "related_senses": []
    })
    wsd_model2 = WSD(sense_vectors, word_vectors, window=window, method='prob', verbose=False)
    contextual_data = prepare_corpus_data_with_context(corpus_fpath, window=window, force_update=True)
    num_lines = len(contextual_data)
    batch_size = int(num_lines/num_processes)+1
    
    for i in range(num_processes):
        proc_name = f"P_{i}"
        start_idx = i*batch_size
        end_idx = min(num_lines, batch_size*(i+1))
        tasks.put([start_idx, end_idx])
        new_process = multiprocessing.Process(target=worker_func, args=(proc_name,tasks,results))
        processes.append(new_process)
        new_process.start()
        
    for i in range(num_processes):
        tasks.put(None)
    
    num_finished_processes = 0
    while True: 
        new_result = results.get()
        if new_result == "Done":
            # Process has finished
            num_finished_processes += 1
            if num_finished_processes == num_processes:
                break
        else:
            output_df = output_df.append(new_result)

    output_df.to_csv(output_file, index=False)
else:
    sense_group_dataframe = pd.read_csv(corpus_data_sense_mappings_file)

In [15]:
sense_group_dataframe.head()

Unnamed: 0,word,context,sense_id,sense_group_name,sense_group_num,sense_probability,related_senses
0,<unread>,I did that too think its best <unread>,unread#1,unread,1,0.426093,"['exceeded#1', 'renewed#1', 'countless#1', 're..."
1,<shopping>,<shopping> list for baby penguins and seals co...,shopping#1,shopping,1,0.752219,"['gym#1', 'xmas#1', 'finish#2', 'drinks#1', 'p..."
2,<list>,shopping <list> for baby penguins and seals co...,list#1,list,1,0.563295,"['website#1', 'tumblr#1', 'page#1', 'facebook#..."
3,<baby>,shopping list for <baby> penguins and seals co...,baby#1,baby,1,0.782509,"['brother#1', 'yo#2', 'mummy#1', 'hubby#1', 'b..."
4,<penguins>,shopping list for baby <penguins> and seals co...,penguins#1,penguins,1,0.802287,"['ducks#1', 'scones#1', 'burgers#1', 'skirts#1..."


-----

#### Hypernymy extraction

In [16]:
# Constants for creating HHM map
weights = {
    "direct_match": 8,
    "direct_nomatch": 5,
    "l1_match": 3,
    "l1_nomatch": 2,
    "l2_match": 1,
    "l2_nomatch": 0.5,
}

nlp = spacy.load("en")
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

In [17]:
def generate_hypernymy(word, idx, hypernymy_dict, level="l1"):
    token = nlp(str(word))[0]
    synsets = token._.wordnet.synsets()
    for syn in synsets:
        if syn.name().split(".n.")[0] == word and int(syn.name().split(".n.")[1]) == idx:
            match_str = "match"
        else:
            match_str = "nomatch"

        hypernym_syn = syn.hypernyms()
        hyponym_syn = syn.hyponyms()
        meronym_syn = syn.part_meronyms()
        if len(hypernym_syn) >0:
            hypernym = hypernym_syn[0].name().split('.')[0]
            rev_map = {
#                 f"{word}#{idx}":syn,
                "hyponyms": [hyp.name().split(".")[0] for hyp in hyponym_syn],
                "meronyms": [mero.name().split(".")[0] for mero in meronym_syn]
            }
            
            if level in hypernymy_dict[hypernym]["rev_map"]:
                hypernymy_dict[hypernym]["rev_map"][level].append(rev_map) 
            else:
                hypernymy_dict[hypernym]["rev_map"][level] = [rev_map]

            if "sum_weight" in hypernymy_dict[hypernym]:
                hypernymy_dict[hypernym]["sum_weight"] += weights[f"{level}_{match_str}"]
            else:
                hypernymy_dict[hypernym]["sum_weight"] = weights[f"{level}_{match_str}"]

In [18]:
def extract_hyponyms_meronym(hypernymy_dict, hypernym, kind="hyponyms"):
    nym = []
    reverse_map = hypernymy_dict[hypernym]['rev_map']
    extract_from = []
    if "direct" in reverse_map:
        extract_from.append("direct")
    if "l1" in reverse_map:
        extract_from.append("l1")
    
    for relation in extract_from:
        num_relations = len(reverse_map[relation])
        for i in range(num_relations):
            nym += reverse_map[relation][i][kind]
            
    return list(set(nym))
            

In [None]:
# Generate HHM flag
if generate_hypernymy_flag:
    all_hypernyms = []
    all_hhm_map = []
    all_hyponyms = []
    all_meronyms = []

    all_hypernyms_from_corpus = []
    all_hyponyms_from_corpus = []
    all_meronyms_from_corpus = []
    num_words = len(sense_group_dataframe)
    for _, row in tqdm(sense_group_dataframe[["sense_group_name", "sense_group_num", "related_senses"]].iterrows(), total=num_words):
        word, idx, r_senses = row

        hypernymy_dict = defaultdict(lambda: defaultdict(dict))
        
        generate_hypernymy(word, idx, hypernymy_dict, level="direct")
        if type(r_senses) == str:
            r_senses_list = ast.literal_eval(r_senses)
        else:
            r_senses_list = r_senses
        for w in r_senses_list:
            related_word, related_idx = str(w).split("#")
            if related_word in corpus_data:
                if len(related_word.strip())>0:
                    generate_hypernymy(related_word, related_idx, hypernymy_dict)
        hypernyms = []
        hyponyms = []
        meronyms = []   
        hypernyms_from_corpus = []
        hyponyms_from_corpus = []
        meronyms_from_corpus = []
        hypernymy_dict_sorted = {}
        if len(hypernymy_dict)>0:
            hypernymy_dict_sorted = {k: dict(v) for k, v in sorted(hypernymy_dict.items(), key=lambda item: item[1]['sum_weight'], reverse=True)}

            hypernyms = list(hypernymy_dict_sorted.keys())
            for hypernym in hypernyms:
                extracted_hyponyms = extract_hyponyms_meronym(hypernymy_dict_sorted, hypernym, kind="hyponyms")
                extracted_meronyms = extract_hyponyms_meronym(hypernymy_dict_sorted, hypernym, kind="meronyms")
                hyponyms.append(extracted_hyponyms)
                meronyms.append(extracted_meronyms)
                
                if hypernym.replace("_", " ") in corpus_data:
                    hypernyms_from_corpus.append(hypernym)
                    extracted_hyponyms_from_corpus = [hypo for hypo in extracted_hyponyms if hypo.replace("_", " ") in corpus_data]
                    extracted_meronyms_from_corpus = [mero for mero in extracted_meronyms if mero.replace("_", " ") in corpus_data]
                    hyponyms_from_corpus.append(extracted_hyponyms)
                    meronyms_from_corpus.append(extracted_meronyms)

        all_hypernyms.append(hypernyms)
        all_hyponyms.append(hyponyms)
        all_meronyms.append(meronyms)
        all_hypernyms_from_corpus.append(hypernyms_from_corpus)
        all_hyponyms_from_corpus.append(hyponyms_from_corpus)
        all_meronyms_from_corpus.append(meronyms_from_corpus)
        all_hhm_map.append(hypernymy_dict_sorted)

  2%|▏         | 17666/1043372 [14:49:33<1320:50:50,  4.64s/it]

In [None]:
# Update HHM data with mappings
if generate_hypernymy_flag:
    sense_group_dataframe["hhm_map"] = all_hhm_map
    sense_group_dataframe["hypernym"] = all_hypernyms
    sense_group_dataframe["hyponym"] = all_hyponyms
    sense_group_dataframe["meronym"] = all_meronyms
    sense_group_dataframe["hypernym_from_corpus"] = all_hypernyms_from_corpus
    sense_group_dataframe["hyponym_from_corpus"] = all_hyponyms_from_corpus
    sense_group_dataframe["meronym_from_corpus"] = all_meronyms_from_corpus
    sense_group_dataframe.to_csv(hhm_mappings_file, index=False)
else:
    print("Neither generate hypernymy flag, nor hhm mappings file is present. Set the appropriate flags to proceed further")
    
hhm_mappings_df = pd.read_csv(hhm_mappings_file)

In [None]:
# hhm_mappings_file = os.path.join("hhm_mappings.csv")
# sense_group_dataframe = pd.read_csv(hhm_mappings_file)

In [None]:
hhm_mappings_df.head()

--------

#### Hypernymy Graph

Helper functions - 
* extract_hypo_mero_map: Extracts mappings of hypernym-hyponym and meronym for selected word. Set the *only_from_corpus* flag True, to extract and maintain only those words that exist in the corpus.
* generate_plot_indices - Generates plot indices for nodes and edges to be used while plotting.
* make_annotations - Makes annotations to the plot.
* generate_plot - Main function that uses the last 3 helper functions to generate plot for hhm mappings and shows them.

**NOTE** - If the plot does not show up in the ouput, restart kernel and clear output, and rerun the cells. This is a known issue of plotly with IPython.

In [None]:
def extract_hypo_mero_map(word, hhm_mappings_df, only_from_corpus=True):
    subset_df = hhm_mappings_df[hhm_mappings_df["sense_group_name"]==word]
    
    rows_to_consider = ["hypernym", "hyponym", "meronym"]
    if only_from_corpus:
        rows_to_consider = ["hypernym_from_corpus", "hyponym_from_corpus", "meronym_from_corpus"]
    
    hypo_mero_map = {}
    for _, row in subset_df[rows_to_consider].iterrows():
        hyper_str, hypo_str, mero_str = row
        hyper_list = ast.literal_eval(hyper_str)
        hypo_list = ast.literal_eval(hypo_str)
        mero_list = ast.literal_eval(mero_str)
        
        for hypernym, hyponyms, meronyms in zip(hyper_list, hypo_list, mero_list):
            
            if hypernym in hypo_mero_map:
                hypo_mero_map[hypernym]["hyponyms"] =  list(set(hypo_mero_map[hypernym]["hyponyms"] + hyponyms))
                hypo_mero_map[hypernym]["meronyms"] =  list(set(hypo_mero_map[hypernym]["meronyms"] + meronyms))
            else:
                hypo_mero_map[hypernym] = {}
                hypo_mero_map[hypernym]["hyponyms"] =  list(set(hyponyms))
                hypo_mero_map[hypernym]["meronyms"] =  list(set(meronyms))
    
    if len(hypo_mero_map) == 0:
        print(f"Could not find any hypernyms for {word}. Try changing the only from corpus flag. Or try with another word.")
        sys.exit(0)
    return hypo_mero_map

In [None]:
def generate_plot_indices(word, hypo_mero_map, top_n=3):   
    max_groups = min(top_n, len(hypo_mero_map))
    
    hyper_start = 0
    word_y = 4
    word_x = math.ceil(max_groups/2) if max_groups>1 else 0
    hyper_y = 6
    hyper_x = 0
    hypo_x = 0
    hypo_y = 2
    mero_x = 0
    mero_y = 0
    label_x = -2
    
    position = {
        "Hypernyms ->": [label_x, hyper_y],
        "Word ->": [label_x, word_y],
        "Hyponyms ->": [label_x, hypo_y],
        "Meronyms ->": [label_x, mero_y],
        word: [word_x, word_y]
    }
    edges = []
    
    keys = list(hypo_mero_map.keys())[:max_groups]
    for hyper in keys:
        hypo = hypo_mero_map[hyper]['hyponyms']
        mero = hypo_mero_map[hyper]['meronyms']
        
        for i in range(len(hypo)):
            if i!=0 and i%3==0:
                hypo.insert(i, "__placeholder__")

        for i in range(len(mero)):
            if i!=0 and i%3==0:
                mero.insert(i, "__placeholder__")
        
        if hyper==word:
            hyper += "_hypernym"
        
        position[hyper] = [hyper_x, hyper_y]
        edges.append((word, hyper))
        hyper_x +=2

        hypo_group = ', '.join(hypo)
        position[hypo_group] = [hypo_x, hypo_y]
        edges.append((word, hypo_group))
        hypo_x +=2
        
        if len(mero)>0:
            mero_group = ', '.join(mero)
            position[mero_group] = [mero_x, mero_y]
            edges.append((hypo_group, mero_group))
            mero_x +=2
        
    return position, edges



In [None]:
def make_annotations(pos, text, font_size=10, font_color='rgb(0,0,0)'):
    L=len(pos)
    if len(text)!=L:
        raise ValueError('The lists pos and text must have the same len')
    annotations = []
    labels = [l.replace(", __placeholder__,","<br>") for l in text]
    for n, k in enumerate(list(pos.keys())):
        annotations.append(
            dict(
                text=labels[n], # or replace labels with a different list for the text within the circle
                x=pos[k][0], y=pos[k][1],
                xref='x1', yref='y1',
                font=dict(color=font_color, size=font_size),
                showarrow=False)
        )
    return annotations


In [None]:
def save_plot(word, fig, save_format, top_n):
    if not save_format in ["svg", "pdf", "html", "png", "jpg"]:
        print(f"Unsupported file format {save_format}. Could not save to file")
        sys.exit(0)
    out_dir = os.path.join(outputs_directory, save_format)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_filepath = os.path.join(outputs_directory, save_format, f"{word}.{save_format}")
    
    if save_format in ["svg", "pdf", "png", "jpg"]:
        write_image(fig, output_filepath)
    else:
        write_html(fig, output_filepath)

In [None]:
def generate_plot(word, hhm_mappings_df, only_in_corpus=False, top_n=3, save_format="svg"):
    hypo_mero_map = extract_hypo_mero_map(word, hhm_mappings_df, only_from_corpus=only_in_corpus)
    position, edges = generate_plot_indices(word, hypo_mero_map, top_n)
    
    labels = list(position.keys())
    hover_labels = [l.replace(", __placeholder__", "") for l in labels]
    
    Xn = [position[k][0] for k in labels]
    Yn = [position[k][1] for k in labels]

    Xe = []
    Ye = []
    for edge in edges:
        Xe+=[position[edge[0]][0],position[edge[1]][0], None]
        Ye+=[position[edge[0]][1],position[edge[1]][1], None]
        
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=Xe,
                       y=Ye,
                       mode='lines',
                       line=dict(color='rgb(210,210,210)', width=1),
                       hoverinfo='none'
                       ))
    fig.add_trace(go.Scatter(y=Yn,
                      x=Xn,
                      mode='markers',
                      marker=dict(symbol='circle-dot',
                                    size=0.0001,
                                    color='#6175c1',    #'#DB4551',
                                    line=dict(color='rgb(50,50,50)', width=1)
                                    ),
                      text=hover_labels,
                      hoverinfo='text',
                      opacity=0.8
                      ))
    axis = dict(showline=False, # hide axis line, grid, ticklabels and  title
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            )

    fig.update_layout(title= f'Hypernym-Hyponym-Meronym Tree - {word}',
                  annotations=make_annotations(position, labels),
                  font_size=12,
                  showlegend=False,
                  xaxis=axis,
                  yaxis=axis,
                  margin=dict(l=40, r=40, b=85, t=100),
                  hovermode='closest',
                  plot_bgcolor='rgb(248,248,248)'
                  )
    print("Save plot to file")
    save_plot(word, fig, save_format, top_n)
    fig.show()


Change the word to one of your choice and re-run to generate graph for that word.
Arguments - 
* *word* - The word for which HHM mappings are to be extracted.
* *hhm_mappings_df* - HHM mappings dataframe created in the previous step.
* *only_in_corpus* - Flag to be set if the output graph should only show the words that are there in the corpus. This will limit the hypernym, hyponyms and meronyms shown for the focus word.
* *top_n* - Top *N* groups to be plotted. Default 3. If the number of HHM groups are less than top_n, then all those groups will be shown.

In [None]:
word = "slices"
generate_plot(word, hhm_mappings_df, only_in_corpus=False, top_n=3)