In [2]:
!pip freeze > requirements.txt

In [26]:
import spacy
import os
import pandas as pd
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 
from collections import defaultdict

###### Constants

In [2]:
window = 15

In [4]:
data_directory = os.path.join(os.getcwd(), "data")
corpus_fpath = os.path.join(data_directory, "corpus.txt")
out_file = os.path.join(data_directory, "noun_context_corpus.txt")

In [5]:
with open(corpus_fpath, "r") as f:
    corpus_data = f.readlines()


In [6]:
with open(out_file, "w") as f:
    for txt_line in corpus_data:
        nlp_data = nlp(txt_line.replace("\n", ""))
        max_tokens = len(nlp_data)
        for i, tok in enumerate(nlp_data):
            if "NN" in tok.tag_:
                start = max(0, i-window)
                end = min(i+window, max_tokens)
                left_context = [t.text for t in nlp_data[start:i]]
                right_context = [t.text for t in nlp_data[i:end]]
                noun_in_context = " ".join(left_context + right_context)
                f.write(f"{tok.text} - {noun_in_context}\n")

In [7]:
nlp_data[2]

I

In [8]:
def prepare_corpus_data_with_context(corpus_filepath, window=5, force_update=False):
    nlp = spacy.load('en')
    with open(corpus_fpath, "r") as f:
        corpus_data = f.readlines()
    out_file = os.path.join(os.path.dirname(corpus_fpath), f"window_{window}_context_corpus.txt")
    contextual_data = []
    if os.path.exists(out_file) and (not force_update):
        print("Found preexisting file. Loading context data from file")
        with open(out_file, "r") as f:
            contextual_data = f.readlines()
    else:
        print("No pre created corpus found, or force update flag is true. Generating contextual data from corpus")
        with open(out_file, "w") as f:
            for txt_line in corpus_data:
                nlp_data = nlp(txt_line.replace("\n", ""))
                max_tokens = len(nlp_data)
                for i, tok in enumerate(nlp_data):
                    if "NN" in tok.tag_:
                        start = max(0, i-window)
                        end = min(i+window, max_tokens)
                        left_context = [t.text for t in nlp_data[start:i]] + [f"<{tok.text}>"]
                        right_context = [t.text for t in nlp_data[i+1:end]]
                        noun_in_context = f"<{tok.text}> - {' '.join(left_context + right_context)}"
                        contextual_data.append(noun_in_context)
                        f.write(noun_in_context + "\n")
    return contextual_data


In [9]:
contextual_data = prepare_corpus_data_with_context(corpus_fpath, window, force_update=True)

No pre created corpus found, or force update flag is true. Generating contextual data from corpus


In [10]:
contextual_data

['<english> - can not stop listening to what have the <english> by making my saturday hangover that little bit brighter',
 '<saturday> - can not stop listening to what have the english by making my <saturday> hangover that little bit brighter',
 '<bit> - not stop listening to what have the english by making my saturday hangover that little <bit> brighter',
 '<one> - the <one> there is amazing had such a laugh when I went the other week',
 '<laugh> - the one there is amazing had such a <laugh> when I went the other week',
 '<week> - the one there is amazing had such a laugh when I went the other <week>',
 '<ohgod> - <ohgod>',
 '<bentley> - <bentley> fuck off haha d I have dainty feet',
 '<dainty> - bentley fuck off haha d I have <dainty> feet',
 '<feet> - bentley fuck off haha d I have dainty <feet>',
 '<drinking> - alright well let you off just had an impromptu <drinking> night myselfhehe Smile',
 '<night> - alright well let you off just had an impromptu drinking <night> myselfhehe Smi

In [32]:
def get_sense_group_from_corpus(context_data, wsd_model, sense_vectors, output_file):
    output = {
        "word": [],
        "context": [],
        "sense_id": [],
        "sense_group_name": [],
        "sense_group_num": [],
        "sense_probability": [],
        "related_senses": []
    }
    
    for row in context_data:
        word, ctx = row.split(' - ')
        sense_id, sense_probs = wsd_model.disambiguate(corpus_data, test_word)
        sense_probability = max(sense_probs)
        sense_group_name, sense_group_num = sense_id.split("#")
        related_senses = [r_senseid for r_senseid,_ in sense_vectors.wv.most_similar(sense_id)]
        related_senses_l2 = [r_senseid for r_senseid,_ in sense_vectors.wv.most_similar(related_sense) for related_sense in related_senses]
        output["word"].append(word)
        output["context"].append(ctx)
        output["sense_id"].append(sense_id)
        output["sense_group_name"].append(sense_group_name)
        output["sense_group_num"].append(sense_group_num)
        output["sense_probability"].append(sense_probability)
        output["related_senses"].append(related_senses+related_senses_l2)
    
    output_df = pd.DataFrame(output)
    output_df.to_csv(output_file, index=False)
    
    return output_df

In [2]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/copa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
nlp = spacy.load("en")
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

In [5]:
token = nlp('prices')[0]

In [16]:
token._.wordnet.synsets()[0].hyponyms()

[Synset('assessment.n.03'),
 Synset('average_cost.n.01'),
 Synset('expensiveness.n.01'),
 Synset('inexpensiveness.n.01'),
 Synset('marginal_cost.n.01')]

In [22]:
token._.wordnet.synsets()[0].part_meronyms()  

[]

#### HHM Algo - 

###### For hypernym extraction
1. Create empty dict of hypernyms  -   
    ```
    {
	"word_num": [{
		"hypernym_1": {
			"rev_map": {
				"direct": [{
					"sense_id": synset_object
				}],
				"L1": [{}],
				"L2": [{}]
			},
			"weight": num
		},
		"hypernym_2": {}
	}]}
    ```
- For each word in data 
    - extract hypernym from synset mappings for word#number - Highest weight(8)
    - extract hypernym for all other synset mappings(5)
    - For each related word level 1 
        - Extract hypernym for word#number - weight(3)
        - Extract hypernym for all others - weight(2)
    - For each related word level 2
        - Extract hypernym for word#number - weight(1)
        - Extract hypernym for all others - weight(0.5)
    - Keep adding the hypernym weight to the right key. 
    - Identify the right hypernym - One with highest weight and present in data, else next highest, and so on.
    - maintain a reverse map for each hypernym(for hyponymy calculation)

##### For hyponymy
1. Initialize dict for hyponyms - ```{word: [hyponym1, hyponym2, ...]}```
- Use rev map created in previous step to identify hyponyms of the noun under consideration.
- For each entry in rev_map, get hyponyms from the synset object. Maintain the direct, L1, L2 structure.
- Extract and preserve hyponyms that exist in the data

##### For meronymy
1. Use hypernymy-hyponymy relation to create a tree structure. 
2. For each level of the tree, identify meronyms for each node, and preserve common meronyms, that are also present in the data.

#### Hypernymy extractor

In [43]:
weights = {
    "direct_match": 8,
    "direct_nomatch": 5,
    "l1_match": 3,
    "l1_nomatch": 2,
    "l2_match": 1,
    "l2_nomatch": 0.5,
}

In [2]:
import spacy
import os
import pandas as pd
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 

nlp = spacy.load("en")
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')


df = pd.read_csv("corpus_sense_mapping.csv")
df.head()

Unnamed: 0,word,context,sense_id,sense_group_name,sense_group_num,sense_probability,related_senses
0,<english>,can not stop listening to what have the <engli...,english#6,english,6,0.962645,"['English#6', 'ENGLISH#6', 'bengali#1', 'Sinha..."
1,<saturday>,can not stop listening to what have the englis...,saturday#1,saturday,1,0.985751,"['Saturday#2', 'SATURDAY#2', 'SUNDAY#3', 'sund..."
2,<bit>,not stop listening to what have the english by...,bit#6,bit,6,0.98774,"['Bit#5', 'BIT#5', 'stuff#3', 'Stuff#2', 'Real..."
3,<one>,the <one> there is amazing had such a laugh wh...,one#3,one,3,0.969355,"['One#3', 'ONE#3', 'List#10', 'Top#8', 'TOP#9'..."
4,<laugh>,the one there is amazing had such a <laugh> wh...,laugh#1,laugh,1,0.914474,"['LAUGH#1', 'Laugh#2', 'giggle#1', 'Giggle#1',..."


In [14]:
hypernymy_map = {}
for _, row in df[["sense_group_name", "sense_group_num", "related_senses"]].iterrows():
    word, idx, r_senses = row
    print(f"{word} - {r_senses}")

english - ['English#6', 'ENGLISH#6', 'bengali#1', 'Sinhala#3', 'konkani#1', 'kerala#4', 'Oriya#2', 'telugu#2', 'Assamese#1', 'tamil#4', 'english#6', 'ENGLISH#6', 'bengali#1', 'Sinhala#3', 'konkani#1', 'kerala#4', 'Oriya#2', 'telugu#2', 'Assamese#1', 'tamil#4', 'english#6', 'English#6', 'bengali#1', 'Sinhala#3', 'konkani#1', 'kerala#4', 'Oriya#2', 'telugu#2', 'Assamese#1', 'tamil#4', 'konkani#1', 'punjabi#1', 'Odia#1', 'Assamese#1', 'Oriya#2', 'English#6', 'english#6', 'ENGLISH#6', 'hindi#2', 'Sinhala#3', 'vernacularization#2', 'konkani#1', 'Assamese#1', 'Oriya#2', 'tamil#4', 'Odia#1', 'bengali#1', 'ENGLISH#6', 'english#6', 'English#6', 'Odia#1', 'bengali#1', 'Oriya#2', 'Assamese#1', 'Sinhala#3', 'hindi#2', 'marathi#1', 'telugu#2', 'punjabi#1', 'vernacularization#2', 'ENGLISH#6', 'English#6', 'english#6', 'telugu#2', 'bengali#1', 'konkani#1', 'Sinhala#3', 'Oriya#2', 'malayalam#2', 'Odia#1', 'konkani#1', 'Odia#1', 'Assamese#1', 'marathi#1', 'telugu#2', 'Sinhala#3', 'hindi#2', 'bengali#1'

jacky - []
brownie - ['Brownie#3', 'hobbit#2', 'Hobbit#2', 'Gnome#2', 'GNOME#2', 'gnome#2', 'clod#4', 'yeti#3', 'Yeti#3', 'Hobgoblin#2', 'brownie#3', 'hobbit#2', 'Hobbit#2', 'Gnome#2', 'GNOME#2', 'gnome#2', 'clod#4', 'yeti#3', 'Yeti#3', 'Hobgoblin#2', 'Hobbit#2', 'GNOME#2', 'Gnome#2', 'gnome#2', 'hobgoblin#2', 'Hobgoblin#2', 'wight#1', 'Wight#1', 'kobold#1', 'Kobold#1', 'hobbit#2', 'GNOME#2', 'Gnome#2', 'gnome#2', 'hobgoblin#2', 'Hobgoblin#2', 'wight#1', 'Wight#1', 'kobold#1', 'Kobold#1', 'GNOME#2', 'gnome#2', 'hobbit#2', 'Hobbit#2', 'yeti#3', 'Yeti#3', 'leprechaun#2', 'Leprechaun#2', 'troll#1', 'Hobgoblin#2', 'Gnome#2', 'gnome#2', 'hobbit#2', 'Hobbit#2', 'yeti#3', 'Yeti#3', 'leprechaun#2', 'Leprechaun#2', 'troll#1', 'Hobgoblin#2', 'Gnome#2', 'GNOME#2', 'Hobbit#2', 'hobbit#2', 'Yeti#3', 'yeti#3', 'leprechaun#2', 'Leprechaun#2', 'nisse#2', 'Nisse#2', 'churl#2', 'Brownie#3', 'brownie#3', 'Hobbit#2', 'hobbit#2', 'wildman#2', 'Wildman#2', 'Dweller#4', 'Leprechaun#2', 'leprechaun#2', 'Yeti#

In [20]:
word = "bit"

In [73]:
test = []

hypernymy_dict = defaultdict(lambda: defaultdict(dict))
token = nlp(str(word))[0]
synsets = token._.wordnet.synsets()
for syn in synsets:
    if syn.name().split(".n.")[0] == word and int(syn.name().split(".n.")[1]) == idx:
        match_str = "match"
        print(syn)
    else:
        match_str = "nomatch"
    
    hypernym_syn = syn.hypernyms()
    if len(hypernym_syn) >0:
        hypernym = hypernym_syn[0].name().split('.')[0]
        rev_map = {f"{word}#{idx}":syn}
        if "direct" in hypernymy_dict[hypernym]["rev_map"]:
            hypernymy_dict[hypernym]["rev_map"]["direct"].append(rev_map) 
        else:
            hypernymy_dict[hypernym]["rev_map"]["direct"] = [rev_map]

        if "sum_weight" in hypernymy_dict[hypernym]:
            hypernymy_dict[hypernym]["sum_weight"] += weights[f"direct_{match_str}"]
        else:
            hypernymy_dict[hypernym]["sum_weight"] = weights[f"direct_{match_str}"]

for w in r_senses:
    token = nlp(str(w))[0]
    synsets = token._.wordnet.synsets()
    for syn in synsets:
        if syn.name().split(".n.")[0] == word and int(syn.name().split(".n.")[1]) == idx:
            match_str = "match"
            print(syn)
        else:
            match_str = "nomatch"

        hypernym_syn = syn.hypernyms()
        if len(hypernym_syn) >0:
            hypernym = hypernym_syn[0].name().split('.')[0]
            rev_map = {f"{word}#{idx}":syn}
            if "direct" in hypernymy_dict[hypernym]["rev_map"]:
                hypernymy_dict[hypernym]["rev_map"]["l1"].append(rev_map) 
            else:
                hypernymy_dict[hypernym]["rev_map"]["l1"] = [rev_map]

            if "sum_weight" in hypernymy_dict[hypernym]:
                hypernymy_dict[hypernym]["sum_weight"] += weights[f"l1_{match_str}"]
            else:
                hypernymy_dict[hypernym]["sum_weight"] = weights[f"l1_{match_str}"]
else:
    pass

Synset('bit.n.06')


In [75]:
hypernymy_dict

defaultdict(<function __main__.<lambda>()>,
            {'small_indefinite_quantity': defaultdict(dict,
                         {'rev_map': {'direct': [{'bit#6': Synset('spot.n.10')}]},
                          'sum_weight': 5}),
             'fragment': defaultdict(dict,
                         {'rev_map': {'direct': [{'bit#6': Synset('bit.n.02')},
                            {'bit#6': Synset('snatch.n.01')}]},
                          'sum_weight': 10}),
             'time': defaultdict(dict,
                         {'rev_map': {'direct': [{'bit#6': Synset('moment.n.02')}]},
                          'sum_weight': 5}),
             'case': defaultdict(dict,
                         {'rev_map': {'direct': [{'bit#6': Synset('piece.n.05')}]},
                          'sum_weight': 5}),
             'stable_gear': defaultdict(dict,
                         {'rev_map': {'direct': [{'bit#6': Synset('bit.n.05')}]},
                          'sum_weight': 5}),
             'unit_of_mea

[Synset('indefinite_quantity.n.01')]
[Synset('part.n.03')]
[Synset('time_period.n.01')]
[Synset('happening.n.01')]
[Synset('gear.n.04')]
[Synset('definite_quantity.n.01')]
[Synset('small_indefinite_quantity.n.01')]
[Synset('piece.n.06')]
[Synset('show.n.03')]
[Synset('object.n.01')]
[Synset('tool.n.01')]


In [39]:
"x" in d["a"]["b"]

False