In [2]:
!pip freeze > requirements.txt

In [1]:
import spacy
import os
import pandas as pd
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 

###### Constants

In [2]:
# Number of context tokens to keep around each noun; used by the extraction code below.
window = 15

In [4]:
# All input and output files live in ./data under the current working directory.
data_directory = os.path.join(os.getcwd(), "data")
corpus_fpath, out_file = (
    os.path.join(data_directory, fname)
    for fname in ("corpus.txt", "noun_context_corpus.txt")
)

In [5]:
# Read the raw corpus into memory, one list entry per line (line endings are kept).
with open(corpus_fpath) as f:
    corpus_data = list(f)


In [6]:
# Write one "<noun> - <context>" line to `out_file` for every token whose tag contains "NN".
# NOTE: this cell relies on `nlp`, the spaCy pipeline created in a later cell of
# this notebook, so that cell has to be executed first.
with open(out_file, "w") as f:
    for txt_line in corpus_data:
        # `nlp_data` is deliberately left as a global: the next cell inspects it.
        nlp_data = nlp(txt_line.replace("\n", ""))
        max_tokens = len(nlp_data)
        for i, tok in enumerate(nlp_data):
            if "NN" not in tok.tag_:
                continue
            start = max(0, i - window)
            # Slice ends are exclusive: the +1 keeps `window` tokens *after* the
            # noun, matching the `window` tokens kept before it.
            end = min(i + window + 1, max_tokens)
            # nlp_data[start:end] spans the left context, the noun itself and the right context.
            noun_in_context = " ".join(t.text for t in nlp_data[start:end])
            f.write(f"{tok.text} - {noun_in_context}\n")

In [7]:
# Spot-check the third token of `nlp_data`, i.e. of the last document assigned by the loop above.
nlp_data[2]

I

In [8]:
def prepare_corpus_data_with_context(corpus_filepath, window=5, force_update=False):
    """Build (or load from cache) one context string per noun found in the corpus.

    Every token whose ``tag_`` contains "NN" yields an entry of the form
    ``"<noun> - left tokens <noun> right tokens"`` with up to ``window`` tokens
    on each side of the noun. The entries are cached, one per line, in
    ``window_{window}_context_corpus.txt`` in the same directory as the corpus.

    Args:
        corpus_filepath: Path to the corpus text file; each line is processed
            as a separate document.
        window: Maximum number of tokens kept on each side of the noun.
        force_update: When True, regenerate the cache file even if it exists.

    Returns:
        list[str]: The context entries, without trailing newlines.
    """
    out_file = os.path.join(
        os.path.dirname(corpus_filepath), f"window_{window}_context_corpus.txt"
    )

    if os.path.exists(out_file) and not force_update:
        print("Found preexisting file. Loading context data from file")
        with open(out_file, "r", encoding="utf-8") as f:
            # Drop line endings so cached results match freshly generated ones.
            return [line.rstrip("\n") for line in f]

    print("No pre created corpus found, or force update flag is true. Generating contextual data from corpus")
    # The model is only needed when generating, so the cached path stays cheap.
    nlp = spacy.load('en')
    contextual_data = []
    # The corpus is opened first so a missing corpus fails before the cache file is truncated.
    with open(corpus_filepath, "r", encoding="utf-8") as corpus, \
            open(out_file, "w", encoding="utf-8") as f:
        for txt_line in corpus:
            nlp_data = nlp(txt_line.replace("\n", ""))
            max_tokens = len(nlp_data)
            for i, tok in enumerate(nlp_data):
                if "NN" not in tok.tag_:
                    continue
                start = max(0, i - window)
                # Slice ends are exclusive: +1 keeps `window` tokens after the
                # noun, symmetric with the `window` tokens kept before it.
                end = min(i + window + 1, max_tokens)
                marked_noun = f"<{tok.text}>"
                left_context = [t.text for t in nlp_data[start:i]]
                right_context = [t.text for t in nlp_data[i + 1:end]]
                noun_in_context = (
                    f"{marked_noun} - {' '.join(left_context + [marked_noun] + right_context)}"
                )
                contextual_data.append(noun_in_context)
                f.write(noun_in_context + "\n")
    return contextual_data


In [9]:
# force_update=True makes the function regenerate the context file even when one already exists.
contextual_data = prepare_corpus_data_with_context(
    corpus_filepath=corpus_fpath,
    window=window,
    force_update=True,
)

No pre created corpus found, or force update flag is true. Generating contextual data from corpus


In [10]:
# Display the generated context entries (the whole list is echoed as the cell output).
contextual_data

['<english> - can not stop listening to what have the <english> by making my saturday hangover that little bit brighter',
 '<saturday> - can not stop listening to what have the english by making my <saturday> hangover that little bit brighter',
 '<bit> - not stop listening to what have the english by making my saturday hangover that little <bit> brighter',
 '<one> - the <one> there is amazing had such a laugh when I went the other week',
 '<laugh> - the one there is amazing had such a <laugh> when I went the other week',
 '<week> - the one there is amazing had such a laugh when I went the other <week>',
 '<ohgod> - <ohgod>',
 '<bentley> - <bentley> fuck off haha d I have dainty feet',
 '<dainty> - bentley fuck off haha d I have <dainty> feet',
 '<feet> - bentley fuck off haha d I have dainty <feet>',
 '<drinking> - alright well let you off just had an impromptu <drinking> night myselfhehe Smile',
 '<night> - alright well let you off just had an impromptu drinking <night> myselfhehe Smi

In [32]:
def get_sense_group_from_corpus(context_data, wsd_model, sense_vectors, output_file):
    """Disambiguate every context row and save the resulting senses as CSV.

    Args:
        context_data: Iterable of strings shaped ``"word - context"``; a
            trailing newline on a row is ignored.
        wsd_model: Object exposing ``disambiguate(context, word)`` that returns
            a ``(sense_id, sense_probabilities)`` pair, where ``sense_id`` looks
            like ``"name#number"``.
        sense_vectors: Object whose ``wv.most_similar(sense_id)`` returns
            ``(sense_id, similarity)`` pairs.
        output_file: Path of the CSV file to write (without the index column).

    Returns:
        pandas.DataFrame: One row per input row with the columns ``word``,
        ``context``, ``sense_id``, ``sense_group_name``, ``sense_group_num``,
        ``sense_probability`` and ``related_senses`` (first-level neighbours
        followed by second-level neighbours).

    Raises:
        ValueError: If a row has no ``" - "`` separator or a sense id has no ``"#"``.
    """
    columns = [
        "word",
        "context",
        "sense_id",
        "sense_group_name",
        "sense_group_num",
        "sense_probability",
        "related_senses",
    ]
    records = []

    for row in context_data:
        # Split on the first separator only: the context itself may contain " - ".
        word, separator, ctx = row.rstrip("\n").partition(" - ")
        if not separator:
            raise ValueError(f"Malformed context row (missing ' - ' separator): {row!r}")

        # NOTE(review): rows built by prepare_corpus_data_with_context carry the
        # target as "<word>"; confirm whether wsd_model expects the markers or
        # the bare token.
        sense_id, sense_probs = wsd_model.disambiguate(ctx, word)
        # Split on the last "#" so words that contain "#" themselves keep their name intact.
        sense_group_name, sense_group_num = sense_id.rsplit("#", 1)

        related_senses = [
            r_senseid for r_senseid, _ in sense_vectors.wv.most_similar(sense_id)
        ]
        # Second-level neighbours: the neighbours of every first-level neighbour.
        related_senses_l2 = [
            r_senseid
            for related_sense in related_senses
            for r_senseid, _ in sense_vectors.wv.most_similar(related_sense)
        ]

        records.append({
            "word": word,
            "context": ctx,
            "sense_id": sense_id,
            "sense_group_name": sense_group_name,
            "sense_group_num": sense_group_num,
            "sense_probability": max(sense_probs),
            "related_senses": related_senses + related_senses_l2,
        })

    # Passing `columns` keeps the header stable even when there are no rows.
    output_df = pd.DataFrame(records, columns=columns)
    output_df.to_csv(output_file, index=False)

    return output_df

In [2]:
import nltk
# Fetch the WordNet corpus data for nltk (reported as already up to date in the recorded run).
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/copa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the English spaCy model and register the WordNet annotator right after the tagger,
# which is what the `token._.wordnet` lookups further down rely on.
# NOTE(review): the "en" shortcut and passing a component instance to add_pipe
# look like the spaCy 2.x API - confirm against the pinned spaCy version.
nlp = spacy.load("en")
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

In [5]:
# Run a sample word through the pipeline and keep its first token for the WordNet lookups below.
token = nlp('prices')[0]

In [16]:
# Hyponyms of the first synset returned for the sample token.
token._.wordnet.synsets()[0].hyponyms()

[Synset('assessment.n.03'),
 Synset('average_cost.n.01'),
 Synset('expensiveness.n.01'),
 Synset('inexpensiveness.n.01'),
 Synset('marginal_cost.n.01')]

In [22]:
# Part meronyms of the first synset returned for the sample token (empty in the recorded output).
token._.wordnet.synsets()[0].part_meronyms()  

[]

#### Algorithm

##### For hypernym extraction
1. Create an empty dict of hypernyms with the following structure:
    ```
    {
	"word_num": [{
		"hypernym_1": {
			"rev_map": {
				"direct": [{
					"sense_id": synset_object
				}],
				"L1": [{}],
				"L2": [{}]
			},
			"weight": num
		},
		"hypernym_2": {}
	}]}
    ```
- For each word in the data:
    - Extract the hypernyms from the synset mapping for word#number - highest weight (8).
    - Extract the hypernyms from all other synset mappings - weight (5).
    - For each level-1 related word:
        - Extract the hypernyms for word#number - weight (3).
        - Extract the hypernyms for all other synsets - weight (2).
    - For each level-2 related word:
        - Extract the hypernyms for word#number - weight (1).
        - Extract the hypernyms for all other synsets - weight (0.5).
    - Keep adding each hypernym's weight to the matching key.
    - Identify the right hypernym: the one with the highest weight that is also present in the data; otherwise the next highest, and so on.
    - Maintain a reverse map for each hypernym (used for the hyponymy calculation).

##### For hyponymy
1. Initialize a dict of hyponyms - ```{word: [hyponym1, hyponym2, ...]}```
- Use the `rev_map` created in the previous step to identify the hyponyms of the noun under consideration.
- For each entry in `rev_map`, get the hyponyms from the synset object, maintaining the direct, L1, L2 structure.
- Extract and preserve the hyponyms that exist in the data.

##### For meronymy
1. Use the hypernymy-hyponymy relations to create a tree structure.
2. For each level of the tree, identify the meronyms of each node and preserve the common meronyms that are also present in the data.