In [490]:
import re
import spacy 
import enchant  
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from tqdm import tqdm
import pandas as pd

# Ensure you have the wordnet data
nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")
d = enchant.Dict("en_US")

[nltk_data] Downloading package wordnet to /home/diego/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [522]:
def get_hyponyms(synset):
    hyponyms = set(synset.hyponyms())
    for hyponym in list(hyponyms):
        hyponyms |= get_hyponyms(hyponym)
    return hyponyms

def get_datapoint(category, synset):
    identifier = synset.name()
    name = identifier.split('.')[0].replace('_', ' ')
    description = f'{name}: {synset.definition()}'
    in_dict = all([d.check(w) for w in name.split()])
    datapoint = (category, identifier, name, description, in_dict)
    return datapoint

synsets = {

    'animals': [
        'placental.n.01', # mammals
        'amphibian.n.03', # amphibians
        'bird.n.01', # birds
        'fish.n.01', # fish
        'reptile.n.01' # reptiles
    ],

    # people
    'people': ['person.n.01'],

    # body parts
    'body parts': ['external_body_part.n.01'],

    # food
    'food': ['food.n.02'],

    # plants
    'plants': ['plant.n.02'],

    # nature
    'natural places': [
        'body_of_water.n.01',
        'geological_formation.n.01',
        'land.n.04'
    ],

    # man-made places
    'manmade places': [
        'building.n.01',
        'room.n.01',
        'way.n.06',
        'facility.n.01'
    ],

    # objects
    'objects': [
        'vehicle.n.01',
        'commodity.n.01',
        'instrumentality.n.03',
        'plaything.n.01',
        'article.n.02'
    ],

    # text
    'text': [
        'publication.n.01',
        'sign.n.02',
        'correspondence.n.01',
        'written_record.n.01'
    ],
}

In [523]:
datapoints = []

for cat, synset_names in tqdm(synsets.items()):

    for synset_name in synset_names:
    
        synset = wn.synset(synset_name)
        hyponyms = get_hyponyms(synset)
        hyponyms.add(synset)
        
        for hyponym in hyponyms:
            datapoint = get_datapoint(cat, hyponym)
            datapoints.append(datapoint)

df = pd.DataFrame(datapoints, columns=['category', 'identifier', 'name', 'description', 'in_dict'])

100%|██████████| 9/9 [00:00<00:00, 31.06it/s]


In [528]:
# show 20 random rows
df_in_dict = df[df.in_dict == True]

df_in_dict.to_csv('wordnet.csv', index=False)

In [532]:
df_in_dict = df_in_dict.sort_values(by='name').reset_index(drop=True)
df_in_dict.to_csv('wordnet.csv', index=False)

In [518]:
# get all rows where the word "restaurant" is in the description

df_in_dict[df_in_dict.description.str.contains('text')]


Unnamed: 0,category,identifier,name,description,in_dict
6051,people,mercer.n.01,mercer,mercer: a dealer in textiles (especially silks),True
6706,people,redact.n.01,redact,redact: someone who puts text into appropriate...,True
8090,people,decoder.n.01,decoder,decoder: the kind of intellectual who converts...,True
8312,people,tagger.n.01,tagger,tagger: someone who assigns labels to the gram...,True
8484,people,annotator.n.01,annotator,annotator: a commentator who writes notes to a...,True
8927,people,editor.n.01,editor,editor: a person responsible for the editorial...,True
9193,people,shearer.n.02,shearer,shearer: a workman who uses shears to cut leat...,True
10341,food,bologna.n.02,bologna,bologna: large smooth-textured smoked sausage ...,True
10366,food,milt.n.01,milt,milt: fish sperm or sperm-filled reproductive ...,True
10398,food,breadfruit.n.02,breadfruit,breadfruit: a large round seedless or seeded f...,True


In [436]:
def get_hyponyms(synset):
    hyponyms = set(synset.hyponyms())
    for hyponym in list(hyponyms):
        hyponyms |= get_hyponyms(hyponym)
    return hyponyms

word = 'electronic_text.n.01'
if '.' in word:
    synset = wn.synset(word)
    print(f'Description: {synset.definition()}')
    print(f'Hypernyms: {synset.hypernyms()} \n')

    print('Hyponyms:')
    hyponyms = get_hyponyms(synset)
    for hyponym in hyponyms:
        print(hyponym.name(), ': ', hyponym.definition())
else:
    synsets = np.array(wn.synsets(word, pos='n'))
    for synset in synsets:
        print('def', synset, synset.definition())
        print('hypo', synset, synset.hyponyms())
        print('hyper', synset, synset.hypernyms())

Description: text that is in a form that computer can store or display on a computer screen
Hypernyms: [Synset('text.n.01')] 

Hyponyms:
machine-displayable_text.n.01 :  electronic text that is stored and used in the form of a digital image
machine-readable_text.n.01 :  electronic text that is stored as strings of characters and that can be displayed in a variety of formats
hypertext.n.01 :  machine-readable text that is not sequential but is organized so that related items of information are connected; --Ted Nelson
