In [2]:
import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
import pandas as pd

In [3]:
prepositions = [
    'of',
	'with',
	'at',
	'from',
	'into',
	'during',
	'including',
	'until',
	'against',
	'among',
	'throughout',
	'despite',
	'towards',
	'upon',
	'concerning',
	'to',
	'in',
	'for',
	'on',
	'by',
	'about',
	'like',
	'through',
	'over',
	'before',
	'between',
	'after',
	'since',
	'without',
	'under',
	'within',
	'along',
	'following',
	'across',
	'behind',
	'beyond',
	'plus',
	'except',
	'but',
	'up',
	'out',
	'around',
	'down',
	'off',
	'above',
	'near'
]

prepositions_wikipedia = [
	"aboard",
        "about",
        "above",
        "absent",
        "across",
        "after",
        "against",
        "along",
        "alongside",
        "amid",
        "amidst",
        "among",
        "amongst",
        "around",
        "as",
        "astride",
        "at",
        "atop",
        "before",
        "afore",
        "behind",
        "below",
        "beneath",
        "beside",
        "besides",
        "between",
        "beyond",
        "by",
        "circa",
        "despite",
        "down",
        "during",
        "except",
        "for",
        "from",
        "in",
        "inside",
        "into",
        "less",
        "like",
        "minus",
        "near",
        "nearer",
        "nearest",
        "notwithstanding",
        "of",
        "off",
        "on",
        "onto",
        "opposite",
        "outside",
        "over",
        "past",
        "per",
        "save",
        "since",
        "through",
        "throughout",
        "to",
        "toward",
        "towards",
        "under",
        "underneath",
        "until",
        "up",
        "upon",
        "upside",
        "versus",
        "via",
        "with",
        "within",
        "without",
        "worth",
        "according to",
        "adjacent to",
        "ahead of",
        "apart from",
        "as of",
        "as per",
        "as regards",
        "aside from",
        "astern of",
        "back to",
        "because of",
        "close to",
        "due to",
        "except for",
        "far from",
        "inside of",
        "instead of",
        "left of",
        "near to",
        "next to",
        "opposite of",
        "opposite to",
        "out from",
        "out of",
        "outside of",
        "owing to",
        "prior to",
        "pursuant to",
        "rather than",
        "regardless of",
        "right of",
        "subsequent to",
        "such as",
        "thanks to",
        "up to",
        "as far as",
        "as opposed to",
        "as soon as",
        "as well as",
        "at the behest of",
        "by means of",
        "by virtue of",
        "for the sake of",
        "in accordance with",
        "in addition to",
        "in case of",
        "in front of",
        "in lieu of",
        "in place of",
        "in point of",
        "in spite of",
        "on account of",
        "on behalf of",
        "on top of",
        "with regard to",
        "with respect to",
        "with a view to"
]

merge_prepositions = list(set(prepositions + prepositions_wikipedia))

In [4]:
def extract_relation(syns, method):
    return{
        lemma.name()
        for syn in syns
        for mero_hypo in getattr(syn, method)()
        for lemma in mero_hypo.lemmas()
    }
    
meronym_methods  = ['part_meronyms', 'member_meronyms', 'substance_meronyms']
holonym_methods  = ['part_holonyms', 'member_holonyms', 'substance_holonyms']

In [5]:
rows = []
mero_hypo_rel = {}
for prep in merge_prepositions:
    syns = wn.synsets(prep, pos=wn.ADV) + \
        wn.synsets(prep, pos=wn.ADJ) + \
        wn.synsets(prep, pos=wn.NOUN) + \
        wn.synsets(prep, pos=wn.VERB)
    
    lemmas = {lemma.name() for syn in syns for lemma in syn.lemmas()}
    antonym = [antonym_lem.name() for syn in syns for synlem in syn.lemmas() for antonym_lem in synlem.antonyms()  ]
    antonym = set(antonym)
    hypernym = {lem_hyper.name() for syn in syns for hyper in syn.hypernyms() for lem_hyper in hyper.lemmas()}
    hyponym = {lem_hypo.name() for syn in syns for hypo in syn.hyponyms() for lem_hypo in hypo.lemmas()}
    meronym = set().union(*(extract_relation(syns, m) for m in meronym_methods))
    holonym = set().union(*(extract_relation(syns, h) for h in holonym_methods))
    
    lemmas.discard(prep)
    
    if lemmas:
        synonym_str = ', '.join(sorted(lemmas))
    if antonym:
        antonym_str = ', '.join(sorted(antonym))
    if hypernym:    
        hypernym_str = ', '.join(sorted(hypernym))
    if hyponym:
        hyponym_str = ', '.join(sorted(hyponym))
    if meronym:
        meronym_str = ', '.join(sorted(meronym))
    if holonym:
        holonym_str = ', '.join(sorted(holonym))
        
    
    rows.append({
        'preposition': prep,
        'synonyms': synonym_str if lemmas else None,
        'antonyms': antonym if antonym else None,
        'hypernym': hypernym_str if hypernym else None,
        'hyponym': hyponym_str if hyponym else None,
        'meronym': meronym_str if meronym else None,
        'holonym': holonym_str if holonym else None,   
        
    })
    
df = pd.DataFrame(rows)
for col in df.columns:
    df[col] = df[col].replace(to_replace=r'[{}_]', value=' ', regex=True)
    

df.head(20)

Unnamed: 0,preposition,synonyms,antonyms,hypernym,hyponym,meronym,holonym
0,notwithstanding,"all the same, even so, however, nevertheless, ...",,,,,
1,inside of,,,,,,
2,for,,,,,,
3,around,"about, approximately, close to, just about, mo...",,,,,
4,as regards,,,,,,
5,out,"KO'd, away, come out, come out of the closet, ...",{safe},"break, bring out, disclose, discover, divulge,...","putout, strikeout",,
6,down,"Down, John L. H. Down, belt down, blue, bolt d...","{upwards, upwardly, upward, up}","ameliorate, amend, better, defeat, drink, eat,...","civilise, civilize, cultivate, duck down, educ...",,
7,right of,,,,,,
8,rather than,,,,,,
9,with respect to,,,,,,


In [6]:
df['antonyms'] = (
    df['antonyms']
      .fillna('')                                           
      .astype(str)                                          
      .str.replace(r'[{}_\']', ' ', regex=True)             
      .str.strip()                                          
      .replace({'': None})                                  
)
#sort df aplhabetically
df = df.sort_values(by='preposition', ascending=True)
df.head(20)

Unnamed: 0,preposition,synonyms,antonyms,hypernym,hyponym,meronym,holonym
61,aboard,"alongside, on base, on board",,,,,
65,about,"almost, approximately, around, astir, close to...",,,,,
79,above,"higher up, in a higher place, supra, to a high...",below,"section, subdivision",,,
60,absent,"absentminded, abstracted, lacking, missing, re...",present,"disappear, go away, vanish",,,
11,according to,,,,,,
64,across,"crossways, crosswise",,,,,
46,adjacent to,,,,,,
113,afore,,,,,,
30,after,"afterward, afterwards, later, later on, subseq...",,,,,
43,against,,,,,,


In [7]:
synset_word = wn.synonyms('crossway')

print(synset_word)
# find synset each synset_word
# for i in synset_word:
#     try:
#         print(wn.synsets(i))
#     except AttributeError:
#         print(f"AttributeError: {i} not found in WordNet")
#         continue
        

# print(synset)

[['carrefour', 'crossing', 'crossroad', 'intersection']]


In [8]:
syns = wn.synsets('crosswise')[0].lemmas()[0].synset().definition()
print(syns)

lying or extending across the length of a thing or in a cross direction


### Compile definition from Merriam Webster Dictionary

In [9]:
import requests
import re
from bs4 import BeautifulSoup
import glob



In [10]:
import os

def save_html_content(preposition, path_to_save = './scrap_meriam_webster/'):
    # encode spaces for URL and create filename
    prep_encoded = preposition.replace(' ', '%20')
    url = f"https://www.merriam-webster.com/dictionary/{prep_encoded}"
    
    resp = requests.get(url)
    if resp.status_code == 200:
        filename = f"{prep_encoded.replace('%20', '_')}.html"
        filepath = os.path.join(path_to_save, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(resp.text)
        print(f"Saved HTML for '{preposition}' → {filepath}")
    else:
        print(f"Failed to fetch '{preposition}' (status {resp.status_code})")

In [11]:
# for prep in merge_prepositions:
#     save_html_content(prep)

In [12]:
# Define the URL and parameters
# url = "https://www.merriam-webster.com/dictionary/"

# def get_merriam_webster_definition(word):
#     word = word.replace(" ", "%20")
#     # Make a GET request to the URL 
#     response = requests.get(url + word)
#     if response.status_code != 200:
#         return None
#     # parse it here
#     soup = BeautifulSoup(response.text, 'html.parser')
#     return get_first_definition(soup)

def load_saved_definitions(html_dir):
    definitions = {}
    # find  .html in that folder
    for filepath in glob.glob(os.path.join(html_dir, '*.html')):
        #infer word from the filename, e.g. 'until.html' -> 'until'
        name = os.path.splitext(os.path.basename(filepath))[0]
        word = name.replace('_', ' ')
        # read & parse
        with open(filepath, 'r', encoding='utf-8') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        
        # run your existing scraper logic
        definition = get_first_definition(soup, word)
        definitions[word] = definition
    
    return definitions


def get_first_definition(soup, preposition_name=None):
    phrase_span = soup.find('span', class_='drp', string=preposition_name)
    if phrase_span:
        # the definition is in the very next <div class="vg"> block
        vg = phrase_span.find_next_sibling('div', class_='vg')
        if vg:
            dt = vg.find('span', class_='dtText')
            if dt:
                # pull out the raw text, strip leading “: ”, etc.
                raw = dt.get_text(separator=' ', strip=True)
                cleaned = re.sub(r'^\s*:\s*', '', raw)
                # split off any trailing “: ” left-over
                return ' '.join(re.findall(r'\([^)]+\)|[A-Za-z]+', cleaned))

    pos_link = soup.find('a', href=re.compile(r'preposition'))
    # fall back to any unText spans
    if pos_link is None:
        unspans = soup.find_all('span', class_='unText')
        if unspans:
            text = ''.join(unspans[0].strings).strip()
            # print(text)
            return ' '.join(re.findall(r'\([^)]+\)|[A-Za-z]+', text))

    # preposition link’s entry container
    container = (
        pos_link.find_parent('div', id=re.compile(r'dictionary-entry-[0-9]+'))
        if pos_link else
        soup.find('div', id=re.compile(r'dictionary-entry-\d+'))
    )
    # grab the first <span class="dtText">
    dt_spans = container.find_all('span', class_='dtText')
    for span in dt_spans:
        # print(span.find(class_='mw_t_bc'))
        
        
        # if span.find('strong') or span.find('span', class_='text-uppercase'):
        #     continue

        # otherwise this is our “clean” definition
        # raw = span.get_text(separator=' ', strip=True)
        # raw = re.sub(r'^\s*:\s*', '', raw)           # strip the leading colon
        # words = re.findall(r'\([^)]+\)|[A-Za-z]+', raw)
        # return ' '.join(words)
        texts = [t.strip() 
         for t in span.find_all(string=True, recursive=False) 
         if t.strip()]
        result = ", ".join(texts)
        if len(result) >= 2:
            return result

    # if we got here, no clean dtText – fall back to <span class='unText'>…
    unspans = container.find_all('span', class_='unText')
    if unspans:
        raw = ''.join(unspans[0].strings).strip()
        return ' '.join(re.findall(r'\([^)]+\)|[A-Za-z]+', raw))

    # last‐ditch resort
    dt = container.find('span', class_='dtText')
    if dt:
        link = dt.find('a')
        if link:
            # e.g. “subsequent to”
            return link.get_text(strip=True)

    
    return None


def get_preposition_definitions(prepositions):
    definitions = {}
    for prep in prepositions:
        definition = get_preposition_definitions(prep)
        if definition:
            definitions[prep] = definition
        else:
            definitions[prep] = None
    return definitions

In [13]:
# df_def = get_preposition_definitions(merge_prepositions)
# df_def = pd.DataFrame(list(df_def.items()), columns=['preposition', 'definition'])

In [14]:
html_folder = './scrap_meriam_webster'

defs = load_saved_definitions(html_folder)
df_def_local = pd.DataFrame(
    list(defs.items()),
    columns=['preposition', 'definition']
)

In [15]:
# print isna values
# df_def_local[df_def_local['definition'].isna()]
df_def_local.columns

Index(['preposition', 'definition'], dtype='object')

In [16]:
df_def_local.sort_values(by='preposition', ascending=True)
df_def_local[df_def_local['preposition'].str.match('as')]

Unnamed: 0,preposition,definition
7,astride,on or above and with one leg on each side of
17,as opposed to,used to refer to something that is different f...
23,aside from,in addition to
26,as,"in the capacity, character, condition, or role of"
45,as per,in accordance with
75,as far as,with regard to
77,as soon as,immediately at or shortly after the time that
91,as well as,in addition to
96,as regards,"in regard to, with respect to"
107,as of,",, ,"


In [17]:
pd.set_option('display.max_colwidth', None)

df_def_local[df_def_local['preposition'] == 'astern of']
# df_def_local[df_def_local['preposition'].str.contains('astern of')]

Unnamed: 0,preposition,definition


In [18]:
# add definition of 'as of'
df_def_local.loc[df_def_local['preposition'] == 'as of', 'definition'] = 'used to indicate a time or date at which something begins or ends'
df_def_local[df_def_local['preposition'] == 'as of']

Unnamed: 0,preposition,definition
107,as of,used to indicate a time or date at which something begins or ends


In [19]:
#    save_html_content(prep)
# add from populated df
prep_to_append = []
for prep in df_def_local['preposition']:
    # check if prep is in df['preposition']
    if prep not in df['preposition'].values:
        prep_to_append.append(prep)
            
for prep in prep_to_append:
    save_html_content(prep, path_to_save = './scrap_meriam_webster/additional_wiki_pop/')

dfs_add = load_saved_definitions('./scrap_meriam_webster/additional_wiki_pop/')
    

In [48]:
df_def_local[df_def_local['preposition'] == 'with']

Unnamed: 0,preposition,definition
9,with,in opposition to


In [21]:
# test with one word
urrll = 'https://www.merriam-webster.com/dictionary/out'
rqg = requests.get(urrll)
sup = BeautifulSoup(rqg.text, 'html.parser')
get_first_definition(sup, 'out')

'used as a function word to indicate an outward movement'

In [22]:
# df_def_local.to_csv('./dictionaries/preposition_definitions.csv', index=False)

['from', 'one', 'side', 'the', 'opposite', 'side', 'over', 'through']

In [23]:
syns_n = wn.synsets('upward')
all_name = [lemma.name() for syn in syns_n for lemma in syn.lemmas()]
print(syns_n[0].examples())
# print(syns_n)

# for item in syns_n:
#     if item:
#         head = item.lemmas()[0].name()
#         hypos = item.synonyms()
#         print(hypos)
        
#     if hypos:
#         # take the first hyponym synset, then its first lemma
#         hypo_lemma = hypos[0].lemmas()[0].name()
#         hypos_def = hypos[0].definition()
#     else:
#         hypo_lemma = None
#         hypos_def = None


#     print(f"{head} : {hypo_lemma}, {hypos_def}")

['the cards were face upward', 'an upward stroke of the pen']


### Get semantic relations of the prepositions to define its spatial status

In [25]:
prep_list = "dictionaries/preposition_wordnet_wiki_pop.csv"
df = pd.read_csv(prep_list, index_col=0)
df.head()

Unnamed: 0,preposition,synonyms,antonyms,hypernym,hyponym,meronym,holonym
102,aboard,"alongside, on base, on board",,,,,
33,about,"almost, approximately, around, astir, close to, just about, more or less, most, near, nearly, nigh, or so, roughly, some, virtually, well-nigh",,,,,
41,above,"higher up, in a higher place, supra, to a higher place",below,"section, subdivision",,,
95,absent,"absentminded, abstracted, lacking, missing, remove, scatty, wanting",present,"disappear, go away, vanish",,,
122,according to,,,,,,


In [None]:
# fetch definition with scraping playwright of each preposition from 

In [26]:
word_synsets = wn.synsets('following')
antos = [antonym_lem.name() for syn in word_synsets for synlem in syn.lemmas() for antonym_lem in synlem.antonyms()  ]

unique_dict = {}
for word in antos:
    unique_dict[word] = unique_dict.get(word, 0) + 1

antos_set = set(antos)
print(antos_set)

{'precede', 'predate', 'leading'}


In [27]:
# antonym lemmas
for syn in wn.synsets('highland'):
    for lemma in syn.lemmas():
        ants = lemma.antonyms()
        if ants:
            first_ant = ants[0]
            print(first_ant.synset().definition(), ":", first_ant)

low level country : Lemma('lowland.n.01.lowland')
of relatively low or level country : Lemma('lowland.a.01.lowland')


### Search tokes of lexicon entry and all its definition

In [28]:
select = df[df['preposition'].str.contains('near', na=False)]
arr=[]
for idx, item in enumerate(select['hyponym']):
    if pd.notna(item):
        # arr.append(item)
        select_tokens = [token.strip() for token in item.split(',')]

select_tokens
for i in select_tokens:
    if len(wn.synsets(i)) != 0:
        hypernym_highland = wn.synsets(i)[0]
        print(i, ":", hypernym_highland.definition())

close : the temporal end; the concluding time
crowd : a large number of things or people considered together
push : the act of applying force in order to move something away


In [29]:
items = df[df['preposition'].str.contains('near', na=False)]['hyponym']
for i in items:
    if i:
        if pd.notna(i):
            select_tokens = [token.strip() for token in i.split(',')]
            for j in select_tokens:
                if len(wn.synsets(j)) != 0:
                    token_def = wn.synsets(j)[0]
                    print(j, ":", token_def.lemmas()[0], ";", token_def.definition())

                else:
                    print(j, ":", "No definition found")
    

bear down on : No definition found
bear down upon : No definition found
close : Lemma('stopping_point.n.01.stopping_point') ; the temporal end; the concluding time
crowd : Lemma('crowd.n.01.crowd') ; a large number of things or people considered together
drive up : No definition found
edge in : No definition found
edge up : No definition found
push : Lemma('push.n.01.push') ; the act of applying force in order to move something away


In [30]:
synonym = wn.synsets('near')
#flatten synonym list
synonym = [lemma.name() for syn in synonym for lemma in syn.lemmas()]
#remove duplicates
synonym = set(synonym)
#remove the word itself
synonym.discard('near')
#remove empty strings
synonym = [s for s in synonym if s]

print(synonym)

for i in synonym:
    if len(wn.synsets(i)) != 0:
        syns_synonym = wn.synsets(i)[0]
        print(i, ":", syns_synonym.definition())

['come_near', 'dear', 'close', 'cheeseparing', 'nearly', 'go_up', 'most', 'draw_near', 'good', 'skinny', 'approximate', 'nigh', 'almost', 'come_on', 'well-nigh', 'draw_close', 'about', 'penny-pinching', 'virtually', 'approach']
come_near : almost do or experience something
dear : a beloved person; used as terms of endearment
close : the temporal end; the concluding time
cheeseparing : giving or spending with reluctance
nearly : (of actions or states) slightly short of or not quite accomplished; all but
go_up : move upward
most : (superlative of `many' used with count nouns and often preceded by `the') quantifier meaning the greatest in number
draw_near : move towards
good : benefit
skinny : confidential information about a topic or person
approximate : be close or similar
nigh : not far distant in time or space or degree or circumstances
almost : (of actions or states) slightly short of or not quite accomplished; all but
come_on : appear or become visible; make a showing
well-nigh : (o

In [31]:
sentence = "The bird flew out the window"

# parse with earley parser and cfg
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V PP | V NP
    PP -> P NP
    Det -> 'The' | 'the' | 'a'
    N -> 'bird' | 'window'
    V -> 'flew'
    P -> 'in' | 'out'
""")

parser = nltk.EarleyChartParser(grammar)

# get N in the PP
parsed = False
parsed_sentence = parser.parse(sentence.split())

#access N node of tree
for tree in parsed_sentence:
    for subtree in tree.subtrees():
        if subtree.label() == 'PP':
            # get the N in the NP in PP recursively
            for sub_subtree in subtree.subtrees():
                if sub_subtree.label() == 'N':
                    print(sub_subtree.leaves())
                    parsed = True
                    break
            
            # print(subtree.leaves())
            parsed = True
            break

# for tree in parser.parse(sentence.split()):
#     parsed = True
#     print(type(tree))
#     print(tree)
#     tree.pretty_print()

# if not parsed:
#    print("The sentence could not be parsed with the given grammar.")


['window']


In [32]:
# df.to_csv('preposition_wordnet_wiki_pop.csv')