Extract information from WordNet

In [1]:
import csv
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

In [2]:
# Root folder of the project; every data path in this notebook is built from it.
# Set the LEXICAL_ELABORATION_DIR environment variable to point at your own checkout,
# or edit the fallback path below before running.
project_folder = os.environ.get(
    'LEXICAL_ELABORATION_DIR',
    'C:\\Users\\tkhishigsure\\OneDrive - The University of Melbourne\\Documents\\github\\lexical_elaboration',
)

In [3]:
# load words of interest: the second column of every row in the BILA noun file
# NOTE(review): no header row is skipped here; if the file has one, its
# second-column label is added to focus_list as well - confirm against the data.
focus_list = set()
with open(project_folder+'\\data\\biladataset\\bila_long_noun_full.csv', 'r', encoding='utf-8', newline='') as f:
    # csv.reader keeps quoted fields intact (e.g. entries that contain commas),
    # which a plain split(",") would break apart
    for row in csv.reader(f):
        # skip blank or single-column lines instead of failing with IndexError
        if len(row) > 1:
            focus_list.add(row[1])

In [4]:
# Coverage check: how many focus words have at least one WordNet synset.
# NOTE(review): wn.synsets(word) is called without a pos argument, so a word
# presumably counts as present if it has a synset of any part of speech - confirm.
total_words = len(focus_list)
present_in_wordnet = sum(1 for candidate in focus_list if wn.synsets(candidate))
absent_from_wordnet = total_words - present_in_wordnet

print("Total nouns in BILA:", total_words)
print("Nouns present in WordNet:", present_in_wordnet)
print("Nouns absent from WordNet:", absent_from_wordnet)

Total nouns in BILA: 12040
Nouns present in WordNet: 11889
Nouns absent from WordNet: 151


In [5]:
# lemmatize each word in the focus list as a noun
lemmatizer = WordNetLemmatizer()
# focus_list is a set, whose iteration order can change between interpreter runs;
# sorting makes the row order of df (and of any file written from it) reproducible
lemmatized_words = [(word, lemmatizer.lemmatize(word, pos='n')) for word in sorted(focus_list)]

# one row per original word, paired with its noun lemma
df = pd.DataFrame(lemmatized_words, columns=['original_word', 'lemmatized_word'])

Extract information about the number of senses.

In [6]:
lemmatized_list = df['lemmatized_word']

# calculate the number of WordNet senses (synsets) for each lemma
# NOTE(review): wn.synsets(lemma) is called without a pos argument, so synsets of
# every part of speech are counted although the lemmas were produced with pos='n'
# - confirm this is intended.
# Several original words can share one lemma, so look each lemma up only once;
# duplicate keys on the right-hand side would turn the merge below into a
# many-to-many join that multiplies rows.
unique_lemmas = lemmatized_list.drop_duplicates()
sense_df = pd.DataFrame({
    'lemmatized_word': unique_lemmas.to_numpy(),
    'nsenses': [len(wn.synsets(lemma)) for lemma in unique_lemmas],
})

# attach the sense count to every (original_word, lemmatized_word) row;
# validate raises MergeError if sense_df ever contains a repeated lemma
merged_df = df.merge(sense_df, on='lemmatized_word', how='left', validate='many_to_one')
merged_df = merged_df.drop_duplicates()

In [7]:
# write merged_df as a tab-separated UTF-8 file, without the DataFrame index
merged_df.to_csv(
    project_folder + '\\data\\forpreprocessing\\lemma_features.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)