Extract information from WordNet

In [1]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# rename the folder path before running
project_folder = 'C:\\Users\\tkhishigsure\\OneDrive - The University of Melbourne\\Documents\\github\\lexical_elaboration'

In [3]:
# load words of interest
focus_list = set()
#with open(project_folder+'\\output\\bila_long_nounverbadj.csv', 'r', encoding='utf-8') as f:
with open(project_folder+'\\output\\bila_long_noun.csv', 'r', encoding='utf-8') as f:
    for line in f:
      parts = line.rstrip("\n").split(",")
      focus_list.add(parts[1])

In [4]:
total_words = len(focus_list)
present_in_wordnet = 0
absent_from_wordnet = 0

for word in focus_list:
    if wn.synsets(word):
        present_in_wordnet += 1
    else:
        absent_from_wordnet += 1

print("Total nouns in BILA:", total_words)
print("Nouns present in WordNet:", present_in_wordnet)
print("Nouns absent from WordNet:", absent_from_wordnet)

Total nouns in BILA: 11576
Nouns present in WordNet: 11442
Nouns absent from WordNet: 134


In [5]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each word in the focus list
lemmatized_words = [(word, lemmatizer.lemmatize(word)) for word in focus_list]

# Create a DataFrame
df = pd.DataFrame(lemmatized_words, columns=['original_word', 'lemmatized_word'])

Extract number of senses, compounds used in, synonyms, and cases used as a synonym

In [7]:
lemmatized_list = df['lemmatized_word']

# create a list of compounds
def get_compound_list():
    compound_list = []
    for synset in wn.all_synsets():
        for lemma in synset.lemmas():
            if '_' in lemma.name():
                parts = lemma.name().split('_')
                compound_list.append(parts)
    return compound_list
    
compound_word_list = get_compound_list()

# calculate number of senses and number of compounds for each word in focus list
sense_df = []
for word in lemmatized_list:
    nsenses = len(wn.synsets(word))
    ncompounds = sum(word in compound_parts for compound_parts in compound_word_list)
    sense_df.append({'lemmatized_word': word, 'nsenses': int(nsenses), 'ncompounds': int(ncompounds)})

sense_df = pd.DataFrame(sense_df)

In [8]:
# Left join df_unique and sense_df on the 'lemmatized_word' column
merged_df = df.merge(sense_df, on='lemmatized_word', how='left')
merged_df = merged_df.drop_duplicates()

# Display the resulting DataFrame
print(merged_df)

      original_word lemmatized_word  nsenses  ncompounds
0           lattice         lattice        3           3
1        perennials       perennial        4           4
2           interns          intern        3           1
4            galley          galley        4           4
5            corona          corona        6           1
...             ...             ...      ...         ...
17784      examples         example        6           3
17785        surges           surge        8           3
17787        rabbis           rabbi        2           0
17789         slabs            slab        1           0
17791   restitution     restitution        3           0

[11576 rows x 4 columns]


In [9]:
word_column = merged_df['lemmatized_word'] 
all_words = set(merged_df['lemmatized_word'])
synonyms_dict = {}

for word in word_column:
    synonyms = []
    for syn in wn.synsets(word, pos=wn.NOUN):
        for lemma in syn.lemmas():
            # Add synonym to the list only if it is present in word_column
            if lemma.name() in all_words and lemma.name() != word:
                synonyms.append(lemma.name())
    # Remove duplicates and sort
    synonyms = sorted(list(set(synonyms)))
    synonyms_dict[word] = synonyms


backsyn_nums = {}
for word, synonyms in synonyms_dict.items():
    for synonym in synonyms:
        if synonym in all_words:
            if synonym in backsyn_nums:
                backsyn_nums[synonym] +=1
            else:
                backsyn_nums[synonym] = 1

def printResults(merged_df, synonyms_dict, bsyn):
    with open(project_folder+'\\output\\forpreprocessing\\tokens_wn.tsv', 'w', encoding='utf-8') as f:
        f.write("original_word\tlemmatized_word\tnsenses\tncompounds\tnbacksyns\tnsynonyms\tsynonym_list\n")
        for index, item in merged_df.iterrows(): 
            nsyns = 0
            nbacksyns = 0
            syn_list = ''
            if item['lemmatized_word'] in synonyms_dict:
                nsyns = len(synonyms_dict[item['lemmatized_word']])
                syn_list = str(synonyms_dict[item['lemmatized_word']])
            if item['lemmatized_word'] in bsyn:
                nbacksyns = bsyn[item['lemmatized_word']]
            f.write(str(item['original_word'])+"\t"+str(item['lemmatized_word'])+"\t"+str(item['nsenses'])+"\t"+str(item['ncompounds'])+"\t"+str(nbacksyns)+"\t"+str(nsyns)+"\t"+syn_list+"\n")

printResults(merged_df, synonyms_dict, backsyn_nums)

In [None]:
# Set up the figure and axes
plt.figure(figsize=(10, 6))

# Create the density plot
sns.kdeplot(data=merged_df['nsenses'], shade=True)

# Add labels and title
plt.xlabel('Number of Senses')
plt.ylabel('Density')

# Show plot
plt.show()

In [38]:
def get_domain_info(word):
    synsets = wn.synsets(word)
    domains = []
    for synset in synsets:
        hypernym_paths = synset.hypernym_paths()
        print(f"Word: {word}, Synset: {synset}, Hypernym Paths: {hypernym_paths}")
        if hypernym_paths and len(hypernym_paths[0]) > 1:  # Check if the list is not empty and has more than one element
            domains.extend(hypernym_paths[0][1].lemma_names())  # Extracting domain information from hypernyms
    return list(set(domains))  # Returning unique domain information

filtered_df['domain_info'] = filtered_df['word'].apply(get_domain_info)

print(filtered_df)

Word: inaccurate, Synset: Synset('inaccurate.a.01'), Hypernym Paths: [[Synset('inaccurate.a.01')]]
Word: warranty, Synset: Synset('guarantee.n.01'), Hypernym Paths: [[Synset('entity.n.01'), Synset('abstraction.n.06'), Synset('communication.n.02'), Synset('message.n.02'), Synset('commitment.n.04'), Synset('assurance.n.02'), Synset('guarantee.n.01')]]
Word: earrings, Synset: Synset('earring.n.01'), Hypernym Paths: [[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('decoration.n.01'), Synset('adornment.n.01'), Synset('jewelry.n.01'), Synset('earring.n.01')]]
Word: gaunt, Synset: Synset('bony.s.01'), Hypernym Paths: [[Synset('bony.s.01')]]
Word: globalization, Synset: Synset('globalization.n.01'), Hypernym Paths: [[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('process.n.06'), Synset('economic_process.n.01'), Synset('globalization.n.01')]]
Word: internet, Synset: Synset('internet.n.01'), Hype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['domain_info'] = filtered_df['word'].apply(get_domain_info)
