# Goodness labels extraction
In this step, we extract the goodness labels from the master file (`wn-msa-all.tab`) downloaded from the SourceForge site, keeping only the rows whose language code is either "B" or "I". The extracted data is then saved as a TSV file.

In [77]:
# Import dependencies
import csv

In [78]:
# Source file, destination file, and the filter applied while copying rows.
input_filename = './data/wn-msa-all.tab'
output_filename = './data/wn_msa_data.tsv'
indices_to_extract = [0, 2, 3]
allowed_values = ['B', 'I']

# Keep the selected columns of every row whose second column holds one of the
# allowed values.
with open(input_filename, 'r', encoding='utf-8') as input_file:
    reader = csv.reader(input_file, delimiter='\t')
    next(reader)  # the first line is skipped, never filtered or copied
    extracted_rows = [
        [record[position] for position in indices_to_extract]
        for record in reader
        if record[1] in allowed_values
    ]

# Emit a fixed header line followed by the extracted rows.
with open(output_filename, 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.writer(output_file, delimiter='\t')
    writer.writerows([['synset', 'label', 'lemma'], *extracted_rows])

print(f"Data extracted and saved as '{output_filename}'")

Data extracted and saved as './data/wn_msa_data.tsv'


# Labels Inclusion in Development and Evaluation Sets
In this step, we add the goodness labels from the "wn-msa-all.tab" data to both the development and evaluation sets, so that we can analyse the data and formulate conditions for the system. If goodness labels are available in both sets, they could be a valuable addition to the conditions.

In [81]:
# Inputs: the annotated set to extend and the extracted goodness labels.
# Point development_set_filename at the evaluation set to label that one instead.
development_set_filename = './data/development_set.tsv'
wn_msa_all_filename = './data/wn_msa_data.tsv'

# Destination for the labelled copy
output_filename = './data/development_set_with_labels.tsv'

# Build a lookup from (synset, lemma) to goodness label. The label file's
# columns are synset, label, lemma; a later duplicate pair overwrites an
# earlier one.
with open(wn_msa_all_filename, 'r', encoding='utf-8') as wn_msa_all_file:
    label_rows = csv.reader(wn_msa_all_file, delimiter='\t')
    next(label_rows)  # skip the header line
    wn_msa_all_data = {(fields[0], fields[2]): fields[1] for fields in label_rows}

# Copy the development set row by row, appending one extra column with the
# goodness label for the row's (synset, lemma) pair.
with open(development_set_filename, 'r', encoding='utf-8') as development_set_file, \
        open(output_filename, 'w', encoding='utf-8', newline='') as output_file:
    dev_rows = csv.reader(development_set_file, delimiter='\t')
    labelled_out = csv.writer(output_file, delimiter='\t')
    labelled_out.writerow(next(dev_rows) + ['goodness label'])

    for fields in dev_rows:
        # Pairs without an entry in the lookup get the literal text 'None'.
        goodness_label = wn_msa_all_data.get((fields[0], fields[1]), 'None')
        labelled_out.writerow(fields + [goodness_label])

print(f"Development set with labels saved as '{output_filename}'")

Development set with labels saved as './data/development_set_with_labels.tsv'


# Data analysis
The purpose is to count how often each goodness label occurs under each annotation (KEEP and DELETE) in the development set, in order to determine whether goodness labels would be a good addition to the conditions.

In [82]:
file_path_development = './data/development_set_with_labels.tsv'

# Per-annotation tallies keyed by goodness label. The labels listed here are
# pre-seeded so they are always printed, in this order, even when their count
# is zero.
keep_counts = {"None": 0, "Y": 0, "O": 0, "M": 0, "L": 0, "X": 0}
delete_counts = {"None": 0, "Y": 0, "O": 0, "M": 0, "L": 0, "X": 0}
counts_by_annotation = {"KEEP": keep_counts, "DELETE": delete_counts}

# The file is written as UTF-8 with csv.writer, so it is read back the same
# way instead of relying on the platform's default encoding and a manual
# split on tabs.
with open(file_path_development, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader)  # skip the header line

    # Every data row must have exactly four columns.
    for synset, lemma, annotation, goodness_label in reader:
        counts = counts_by_annotation.get(annotation)
        if counts is None:
            # Rows with any other annotation are not counted.
            continue
        # A label that was not pre-seeded is added rather than raising KeyError.
        counts[goodness_label] = counts.get(goodness_label, 0) + 1

# Print the total for each annotation followed by its per-label breakdown.
for annotation, counts in counts_by_annotation.items():
    print(annotation + ":", sum(counts.values()))
    for label, count in counts.items():
        print(label + ":", count)



KEEP: 6131
None: 1838
Y: 1066
O: 2543
M: 20
L: 192
X: 472
DELETE: 1215
None: 81
Y: 57
O: 848
M: 0
L: 2
X: 227


In [72]:
filename = './data/development_set_with_labels.tsv'

# Tallies for rows whose synset ends in '-v', split by goodness label (X or O)
# and by annotation (KEEP or DELETE).
x_v_keep = 0
x_v_delete = 0
o_v_keep = 0
o_v_delete = 0

# Read explicitly as UTF-8 (the encoding the file is written with) rather than
# the platform default.
with open(filename, "r", encoding="utf-8", newline="") as tsv_file:
    # Expected columns: synset, lemma, annotation, goodness label
    tsv_reader = csv.DictReader(tsv_file, delimiter="\t")
    for row in tsv_reader:
        label = row["annotation"]
        goodness = row['goodness label']
        synset = row['synset']

        # Only synsets ending in '-v' are of interest here.
        if not synset.endswith('-v'):
            continue

        if goodness == 'X':
            if label == "KEEP":
                x_v_keep += 1
            elif label == "DELETE":
                x_v_delete += 1
        elif goodness == 'O':
            if label == "KEEP":
                o_v_keep += 1
            elif label == "DELETE":
                o_v_delete += 1

print(x_v_keep)
print(x_v_delete)
print(o_v_keep)
print(o_v_delete)

140
108
691
639


# Count the POS type of each sense in the development set for data analysis

This is to support the formulation of condition 5 of the system.

In [1]:
filename = './data/development_set_with_labels.tsv'
suffixes = ['r', 'n', 'v', 'a']

# One tally per (annotation, final synset character): goodness label -> count.
# Labels are inserted when first seen, so each tally prints in first-seen order.
keep_counts_r = {}
keep_counts_n = {}
keep_counts_v = {}
keep_counts_a = {}
delete_counts_r = {}
delete_counts_n = {}
delete_counts_v = {}
delete_counts_a = {}

keep_counts_by_suffix = {
    'r': keep_counts_r,
    'n': keep_counts_n,
    'v': keep_counts_v,
    'a': keep_counts_a,
}
delete_counts_by_suffix = {
    'r': delete_counts_r,
    'n': delete_counts_n,
    'v': delete_counts_v,
    'a': delete_counts_a,
}
tallies = {'KEEP': keep_counts_by_suffix, 'DELETE': delete_counts_by_suffix}

# Read explicitly as UTF-8 (the encoding the file is written with) rather than
# the platform default.
with open(filename, 'r', encoding='utf-8', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    next(reader)  # skip the header line
    # Every data row must have exactly four columns.
    for synset, lemma, annotation, goodness_label in reader:
        # Rows are grouped by the last character of the synset; rows with any
        # other final character or any other annotation are not counted.
        suffix = synset[-1:]
        if annotation not in tallies or suffix not in suffixes:
            continue
        counts = tallies[annotation][suffix]
        counts[goodness_label] = counts.get(goodness_label, 0) + 1

# The suffix is printed in single quotes, so the f-strings use double quotes
# as their delimiter.
print('Keep Counts:')
for suffix in suffixes:
    for label, count in keep_counts_by_suffix[suffix].items():
        print(f"'{suffix}' {label}: {count}")

print('\nDelete Counts:')
for suffix in suffixes:
    for label, count in delete_counts_by_suffix[suffix].items():
        print(f"'{suffix}' {label}: {count}")


Keep Counts:
'r' O: 144
'r' None: 102
'r' L: 18
'r' X: 31
'r' M: 2
'r' Y: 14
'n' O: 1142
'n' None: 1055
'n' Y: 600
'n' X: 222
'n' L: 77
'n' M: 11
'v' X: 147
'v' None: 333
'v' O: 687
'v' Y: 221
'v' L: 64
'v' M: 3
'a' O: 570
'a' X: 72
'a' None: 309
'a' Y: 231
'a' L: 33
'a' M: 4

Delete Counts:
'r' O: 22
'r' X: 17
'r' None: 7
'r' Y: 1
'n' O: 153
'n' X: 87
'n' None: 41
'n' Y: 11
'v' O: 632
'v' X: 98
'v' Y: 39
'v' None: 24
'v' L: 2
'a' X: 25
'a' None: 9
'a' O: 41
'a' Y: 6


In [83]:
# Two-character synset endings that are counted, in the order they are printed.
pos_suffixes = ('-n', '-v', '-r', '-a', '-x', '-s')

# Initialize counters: one per suffix for each annotation, plus a counter for
# rows whose synset has none of the suffixes above.
pos_keep_counts = dict.fromkeys(pos_suffixes, 0)
pos_delete_counts = dict.fromkeys(pos_suffixes, 0)
count_synset_old = 0

# Read explicitly as UTF-8 (the encoding the file is written with), with
# newline='' as the csv module expects.
with open('./data/development_set_with_labels.tsv', 'r', encoding='utf-8', newline='') as tsv_file:
    reader = csv.DictReader(tsv_file, delimiter='\t')
    for row in reader:
        synset = row['synset']
        annotation = row['annotation']
        suffix = synset[-2:]
        if suffix not in pos_keep_counts:
            count_synset_old += 1
        elif annotation == 'KEEP':
            pos_keep_counts[suffix] += 1
        elif annotation == 'DELETE':
            pos_delete_counts[suffix] += 1
        # Rows with a known suffix but any other annotation are not counted.

# Keep the individual counter names available alongside the two tallies.
(count_n_keep, count_v_keep, count_r_keep,
 count_a_keep, count_x_keep, count_s_keep) = (
    pos_keep_counts[suffix] for suffix in pos_suffixes
)
(count_n_delete, count_v_delete, count_r_delete,
 count_a_delete, count_x_delete, count_s_delete) = (
    pos_delete_counts[suffix] for suffix in pos_suffixes
)

print('KEEP:')
for suffix in pos_suffixes:
    print(suffix + ':', pos_keep_counts[suffix])

print('DELETE:')
for suffix in pos_suffixes:
    print(suffix + ':', pos_delete_counts[suffix])
print('-synset_old:', count_synset_old)


KEEP:
-n: 3107
-v: 1455
-r: 311
-a: 1219
-x: 34
-s: 5
DELETE:
-n: 292
-v: 795
-r: 47
-a: 81
-x: 0
-s: 0
-synset_old: 0


In [4]:
# Two-character synset endings that are counted.
pos_suffixes = ('-n', '-v', '-r', '-a', '-x', '-s')

# Number of rows annotated DELETE with goodness label 'X', per synset ending.
delete_x_counts = dict.fromkeys(pos_suffixes, 0)

# Label values that can appear in the 'goodness label' column. Only 'X' is
# counted below; swap the equality test for `in goodness_labels` to count
# every label instead.
goodness_labels = ['Y', 'O', 'M', 'L', 'X', 'None']

# Read explicitly as UTF-8 (the encoding the file is written with), with
# newline='' as the csv module expects.
with open('./data/development_set_with_labels.tsv', 'r', encoding='utf-8', newline='') as tsv_file:
    reader = csv.DictReader(tsv_file, delimiter='\t')
    for row in reader:
        synset = row['synset']
        annotation = row['annotation']
        goodness_label = row['goodness label']

        if annotation == 'DELETE' and goodness_label == 'X':
            suffix = synset[-2:]
            # Synsets with any other ending are ignored.
            if suffix in delete_x_counts:
                delete_x_counts[suffix] += 1

# Keep the individual counter names available alongside the tally.
count_n, count_v, count_r, count_a, count_x, count_s = (
    delete_x_counts[suffix] for suffix in pos_suffixes
)

# Printed with '-a' before '-r', unlike the order of pos_suffixes.
print('-n:', count_n)
print('-v:', count_v)
print('-a:', count_a)
print('-r:', count_r)
print('-x:', count_x)
print('-s:', count_s)


-n: 87
-v: 98
-a: 25
-r: 17
-x: 0
-s: 0
