In [None]:
# Import dependencies
import csv
import random

## Removing duplicates from the second and third files

In [7]:
def remove_duplicate_rows(input_path, output_path, header):
    """
    Copy a TSV file, keeping only the first row seen for each (column 0, column 1) pair.

    The first line of the input is treated as its header and is not copied; `header` is
    written to the output in its place. Surviving rows are written unchanged and in their
    original order.

    Param:
        input_path (str): The path to the UTF-8 encoded input TSV file.
        output_path (str): The path to the TSV file that receives the unique rows.
        header (list): The column names written as the first row of the output.

    Returns:
        list: The unique rows that were written (header not included).
    """
    unique_rows = []
    seen_keys = set()

    # newline='' lets the csv module do its own line-ending handling.
    with open(input_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader, None)  # Skip the input header; an empty file simply yields no rows
        for row in reader:
            # csv.reader yields [] for blank lines; such rows have no key to compare on.
            if len(row) < 2:
                continue
            sense_lemma = (row[0], row[1])
            if sense_lemma not in seen_keys:
                seen_keys.add(sense_lemma)
                unique_rows.append(row)

    with open(output_path, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(header)  # Write the header row
        writer.writerows(unique_rows)

    return unique_rows


#Second file
unique_entries = remove_duplicate_rows(
    './data/omw-wnbahasa-deleted-indonesian.tsv',
    './data/omw-wnbahasa-deleted-cleaned.tsv',
    ['synset_old', 'lemma', 'src_old', 'usr_old'],
)

print('Duplicate entries removed and saved as ./data/omw-wnbahasa-deleted-cleaned.tsv')
print('===================')

# Third file
unique_entries = remove_duplicate_rows(
    './data/ntumc-omw-wnbahasa-used.tsv',
    './data/ntumc-omw-wnbahasa-used-cleaned.tsv',
    ['tag', 'clemma'],
)

print('Duplicate entries removed and saved as ./data/ntumc-omw-wnbahasa-used-cleaned.tsv')

Duplicate entries removed and saved as ./data/omw-wnbahasa-deleted-cleaned.tsv
Duplicate entries removed and saved as ./data/ntumc-omw-wnbahasa-used-cleaned.tsv


## Combining first, second and third files for development and evaluation sets

In [2]:
first_file = './data/omw-wn-bahasa-indonesian.tsv'
second_file = './data/omw-wnbahasa-deleted-cleaned.tsv'
third_file = './data/ntumc-omw-wnbahasa-used-cleaned.tsv'

# Fixed seed so that Restart & Run All reproduces the same development/evaluation split.
SPLIT_SEED = 42
# Fraction of the combined rows that goes to the development set; the rest is evaluation.
DEVELOPMENT_FRACTION = 0.6

used_data = []
deleted_data = []
indonesian_data = []

# The second and third files were written with csv.writer as UTF-8, so they are read back
# with csv.reader: this undoes any quoting the writer applied and keeps empty trailing
# columns, which a plain strip()/split('\t') would lose.
with open(third_file, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader, None)  # Skip the header row
    for row in reader:
        if len(row) < 2:  # Blank line
            continue
        used_data.append((row[0], row[1], 'KEEP'))

with open(second_file, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader, None)  # Skip the header row
    for row in reader:
        if len(row) < 2:  # Blank line
            continue
        deleted_data.append((row[0], row[1], 'DELETE'))

# The first file is a raw source file; assumed to be UTF-8 like the other source files.
with open(first_file, 'r', encoding='utf-8') as file:
    next(file, None)  # Skip the header row
    for line in file:
        row = line.strip().split('\t')
        if len(row) < 3:  # Blank or truncated line without a src column
            continue
        # Only synset, lemma and src are needed; any later columns are ignored.
        synset, lemma, src = row[:3]
        if src == 'ntumc':
            indonesian_data.append((synset, lemma, 'KEEP'))

combined_data = used_data + deleted_data + indonesian_data

# NOTE(review): duplicates are only removed afterwards and separately per output file, so
# the same (synset, lemma) pair can end up in both the development and the evaluation set.
# Consider de-duplicating before splitting if the two sets must be disjoint.

# One seeded shuffle of the combined list is enough to mix KEEP and DELETE rows uniformly.
rng = random.Random(SPLIT_SEED)
rng.shuffle(combined_data)

split_index = int(len(combined_data) * DEVELOPMENT_FRACTION)

# Each set starts with the header row, followed by its share of the shuffled data.
header_row = ('synset', 'lemma', 'annotation')
development_set = [header_row] + combined_data[:split_index]
evaluation_set = [header_row] + combined_data[split_index:]

# newline='' stops the csv module's '\r\n' row terminator from being translated again on
# Windows, which would otherwise leave a blank row after every record.
development_file_path = './data/development_data.tsv'
with open(development_file_path, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(development_set)
print(f'Development set saved to: {development_file_path}')

evaluation_file_path = './data/evaluation_data.tsv'
with open(evaluation_file_path, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(evaluation_set)

print(f'Evaluation set saved to: {evaluation_file_path}')

Development set saved to: ./data/development_data.tsv
Evaluation set saved to: ./data/evaluation_data.tsv


# Remove duplicates in development data and evaluation data

In [3]:
def remove_duplicates(input_file, output_file):
    """
    Function to remove duplicate rows from the input TSV file for labels with 'KEEP' annotation.

    Only the first 'KEEP' row of each (synset, lemma) pair is kept. Rows annotated 'DELETE'
    are all copied, including repeats. Rows with any other annotation are not copied.
    Blank rows are skipped. If the input file is empty, an empty output file is written.

    Param:
        input_file (str): The path to the input TSV file containing data in the format of (synset, lemma, annotation).
        output_file (str): The path to the output TSV file where the updated data will be saved.

    Returns:
        None

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    data = []
    seen_keep = set()
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        header = next(reader, None)  # None when the file has no header line at all
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. from doubled line endings).
            if not row:
                continue
            synset, lemma, annotation = row
            if annotation == 'KEEP':
                if (synset, lemma) not in seen_keep:
                    seen_keep.add((synset, lemma))
                    data.append(row)
            elif annotation == 'DELETE':
                data.append(row)

    with open(output_file, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        if header is not None:
            writer.writerow(header)
        writer.writerows(data)

    print(f'Data saved to: {output_file}')

# Each raw split file is paired with the path of its de-duplicated counterpart.
development_input_file, development_output_file = (
    './data/development_data.tsv',
    './data/development_set.tsv',
)
evaluation_input_file, evaluation_output_file = (
    './data/evaluation_data.tsv',
    './data/evaluation_set.tsv',
)

# De-duplicate the development data first, then the evaluation data.
for split_input, split_output in (
    (development_input_file, development_output_file),
    (evaluation_input_file, evaluation_output_file),
):
    remove_duplicates(split_input, split_output)


Data saved to: ./data/development_set.tsv
Data saved to: ./data/evaluation_set.tsv


# Data analysis
In this step, we analyze the goodness labels and compute other dataset statistics in order to formulate better conditions for the system.

In [4]:
def analysis(file_path):
    """
    Function to perform analysis on a dataset file.

    This function reads the TSV dataset file located at the given file_path, skips its
    header line, and prints the total number of data rows, the count of rows with 'KEEP'
    annotation, the count of rows with 'DELETE' annotation, and the number of distinct
    synsets and lemmas. Blank rows are ignored; an empty file reports zero for every figure.

    Param:
        file_path (str): The path to the dataset file, with rows of (synset, lemma, annotation).

    Returns:
        None

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    total_rows = 0
    keep_count = 0
    delete_count = 0
    synset_set = set()
    lemma_set = set()

    # The dataset files are produced with csv.writer, so they are parsed with csv.reader
    # rather than by splitting lines, which would not undo the writer's quoting.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader, None)  # Skip the header row; tolerate a completely empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            synset, lemma, annotation = row
            if annotation == 'KEEP':
                keep_count += 1
            elif annotation == 'DELETE':
                delete_count += 1
            synset_set.add(synset)
            lemma_set.add(lemma)

            total_rows += 1
    print(f'Total rows: {total_rows}')
    print(f'KEEP count: {keep_count}')
    print(f'DELETE count: {delete_count}')
    print(f'Distinct synsets: {len(synset_set)}')
    print(f'Distinct lemmas: {len(lemma_set)}')

# Report on the evaluation set first, then the development set, with a divider in between.
report_targets = [
    ('Evaluation Set', './data/evaluation_set.tsv'),
    ('Development Set', './data/development_set.tsv'),
]
for position, (title, file_path) in enumerate(report_targets):
    if position > 0:
        print('======================================================================')
    print(title)
    analysis(file_path)


Evaluation Set
Total rows: 4924
KEEP count: 4097
DELETE count: 827
Distinct synsets: 3573
Distinct lemmas: 3446
Development Set
Total rows: 7346
KEEP count: 6131
DELETE count: 1215
Distinct synsets: 4898
Distinct lemmas: 4772
