# Dataset labeller

This tool supports the manual annotation of 512-token text chunks and of individual sentences within a paper.

## Labelling Guidelines
Label a text as 1 (Yes) only if it does at least one of the following:
- Includes specific terms like "Delta variant" or any specific variant name
- Includes terms like "mutation", "viral variant", "strain", "variant of concern" and "genetic variant"
- Describes characteristics, behaviours, or impacts of the variants, even if not named explicitly
- Compares different variants
- Discusses genetic mutations or sequences related to viral variants
- Discusses spread, transmission rates or infection rates associated with specific variants
- Discusses how variants affect vaccine efficacy
- Describes changes in symptom severity due to variants
- Discusses health outcomes associated with variants
- Discusses public health measures or responses tailored to specific variants
- Discusses time and place of variant emergence and spread


In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize

In [1]:
# Label each text in dataframe
def get_label(df, column):
    """Interactively collect a 0/1 label for every entry of ``df[column]``.

    Each text is printed, then the annotator is prompted until they enter
    ``0`` or ``1``. Anything else (another number, a typo, an empty line)
    triggers a new prompt instead of aborting the session.

    Args:
        df: DataFrame holding the texts to label. It is modified in place:
            a ``'label'`` column is added (or overwritten).
        column: Name of the column in ``df`` that contains the texts.

    Returns:
        The same ``df`` object, now carrying the ``'label'`` column.
    """
    labels = []
    for text in df[column]:
        print(text)

        while True:
            answer = input('Does this relate to viral variants? (0 = No, 1 = Yes)')
            try:
                label = int(answer)
            except ValueError:
                # A non-numeric entry must not crash the session and throw
                # away the labels already entered; treat it as invalid.
                label = None
            if label in (0, 1):
                break
            print("Please enter 0 or 1.")

        labels.append(label)
        print(" ")
    df['label'] = labels
    return df

# Labelling 512-token chunks
Label all chunks in the dataset. The dataset is divided into 30 splits, and progress is saved to a CSV file after each split is labelled.

In [None]:
# LABEL CHUNKS
# Load the chunks to label; the texts are read from the "text" column.
df = pd.read_csv("chunks_dataset.csv")
# text = df.sample(n=3)

# Split the row positions (rather than the DataFrame itself) into 30
# near-equal groups. This avoids passing a DataFrame to np.array_split,
# which depends on the deprecated DataFrame.swapaxes in recent pandas.
position_groups = np.array_split(np.arange(len(df)), 30)

# Start from an empty frame so that "text" and "label" are the leading
# columns of the combined result.
labelled_parts = [pd.DataFrame(columns=["text", "label"])]

# Iterate through each split
for i, positions in enumerate(position_groups):
    # To resume an interrupted session, skip the splits already saved:
    # if i < 3:
    #     continue
    print("Current split " + str(i))
    # Work on an independent copy so get_label can add its 'label' column
    # without touching (or warning about) the source DataFrame.
    split = df.iloc[positions].copy()
    labelled_chunks = get_label(split, 'text')
    # Checkpoint: one CSV per split, written once the split is fully labelled.
    filename = str(i) + "_chunks_labelled.csv"
    # labelled_chunks["position"] = i
    labelled_chunks.to_csv(filename)
    print("Finished split " + str(i))
    print("============================================================")
    labelled_parts.append(labelled_chunks)

# Combine every labelled split in a single concat instead of re-copying the
# growing result on each iteration.
result = pd.concat(labelled_parts)
result.to_csv("labelled_chunks.csv")
# print("Finished labelling " + str(text_index) + " papers.")
# print(labelled_chunks)

# Labelling sentences within paper

In [None]:
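# NOTE: This entire cell is commented out (disabled). Uncomment it to label
# individual sentences within each paper.
# # Load data and create example text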
# df = pd.read_csv('bn_pub_dataset_3.csv')

# # Check class distribution
# print(df['label'].value_counts())

# # Balance classes if needed
# df = df.groupby('label').sample(n=min(df['label'].value_counts()), random_state=42)

# # Shuffle the dataset
# df = df.sample(frac=1, random_state=42)
# df = df[["text", "label"]]

# subsample = df.sample(n=15, random_state=42)

# # LABEL SENTENCES
# # Obtain list of sentences 
# text = subsample["text"]

# # Testing with simple test case
# # example = ["hisi is cool. But I think I am cooler.", "www.wikipedia.com contains a lot of interesting information. John should definitely visit it."]
# # text = pd.DataFrame(example, columns=['text'])
# # text = text["text"]

# text_index = 0

# # Iterate through each paper
# for i in text:

#     if text_index < 2:
#         text_index += 1
#         continue
    
#     # Split text into sentences
#     sentences = sent_tokenize(i)

#     # For test purposes ONLY
#     # sentences = sentences[:100]

#     # Create dataframe containing all sentences in text
#     sentences_df = pd.DataFrame(sentences, columns=['text'])

#     # Split dataframe into smaller dataframes to work with
#     sentences_df_split = np.array_split(sentences_df,10)

#     # Instantiate dataframe
#     result = pd.DataFrame(columns=["text", "label"])

#     # Save dataframe for current paper
#     filename = str(text_index) + "_sentence_labelled.csv"
    
#     # Iterate through the split
#     for split in sentences_df_split:
#         # Manually label each sentence in the split
#         labelled_split = get_label(split, 'text')

#         # Combine labelled_split with result
#         result = pd.concat([result, labelled_split])

#         # Save current results (checkpoint)
#         result.to_csv(filename)
        
#     # Manually label each sentence
#     # sentences_df = get_label(sentences_df, 'text')

#     # Save dataframe for current paper
#     filename = str(text_index) + "_sentence_labelled.csv"
#     # sentences_df.to_csv(filename)

#     # Point to next paper
#     text_index += 1

#     print("Moving on to paper #" + str(text_index) + "\n")

# print("Finished labelling " + str(text_index) + " papers.")