# Create data with shorter text

In [1]:
import glob
import os
import pandas as pd
import xml.etree.ElementTree as ET

# configuration
annotation_path = 'distribution/annotation_files/'
raw_data_path = 'distribution/raw_abstract_txt/'
############################################

domain_name = 'Computational Linguistics'
pattern = os.path.join(annotation_path, '*[0-9]*')

pd_ds = pd.DataFrame(columns=['doc_id', 'doc_segment_id', 'annotator_id', 'text', 'label', 'category', 'domain'])


for path in glob.glob(pattern):
    annotator_id = int(os.path.basename(path)[-1])
    for doc in os.listdir(path):
        pattern = os.path.join(path, doc, '*.xml')
        for file in glob.glob(pattern):
            abstract_file_path = file.replace(os.path.join('annotation_files', f'annotator{annotator_id}'), 'raw_abstract_txt')
            abstract_tree = ET.parse(abstract_file_path)
            abstract_root = abstract_tree.getroot()
            abstract_texts = list(map(lambda x: x.text, abstract_root.findall('.//S')))

            tree = ET.parse(file)
            root = tree.getroot()
            annotation_texts = list(map(lambda x: x.text, root.findall('.//S')))
            if len(abstract_texts) != len(annotation_texts):
                raise ValueError(f'Length of abstract_texts and annotation_texts are different: {len(abstract_texts)} != {len(annotation_texts)}')
            
            doc_segment_id = os.path.basename(file).split('.')[0]

            for i in root.iter():
                if i.tag == 'S':
                    text = abstract_texts.pop(0)
    
                if i.tag == 'term':
                    category = i.attrib['class']
                    term = i.text
                    pd_ds.loc[len(pd_ds)] = [doc, doc_segment_id, annotator_id, text, term, category, domain_name]

pd_ds.head(10)


Unnamed: 0,doc_id,doc_segment_id,annotator_id,text,label,category,domain
0,87,P87-1022_abstr,2,In this paper we present a formalization of th...,centering approach,tech,Computational Linguistics
1,87,P87-1022_abstr,2,In this paper we present a formalization of th...,attentional structure,other,Computational Linguistics
2,87,P87-1022_abstr,2,In this paper we present a formalization of th...,discourse,other,Computational Linguistics
3,87,P87-1022_abstr,2,In this paper we present a formalization of th...,discourse context,other,Computational Linguistics
4,87,P87-1022_abstr,2,In this paper we present a formalization of th...,pronouns,other,Computational Linguistics
5,87,P87-1022_abstr,2,"As described in [GJW86], the process of center...",attention,other,Computational Linguistics
6,87,P87-1022_abstr,2,"As described in [GJW86], the process of center...",entities,other,Computational Linguistics
7,87,P87-1022_abstr,2,"As described in [GJW86], the process of center...",discourse,other,Computational Linguistics
8,87,P87-1022_abstr,2,"As described in [GJW86], the process of center...",intersentential transitional states,other,Computational Linguistics
9,87,P87-1022_abstr,2,"As described in [GJW86], the process of center...",continuing,other,Computational Linguistics


In [2]:
delimiter = ';;; '
pd.set_option('display.max_colwidth', 500)
grouped_df = pd_ds.groupby(['text', 'annotator_id']).agg({
    'domain': 'first',
    'doc_id': 'first',
    'doc_segment_id': 'first',
    'category': delimiter.join,  # Assuming you want unique categories joined by space
    'label': delimiter.join
}).reset_index()

grouped_df['category'] = grouped_df['category'].apply(lambda x: x.split(delimiter))
grouped_df['label'] = grouped_df['label'].apply(lambda x: x.split(delimiter))
grouped_df.head(10)

Unnamed: 0,text,annotator_id,domain,doc_id,doc_segment_id,category,label
0,"(Ramshaw and Marcus, 1995) have introduced a convenient data representation for chunking by converting it to a tagging task.",2,Computational Linguistics,99,E99-1023_abstr,"[tech, tech, tech]","[data representation, chunking, tagging task]"
1,(ii) High quality translation via word sense disambiguation and accurate word order generation of the target language.,1,Computational Linguistics,1,H01-1041_abstr,"[tech, tech, tech, other]","[translation, word sense disambiguation, word order generation, target language]"
2,(ii) High quality translation via word sense disambiguation and accurate word order generation of the target language.,2,Computational Linguistics,1,H01-1041_abstr,"[tech, tech, tech, other]","[translation, word sense disambiguation, word order generation, target language]"
3,(iii) Rapid system development and porting to new domains via knowledge-based automated acquisition of grammars.,1,Computational Linguistics,1,H01-1041_abstr,"[tech, other, tech]","[Rapid system development, domains, knowledge-based automated acquisition of grammars]"
4,(iii) Rapid system development and porting to new domains via knowledge-based automated acquisition of grammars.,2,Computational Linguistics,1,H01-1041_abstr,"[tech, tech, tech]","[system development, porting to new domains, knowledge-based automated acquisition of grammars]"
5,/Soames 1979/ provides some counterexamples to the theory of natural language presuppositions that is presented in /Gazdar 1979/.,1,Computational Linguistics,88,C88-2086_abstr,[other],[theory of natural language presuppositions]
6,/Soames 1979/ provides some counterexamples to the theory of natural language presuppositions that is presented in /Gazdar 1979/.,2,Computational Linguistics,88,C88-2086_abstr,[other],[natural language presuppositions]
7,A 'lexicalized' grammar naturally follows from the extended domain of locality of TAGs.,2,Computational Linguistics,89,H89-1036_abstr,"[lr, other, tech]","['lexicalized' grammar, extended domain of locality, TAGs]"
8,A central problem of word sense disambiguation (WSD) is the lack of manually sense-tagged data required for supervised learning.,1,Computational Linguistics,3,P03-1058_abstr,"[tech, lr, tech]","[word sense disambiguation (WSD), manually sense-tagged data, supervised learning]"
9,A central problem of word sense disambiguation (WSD) is the lack of manually sense-tagged data required for supervised learning.,2,Computational Linguistics,3,P03-1058_abstr,"[tech, lr, tech]","[word sense disambiguation (WSD), manually sense-tagged data, supervised learning]"


In [3]:
annotator_1_df = grouped_df[grouped_df['annotator_id'] == 1].reset_index()
annotator_2_df = grouped_df[grouped_df['annotator_id'] == 2].reset_index()

In [4]:
print(len(annotator_1_df))
print(len(annotator_2_df))

870
1242


In [5]:
indexes = []
for i, annotator_1_df_text, annotator_1_df_terms in zip(annotator_1_df.index, annotator_1_df['text'], annotator_1_df['label']):
    for annotator_2_df_text, annotator_2_df_terms in zip(annotator_2_df['text'], annotator_2_df['label']):
        if annotator_1_df_terms == annotator_2_df_terms and annotator_1_df_text == annotator_2_df_text:
            indexes.append(i)
            
agreed_df = annotator_1_df.iloc[indexes]
agreed_df.head(10)

Unnamed: 0,index,text,annotator_id,domain,doc_id,doc_segment_id,category,label
0,1,(ii) High quality translation via word sense disambiguation and accurate word order generation of the target language.,1,Computational Linguistics,1,H01-1041_abstr,"[tech, tech, tech, other]","[translation, word sense disambiguation, word order generation, target language]"
3,8,A central problem of word sense disambiguation (WSD) is the lack of manually sense-tagged data required for supervised learning.,1,Computational Linguistics,3,P03-1058_abstr,"[tech, lr, tech]","[word sense disambiguation (WSD), manually sense-tagged data, supervised learning]"
4,11,A customized interface for browsing and editing was also designed and implemented.,1,Computational Linguistics,90,J90-3002_abstr,[tech],[interface]
7,21,"A grammar model for concurrent, object-oriented natural language parsing is introduced.",1,Computational Linguistics,94,C94-1061_abstr,"[model, tech]","[grammar model, concurrent, object-oriented natural language parsing]"
8,24,A language learning experiment showed that assessors can differentiate native from non-native language essays in less than 100 words.,1,Computational Linguistics,1,H01-1042_abstr,"[other, other, other, other]","[language learning experiment, assessors, native from non-native language essays, words]"
10,30,A method for producing such phrases from a word-aligned corpora is proposed.,1,Computational Linguistics,5,H05-1095_abstr,"[other, lr]","[phrases, word-aligned corpora]"
12,39,"A new, flexible inference method for Horn logic program is proposed.",1,Computational Linguistics,94,C94-1077_abstr,"[tech, other]","[inference method, Horn logic program]"
13,41,A novel bootstrapping approach to Named Entity (NE) tagging using concept-based seeds and successive learners is presented.,1,Computational Linguistics,3,N03-2025_abstr,"[tech, tech, other, tech]","[bootstrapping approach, Named Entity (NE) tagging, concept-based seeds, successive learners]"
15,45,"A novel method for adding linguistic annotation to corpora is presented which involves using a statistical POS tagger in conjunction with unsupervised structure finding methods to derive notions of noun group, verb group, and so on which is inherently extensible to more sophisticated annotation, and does not require a pre-tagged corpus to fit.",1,Computational Linguistics,94,A94-1011_abstr,"[other, lr, tech, tech, other, other, other, lr]","[linguistic annotation, corpora, statistical POS tagger, unsupervised structure finding methods, noun group, verb group, annotation, pre-tagged corpus]"
17,51,A pilot system has shown great effectiveness of this approach.,1,Computational Linguistics,92,C92-3165_abstr,[other],[pilot system]


In [6]:
# Select indexes of the rows that are not in the agreed_df
exclude_indexes = [i for i in range(len(annotator_1_df)) if i not in indexes]
only_in_annotator_1_df = annotator_1_df.iloc[exclude_indexes]
only_in_annotator_1_df.head(10)

Unnamed: 0,index,text,annotator_id,domain,doc_id,doc_segment_id,category,label
1,3,(iii) Rapid system development and porting to new domains via knowledge-based automated acquisition of grammars.,1,Computational Linguistics,1,H01-1041_abstr,"[tech, other, tech]","[Rapid system development, domains, knowledge-based automated acquisition of grammars]"
2,5,/Soames 1979/ provides some counterexamples to the theory of natural language presuppositions that is presented in /Gazdar 1979/.,1,Computational Linguistics,88,C88-2086_abstr,[other],[theory of natural language presuppositions]
5,15,A dialogue acquisition and tracking algorithm is presented along with a description of its implementation in a voice interactive system.,1,Computational Linguistics,86,J86-1002_abstr,"[tech, other, tech]","[dialogue acquisition and tracking algorithm, implementation, voice interactive system]"
6,18,"A further reduction in the search space is achieved by using semantic rather than syntactic categories on the terminal and non-terminal edges, thereby reducing the amount of ambiguity and thus the number of edges, since only edges with a valid semantic interpretation are ever introduced.",1,Computational Linguistics,92,A92-1027_abstr,"[other, other, other, other, other, other, other, other]","[reduction in the search space, semantic, syntactic categories, terminal and non-terminal edges, ambiguity, edges, edges, semantic]"
9,28,A method for error correction of ill-formed input is described that acquires dialogue patterns in typical usage and uses these patterns to predict new inputs.,1,Computational Linguistics,86,J86-1002_abstr,"[tech, other, other, other]","[error correction, ill-formed input, dialogue patterns, patterns]"
11,35,A new approach for Interactive Machine Translation where the author interacts during the creation or the modification of the document is proposed.,1,Computational Linguistics,88,C88-2160_abstr,"[tech, other, other]","[Interactive Machine Translation, author, document]"
14,43,"A novel evaluation scheme is proposed which accounts for the effect of polysemy on the clusters, offering us a good insight into the potential and limitations of semantically classifying undisambiguated SCF data.",1,Computational Linguistics,3,P03-1009_abstr,"[tech, other, other, tech, other]","[evaluation scheme, polysemy, clusters, semantically classifying, undisambiguated SCF data]"
16,48,A parser incorporating the control structure and the parsing strategies is currently under implementation.,1,Computational Linguistics,84,P84-1047_abstr,"[tech, other, other, other]","[parser, control structure, parsing strategies, implementation]"
20,61,A research program is described in which a particular representational format for meaning is tested as broadly as possible.,1,Computational Linguistics,78,T78-1001_abstr,[other],[representational format for meaning]
22,67,A series of tests are described that show the power of the error correction methodology when stereotypic dialogue occurs.,1,Computational Linguistics,86,J86-1002_abstr,"[tech, other]","[error correction methodology, stereotypic dialogue]"


In [7]:
annotator_1_df_only_doc_id = annotator_1_df['doc_id'].unique()
only_in_annotator_2_df = annotator_2_df[[doc_id not in annotator_1_df_only_doc_id for doc_id in annotator_2_df['doc_id']]]
only_in_annotator_2_df.head(10)

Unnamed: 0,index,text,annotator_id,domain,doc_id,doc_segment_id,category,label
0,0,"(Ramshaw and Marcus, 1995) have introduced a convenient data representation for chunking by converting it to a tagging task.",2,Computational Linguistics,99,E99-1023_abstr,"[tech, tech, tech]","[data representation, chunking, tagging task]"
4,7,A 'lexicalized' grammar naturally follows from the extended domain of locality of TAGs.,2,Computational Linguistics,89,H89-1036_abstr,"[lr, other, tech]","['lexicalized' grammar, extended domain of locality, TAGs]"
6,10,"A construction-specific approach also aids in task-specific language development by allowing a language definition that is natural in terms of the task domain to be interpreted directly without compilation into a uniform grammar formalism, thus greatly speeding the testing of changes to the language definition.",2,Computational Linguistics,81,P81-1033_abstr,"[tech, model, other, other, tech, model]","[language development, language definition, task domain, uniform grammar formalism, testing, language definition]"
8,13,"A declarative formalism is presented which permits direct mappings of one feature structure into another, and illustrative examples are given of its application to areas of current interest.",2,Computational Linguistics,91,E91-1050_abstr,"[other, other, tech]","[declarative formalism, mappings, feature structure]"
9,14,"A demonstration (in UNIX) for Applied Natural Language Processing emphasizes components put to novel technical uses in intelligent computer-assisted morphological analysis (ICALL), including disambiguated morphological analysis and lemmatized indexing for an aligned bilingual corpus of word examples.",2,Computational Linguistics,97,A97-1020_abstr,"[other, tech, tech, tech, tech, lr]","[UNIX, Applied Natural Language Processing, intelligent computer-assisted morphological analysis (ICALL), disambiguated morphological analysis, lemmatized indexing, aligned bilingual corpus of word examples]"
11,17,"A flexible parser can deal with input that deviates from its grammar, in addition to input that conforms to it.",2,Computational Linguistics,81,P81-1033_abstr,"[tech, other, lr, other]","[parser, input, grammar, input]"
13,20,A general parsing strategy for 'lexicalized' grammars is discussed.,2,Computational Linguistics,89,H89-1036_abstr,"[tech, lr]","[parsing strategy, 'lexicalized' grammars]"
15,23,A grammar of this form will be said to be 'lexicalized'.,2,Computational Linguistics,89,H89-1036_abstr,[lr],[grammar]
17,26,A lexicons for French is also being developed.,2,Computational Linguistics,89,H89-1036_abstr,"[lr, other]","[lexicons, French]"
18,27,"A major concern in corpus based approaches is that the applicability of the acquired knowledge may be limited by some feature of the corpus, in particular, the notion of text 'domain'.",2,Computational Linguistics,97,A97-1015_abstr,"[tech, other, other, lr, other]","[corpus based approaches, knowledge, feature, corpus, text 'domain']"


In [8]:
# Check the number of the rows in each dataframe
len_agreed_df = len(agreed_df)
len_only_in_annotator_1_df = len(only_in_annotator_1_df)
len_only_in_annotator_2_df = len(only_in_annotator_2_df)
len_total_df = len_agreed_df + len_only_in_annotator_1_df + len_only_in_annotator_2_df

print(f'Number of the rows in the agreed_df: {len_agreed_df}')
print(f'Number of the rows in the only_in_annotator_1_df: {len_only_in_annotator_1_df}')
print(f'Number of the rows in the only_in_annotator_2_df: {len_only_in_annotator_2_df}')

print(f'Ratio of the agreed_df: {len_agreed_df / (len_total_df)}')

Number of the rows in the agreed_df: 367
Number of the rows in the only_in_annotator_1_df: 503
Number of the rows in the only_in_annotator_2_df: 351
Ratio of the agreed_df: 0.30057330057330056


In [10]:
# Split DataFrame into training, validation, and test set
import random
seed=42
validation_dataset_ratio = 0.1

len_validation_dataset = len_total_df * validation_dataset_ratio
len_only_annotator_df = len_only_in_annotator_1_df + len_only_in_annotator_2_df

random.seed(seed)
only_in_annotator_1_df_val_idx = random.sample(only_in_annotator_1_df.index.to_list(), int(round(len_validation_dataset * len_only_in_annotator_1_df/len_only_annotator_df, 0)))
only_in_annotator_2_df_val_idx = random.sample(only_in_annotator_2_df.index.to_list(), int(round(len_validation_dataset * len_only_in_annotator_2_df/len_only_annotator_df, 0)))

test_df = agreed_df
validation_df = pd.concat([
    only_in_annotator_1_df.loc[only_in_annotator_1_df_val_idx],
    only_in_annotator_2_df.loc[only_in_annotator_2_df_val_idx]
])
train_df = pd.concat([
    only_in_annotator_1_df.drop(only_in_annotator_1_df_val_idx),
    only_in_annotator_2_df.drop(only_in_annotator_2_df_val_idx)
])

In [11]:
print(len(test_df))
print(len(validation_df))
print(len(train_df))
print(f'Ratio of validation dataset: {len(validation_df) / (len(train_df)+len(validation_df)+len(test_df))}')

367
122
732
Ratio of validation dataset: 0.09991809991809991


In [16]:
# Convert DataFrame to huggingface dataset and save it to specified directory
from datasets import Dataset, DatasetDict

hf_save_dir = 'huggingface'
os.makedirs(hf_save_dir, exist_ok=True)

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
validation_dataset = Dataset.from_pandas(validation_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

dataset.save_to_disk(hf_save_dir)


Saving the dataset (1/1 shards): 100%|██████████| 732/732 [00:00<00:00, 29407.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 122/122 [00:00<00:00, 14841.07 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 367/367 [00:00<00:00, 1622.55 examples/s]
