In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict

def parse_sentences(file_path, encoding='utf-8'):
    """Read a sentence file into a list of ``(sentence_id, sentence)`` tuples.

    Each line is stripped of surrounding whitespace and split at the first
    space: the part before it is the sentence ID, the remainder is the
    sentence text. Lines that contain no space after stripping (blank lines,
    or an ID with no text) are skipped.

    Args:
        file_path: Path of the sentence file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of ``(sentence_id, sentence)`` string tuples in file order.
    """
    sentences = []
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            # partition() splits at the first space only, so spaces inside
            # the sentence text are preserved.
            sentence_id, separator, sentence = line.strip().partition(' ')
            if separator:
                sentences.append((sentence_id, sentence))
    return sentences

def _parse_eval_file(file_path, encoding='utf-8'):
    """Parse a ``sentence_id|start end|term`` file into a list of records.

    Shared implementation behind ``parse_gene_eval`` and
    ``parse_altgene_eval``, which read files of the same layout.
    """
    records = []
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            # maxsplit=2: only the first two '|' are field separators, so a
            # term that itself contains '|' is kept instead of being dropped.
            parts = line.strip().split('|', 2)
            if len(parts) != 3:
                continue
            sentence_id, position, term = parts
            # The position field holds two whitespace-separated integers.
            start_pos, end_pos = map(int, position.split())
            records.append((sentence_id, [start_pos, end_pos], term))
    return records


def parse_gene_eval(file_path, encoding='utf-8'):
    """Read primary gene annotations from a ``GENE.eval``-style file.

    Args:
        file_path: Path of the annotation file; each record line has the form
            ``sentence_id|start end|term``.
        encoding: Text encoding used to decode the file.

    Returns:
        A list of ``(sentence_id, [start_pos, end_pos], term)`` tuples in file
        order. Lines with fewer than three ``|``-separated fields are skipped.

    Raises:
        ValueError: If the position field of a record is not exactly two
            integers.
    """
    return _parse_eval_file(file_path, encoding=encoding)


def parse_altgene_eval(file_path, encoding='utf-8'):
    """Read alternative gene annotations from an ``ALTGENE.eval``-style file.

    The layout and the returned structure are the same as for
    ``parse_gene_eval``.

    Args:
        file_path: Path of the annotation file; each record line has the form
            ``sentence_id|start end|term``.
        encoding: Text encoding used to decode the file.

    Returns:
        A list of ``(sentence_id, [start_pos, end_pos], term)`` tuples in file
        order. Lines with fewer than three ``|``-separated fields are skipped.

    Raises:
        ValueError: If the position field of a record is not exactly two
            integers.
    """
    return _parse_eval_file(file_path, encoding=encoding)

# Input files for the test split (adjust to your local data layout)
sentences_path = 'original-data/test/test/test.in'
gene_eval_path = 'original-data/test/test/GENE.eval'
altgene_eval_path = 'original-data/test/test/ALTGENE.eval'

# Read sentences, primary gene terms and alternative gene terms
parsed_sentences = parse_sentences(sentences_path)
parsed_terms = parse_gene_eval(gene_eval_path)
parsed_alt_terms = parse_altgene_eval(altgene_eval_path)

# One DataFrame per input file
df = pd.DataFrame(parsed_sentences, columns=['Sentence ID', 'Sentence'])
terms_df = pd.DataFrame(parsed_terms, columns=['Sentence ID', 'Position', 'Term'])
alt_terms_df = pd.DataFrame(parsed_alt_terms, columns=['Sentence ID', 'Alt Position', 'Alt Term'])

# Left-join the primary terms, collapse them to one row per sentence
# (placeholders [[-1]] / ['No term'] when a sentence has none), then
# left-join the alternative terms.
merged_df = (
    pd.merge(df, terms_df, on='Sentence ID', how='left')
    .groupby('Sentence ID')
    .agg({
        'Sentence': 'first',
        'Position': lambda col: list(col.dropna()) or [[-1]],
        'Term': lambda col: list(col.dropna()) or ['No term'],
    })
    .reset_index()
    .merge(alt_terms_df, on='Sentence ID', how='left')
)

# Collapse the alternative terms the same way, then switch to the final
# column names and drop the sentence ID.
grouped_test_df = (
    merged_df
    .groupby('Sentence ID')
    .agg({
        'Sentence': 'first',
        'Position': 'first',
        'Term': 'first',
        'Alt Position': lambda col: list(col.dropna()) or [[-1]],
        'Alt Term': lambda col: list(col.dropna()) or ['No term'],
    })
    .reset_index()
    .rename(columns={
        'Sentence': 'text',
        'Position': 'position',
        'Term': 'label',
        'Alt Position': 'alt_position',
        'Alt Term': 'alt_label',
    })
    .drop(columns='Sentence ID')
)

# Show the resulting frame
print(grouped_test_df)

                                                   text  \
0     Phenotypic analysis demonstrates that trio and...   
1     In the cervical enlargement they were located ...   
2     We measured electromyograms (EMG) of the alae ...   
3     A case control study (1:2) of 182 pairs of Hep...   
4     In the absence of shock, sepsis, or other iden...   
...                                                 ...   
4995  CONCLUSIONS: This initial experience indicates...   
4996  A Brassica cDNA clone encoding a bifunctional ...   
4997  The aims of this study were to examine whether...   
4998  The goal of our work was to determine hearing ...   
4999  Causal modeling combines theory and research, ...   

                   position  \
0      [[34, 37], [41, 43]]   
1                    [[-1]]   
2                    [[-1]]   
3                    [[-1]]   
4                [[85, 94]]   
...                     ...   
4995                 [[-1]]   
4996  [[39, 67], [69, 102]]   
4997        

In [None]:
# Input files for the train split (adjust to your local data layout)
sentences_path = 'original-data/train/train/train.in'
gene_eval_path = 'original-data/train/train/GENE.eval'
altgene_eval_path = 'original-data/train/train/ALTGENE.eval'

# Read sentences, primary gene terms and alternative gene terms
parsed_sentences = parse_sentences(sentences_path)
parsed_terms = parse_gene_eval(gene_eval_path)
parsed_alt_terms = parse_altgene_eval(altgene_eval_path)

# One DataFrame per input file
df = pd.DataFrame(parsed_sentences, columns=['Sentence ID', 'Sentence'])
terms_df = pd.DataFrame(parsed_terms, columns=['Sentence ID', 'Position', 'Term'])
alt_terms_df = pd.DataFrame(parsed_alt_terms, columns=['Sentence ID', 'Alt Position', 'Alt Term'])

# Left-join the primary terms, collapse them to one row per sentence
# (placeholders [[-1]] / ['No term'] when a sentence has none), then
# left-join the alternative terms.
merged_df = (
    pd.merge(df, terms_df, on='Sentence ID', how='left')
    .groupby('Sentence ID')
    .agg({
        'Sentence': 'first',
        'Position': lambda col: list(col.dropna()) or [[-1]],
        'Term': lambda col: list(col.dropna()) or ['No term'],
    })
    .reset_index()
    .merge(alt_terms_df, on='Sentence ID', how='left')
)

# Collapse the alternative terms the same way, then switch to the final
# column names and drop the sentence ID.
grouped_train_df = (
    merged_df
    .groupby('Sentence ID')
    .agg({
        'Sentence': 'first',
        'Position': 'first',
        'Term': 'first',
        'Alt Position': lambda col: list(col.dropna()) or [[-1]],
        'Alt Term': lambda col: list(col.dropna()) or ['No term'],
    })
    .reset_index()
    .rename(columns={
        'Sentence': 'text',
        'Position': 'position',
        'Term': 'label',
        'Alt Position': 'alt_position',
        'Alt Term': 'alt_label',
    })
    .drop(columns='Sentence ID')
)

# Show the resulting frame
print(grouped_train_df)

In [5]:
# Give every row of both splits a constant domain and a placeholder category
for split_df in (grouped_train_df, grouped_test_df):
    split_df['domain'] = 'Gene'
    split_df['category'] = 'unknown'

In [6]:
# Report the character length of the longest training sentence
max_length = grouped_train_df['text'].map(len).max()
print('Max sentence length:', max_length)

Max sentence length: 974


In [7]:
# Spot-check a single processed training row (positional index 10) with all of its columns
grouped_train_df.iloc[10]

text            When the CO2 content reached 9 Vol% the animal...
position                                                   [[-1]]
label                                                   [No term]
alt_position                                               [[-1]]
alt_label                                               [No term]
domain                                                       Gene
category                                                  unknown
Name: 10, dtype: object

In [8]:
import os
from datasets import Dataset, DatasetDict

# Wrap both DataFrames as Hugging Face datasets
train_dataset = Dataset.from_pandas(grouped_train_df)
test_dataset = Dataset.from_pandas(grouped_test_df)

# Hold out 20% of the training rows as a validation split (fixed seed)
train_validation_split = train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_validation_split['train']
validation_dataset = train_validation_split['test']

# Make sure the output directory exists; nothing is written to disk in this cell
save_dir = 'huggingface/short'
os.makedirs(save_dir, exist_ok=True)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})


In [9]:
# For each test example, concatenate its primary labels with its alternative labels
new_label = [
    primary + alternative
    for primary, alternative in zip(test_dataset['label'], test_dataset['alt_label'])
]

In [10]:
# Inspect test example 6: alternative spans and labels, primary spans, then the sentence
example = test_dataset[6]
for column in ('alt_position', 'alt_label', 'position', 'text'):
    print(example[column])

[[3, 21]]
['COR biosynthetic gene']
[[3, 5], [138, 140]]
The COR biosynthetic gene cluster in P. syringae pv. glycinea PG4180 is encoded by a 32-kb region which contains both the structural and regulatory genes needed for COR synthesis.


In [11]:
def build_relaxed_labels(positions, labels, alt_positions, alt_labels):
    """Group each primary label with the alternative labels that overlap it.

    All four arguments are parallel per-example sequences. ``positions`` and
    ``alt_positions`` hold lists of ``[start, end]`` spans, or the placeholder
    ``[[-1]]`` when the example has no term. ``labels`` and ``alt_labels``
    hold the matching term strings, or ``['No term']``.

    Span offsets are treated as inclusive on both ends, so two spans overlap
    as soon as they share a single position.

    Returns:
        One entry per example. Each entry is a list with one sub-list per
        primary span: the primary label followed by every alternative label
        whose span overlaps it. Examples with neither primary nor alternative
        terms yield ``[['No term']]``.
    """
    relaxed = []
    for pos, row_labels, alt_pos, row_alt_labels in zip(positions, labels, alt_positions, alt_labels):
        # The placeholder is the nested list [[-1]], so the sentinel value
        # sits at pos[0][0]; pos[0] itself is a list and never equals -1.
        if pos[0][0] == -1 and alt_pos[0][0] == -1:
            relaxed.append([['No term']])
            continue

        row_tags = []
        for p, label in zip(pos, row_labels):
            alt_tags = [label]
            for ap, alt_label in zip(alt_pos, row_alt_labels):
                if ap[0] == -1:
                    # No alternative terms for this example.
                    continue
                # With no primary span every alternative is kept; otherwise
                # keep the alternatives whose inclusive span overlaps it.
                if p[0] == -1 or max(p[0], ap[0]) <= min(p[1], ap[1]):
                    alt_tags.append(alt_label)
            row_tags.append(alt_tags)
        relaxed.append(row_tags)
    return relaxed


# Read each column once up front rather than re-reading it inside the loops.
total_new_ref_tags = build_relaxed_labels(
    test_dataset['position'],
    test_dataset['label'],
    test_dataset['alt_position'],
    test_dataset['alt_label'],
)

[['No term']]

In [12]:
# Print test examples 10-19 next to their relaxed labels for a manual check
for idx in range(10, 20):
    row = test_dataset[idx]  # fetch the whole example once
    print('THis is idx:', idx)
    print(row['alt_position'])
    print('This is alt label:', row['alt_label'])
    print(row['position'])
    print('This is label:', row['label'])
    print(row['text'])
    print(total_new_ref_tags[idx])
    print()

THis is idx: 10
[[-1]]
This is alt label: ['No term']
[[19, 22], [30, 34], [71, 74]]
This is label: ['MAPK', 'Raf-1', 'Jak1']
Interestingly, basal MAPK, but not Raf-1, activity was constitutively enhanced in Jak1-deficient HeLa cells.
[['MAPK'], ['Raf-1'], ['Jak1']]

THis is idx: 11
[[-1]]
This is alt label: ['No term']
[[-1]]
This is label: ['No term']
On the other hand hypokalemia, induced by diuretics, may also be accompanied by a significant depletion of total body K, bringing about more general consequences.
[['No term']]

THis is idx: 12
[[47, 51]]
This is alt label: ['gp330']
[[42, 51]]
This is label: ['human gp330']
We present here the complete primary structure of human gp330, the human variant of the principal kidney autoantigen causing Heymann membranous glomerulonephritis in rats.
[['human gp330', 'gp330']]

THis is idx: 13
[[-1]]
This is alt label: ['No term']
[[-1]]
This is label: ['No term']
A series of deletion mutants was expressed transiently in two human hepatocytes,

In [13]:
# Display the test split's summary (feature names and row count)
test_dataset

Dataset({
    features: ['text', 'position', 'label', 'alt_position', 'alt_label', 'domain'],
    num_rows: 5000
})

In [14]:
# Add total_new_ref_tags to the test_dataset as the 'relaxed_labels' column.
# add_column rejects a column name that already exists, so drop any copy left
# over from a previous run of this cell to keep the cell re-runnable.
if 'relaxed_labels' in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns('relaxed_labels')
test_dataset = test_dataset.add_column('relaxed_labels', total_new_ref_tags)
test_dataset

Dataset({
    features: ['text', 'position', 'label', 'alt_position', 'alt_label', 'domain', 'relaxed_labels'],
    num_rows: 5000
})

In [16]:
# Rebuild the DatasetDict from the current split objects and write every split to save_dir
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})
dataset_dict.save_to_disk(save_dir)

Saving the dataset (1/1 shards): 100%|██████████| 12000/12000 [00:00<00:00, 53348.22 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 67503.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5000/5000 [00:00<00:00, 85160.77 examples/s]
