In [None]:
# Colab-specific: mount Google Drive at /content/drive.
# The relative 'drive/MyDrive/...' paths used later in this notebook resolve
# to this mount only when the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os

In [None]:
def conll_to_jsonl_chunked(input_file, output_file, chunk_size=1000, token_col=6):
    """Convert a whitespace-separated, blank-line-delimited token file to JSON Lines.

    Every non-blank input line is split on whitespace; the field at index
    ``token_col`` is taken as the token and the last field as its label.
    A blank line ends the current sentence. Each sentence is written to
    ``output_file`` as one JSON object per line::

        {"tokens": [...], "ner_tags": [...]}

    Args:
        input_file: Path of the file to read.
        output_file: Path of the JSONL file to write (opened in ``'w'`` mode,
            so an existing file is truncated).
        chunk_size: Number of buffered sentences that triggers a write.
        token_col: Zero-based index of the token field on each line.
            Defaults to 6, the index that was previously hard-coded.

    Raises:
        IndexError: If a non-blank line has too few fields for ``token_col``.
    """
    def write_records(records, sink):
        # One JSON object per line.
        for record in records:
            sink.write(json.dumps(record) + '\n')

    with open(output_file, 'w') as out_file:
        chunk = []
        sentences = []
        labels = []

        with open(input_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank line: sentence boundary.
                    if sentences and labels:
                        chunk.append({'tokens': sentences, 'ner_tags': labels})
                        sentences = []
                        labels = []
                else:
                    parts = line.split()
                    sentences.append(parts[token_col])
                    labels.append(parts[-1])

                if len(chunk) >= chunk_size:
                    write_records(chunk, out_file)
                    chunk = []

        # The input may end without a trailing blank line; without this the
        # final sentence would never reach the output.
        if sentences and labels:
            chunk.append({'tokens': sentences, 'ner_tags': labels})

        if chunk:
            write_records(chunk, out_file)

# Input of convert_format: JSONL whose lines carry 'tokens' and 'ner_tags'
# (the record shape conll_to_jsonl_chunked writes).
# NOTE(review): no visible cell calls conll_to_jsonl_chunked to create
# output.jsonl — confirm the file already exists before running this cell.
input_json_file_path = 'drive/MyDrive/266_project/json_data/output.jsonl'
# Output of convert_format: JSONL whose lines carry 'text' and 'entities'.
output_jsonl_file_path = 'drive/MyDrive/266_project/json_data/main_data.jsonl'

def bio_tag_to_entity(tokens, ner_tags):
    """Group BIO-tagged tokens into entity strings, keyed by entity type.

    Tokens of one entity are joined with single spaces. ``tokens`` and
    ``ner_tags`` are walked in parallel with ``zip``, so extra items in the
    longer sequence are ignored.

    Tag handling:
        * ``B-X`` starts a new entity of type ``X``.
        * ``I-X`` extends the open entity when that entity has type ``X``;
          otherwise (no open entity, or a different type) it starts a new
          entity of type ``X`` instead of discarding the token.
        * Any other tag closes the open entity.

    Args:
        tokens: Sequence of token strings.
        ner_tags: Sequence of tag strings, parallel to ``tokens``.

    Returns:
        dict mapping entity type to a list of entity strings in order of
        appearance. The nine known types are always present (possibly with
        an empty list); a type not in that set is added as a new key rather
        than raising ``KeyError``.
    """
    entities = {"Drug": [], "Duration": [], "Dosage": [], "Frequency": [], "Strength": [], "Form": [], "Route": [], "Reason": [], "ADE": []}
    current_entity = None
    current_type = None

    def close_current():
        # Store the open entity (if any) under its type and reset the state.
        nonlocal current_entity, current_type
        if current_entity is not None:
            # setdefault: tolerate entity types outside the predefined set.
            entities.setdefault(current_type, []).append(current_entity)
        current_entity = None
        current_type = None

    for token, tag in zip(tokens, ner_tags):
        if tag.startswith("I-") and current_entity is not None and current_type == tag[2:]:
            current_entity += " " + token
        elif tag.startswith(("B-", "I-")):
            # "B-" always opens a new entity; an "I-" that cannot continue
            # the open entity is treated the same way so its token is kept.
            close_current()
            current_entity = token
            current_type = tag[2:]
        else:
            close_current()

    close_current()

    return entities

def convert_format(input_json_file_path, output_jsonl_file_path):
    """Rewrite a tokens/ner_tags JSONL file as a text/entities JSONL file.

    Each non-blank input line must be a JSON object with ``'tokens'`` and
    ``'ner_tags'`` keys. For every such record one line is written to the
    output::

        {"text": "<tokens joined by spaces>", "entities": {...}}

    where ``entities`` is the result of ``bio_tag_to_entity``.

    Args:
        input_json_file_path: Path of the JSONL file to read.
        output_jsonl_file_path: Path of the JSONL file to write (opened in
            ``'w'`` mode, so an existing file is truncated).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'tokens'`` or ``'ner_tags'``.
    """
    # Explicit UTF-8: the output is written with ensure_ascii=False, so
    # non-ASCII characters are emitted raw and must not depend on the
    # platform's default encoding.
    with open(input_json_file_path, 'r', encoding='utf-8') as input_file, \
            open(output_jsonl_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            record = json.loads(line)
            tokens = record['tokens']
            ner_tags = record['ner_tags']
            entities = bio_tag_to_entity(tokens, ner_tags)
            output_record = {
                "text": " ".join(tokens),
                "entities": entities
            }
            output_file.write(json.dumps(output_record, ensure_ascii=False) + '\n')

# Run the conversion. The output path is opened in 'w' mode, so an existing
# main_data.jsonl is overwritten each time this cell runs.
convert_format(input_json_file_path, output_jsonl_file_path)

In [None]:
# Same value as assigned in the conversion cell above.
# NOTE(review): presumably repeated so the cells below can run without
# re-executing the conversion — confirm, or remove the duplicate.
output_jsonl_file_path = 'drive/MyDrive/266_project/json_data/main_data.jsonl'

In [None]:
def view_jsonl_file(jsonl_file_path, limit=500):
    """Pretty-print the leading records of a JSON Lines file.

    Args:
        jsonl_file_path: Path of the JSONL file to read.
        limit: Maximum number of lines to print.

    Each printed record is re-serialized with a two-space indent, keeps
    non-ASCII characters as-is, and is followed by an empty line.
    """
    with open(jsonl_file_path, 'r') as handle:
        for position, raw_line in enumerate(handle):
            if position < limit:
                parsed = json.loads(raw_line.strip())
                rendered = json.dumps(parsed, indent=2, ensure_ascii=False)
                # Record followed by one blank separator line.
                print(rendered, end='\n\n')
            else:
                break

# Sanity check: pretty-print only the first record of the converted file.
view_jsonl_file(output_jsonl_file_path, limit=1)

{
  "text": "Admission Date : [ * * 2202 - 1 - 8 * * ] Discharge Date : [ * * 2202 - 2 - 1 * * ]",
  "entities": {
    "Drug": [],
    "Duration": [],
    "Dosage": [],
    "Frequency": [],
    "Strength": [],
    "Form": [],
    "Route": [],
    "Reason": [],
    "ADE": []
  }
}



In [None]:
# Parse every line of the converted JSONL file and keep the records in memory.
data = []

with open(output_jsonl_file_path, 'r') as file:
    data.extend(map(json.loads, file))


In [None]:
# Number of records loaded into `data`.
len(data)

49877