## NER model for method extraction from assay descriptions
This notebook runs the main pipeline that trains the model on 100% of the data.

In [1]:
import pandas as pd
import json, spacy, os
from sklearn.model_selection import RepeatedKFold
from pathlib import Path
import shutil
import subprocess
import altair as alt

In [5]:
#Method to generate jsonl file with tabulated data for model training
def generate_jsonl(df,f): #dataframe and output file
    """Write one NER annotation record per row of ``df`` to ``f`` in JSONL format.

    Each description is lower-cased and tokenized with spaCy. When the row has a
    method, the first occurrence of the lower-cased method string that starts and
    ends exactly on token boundaries is recorded as a single ``METHOD`` span.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``description`` column (string) and a ``method`` column
        (string or missing value).
    f : writable text file object
        Receives one JSON document per line.

    Returns
    -------
    None

    Notes
    -----
    * Rows whose method is missing, empty, absent from the description, or only
      present inside larger tokens (e.g. ``elisa`` inside ``elisas``) are written
      with ``"spans": []`` instead of raising ``IndexError``.
    * ``_input_hash`` is a CRC-32 of the UTF-8 encoded text. The built-in
      ``hash()`` is salted per interpreter process for strings by default, so it
      produced a different output file on every kernel restart.
    """
    import zlib  # function-scope import: used only for the reproducible text hash

    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")  # Adjust the model name if needed

    data = []
    for _, row in df.iterrows():
        sentence = row['description'].lower()

        # Tokenization with spaCy: character offsets plus the running token id
        doc = nlp(sentence)
        token_data = [
            {"text": token.text, "start": token.idx, "end": token.idx + len(token), "id": i}
            for i, token in enumerate(doc)
        ]

        # Entity labeling
        spans = []
        entity = "" if pd.isna(row['method']) else row['method'].lower()
        if entity:
            # Character offset -> token id, used to check that a match sits on token boundaries
            token_by_start = {tok["start"]: tok["id"] for tok in token_data}
            token_by_end = {tok["end"]: tok["id"] for tok in token_data}

            start_idx = sentence.find(entity)
            while start_idx != -1:
                end_idx = start_idx + len(entity)
                if start_idx in token_by_start and end_idx in token_by_end:
                    spans.append({"start": start_idx, "end": end_idx,
                                  "token_start": token_by_start[start_idx],
                                  "token_end": token_by_end[end_idx],  # id of the last token in the span
                                  "label": "METHOD"})
                    break
                # This match begins or ends inside a token; try the next occurrence
                start_idx = sentence.find(entity, start_idx + 1)

        # JSONL Entry
        data.append({
            "text": sentence,
            "meta": {},  # Add metadata if needed
            "_input_hash": zlib.crc32(sentence.encode("utf-8")),  # stable across runs
            "_task_hash": -1,  # Placeholder
            "tokens": token_data,
            "spans": spans,
            "answer": "accept"  # Replace if needed
        })

    # Write the JSONL file only after every row has been processed successfully
    for item in data:
        json.dump(item,f)
        f.write('\n')

In [6]:
# Load the assay table; the file is read as tab-separated despite its .csv extension
dataset = pd.read_table('data/assays_data.csv')

In [7]:
# Folder that holds the generated training file and the copied training outputs
mpath = "Model"
Path(mpath).mkdir(parents=True, exist_ok=True)

In [10]:
# Write the training data as JSONL. The file name must match the `train` path that the
# pipeline cell copies into ner_assays/assets (assays_training.jsonl).
with open(os.path.join(mpath,'assays_training.jsonl'), 'w') as f:
    generate_jsonl(dataset, f)  # one JSON record per assay description

In [15]:
#set up path for training data
train = os.path.join(mpath,'assays_training.jsonl')

#Copy the current input files to the pipeline path
shutil.copy(train, 'ner_assays/assets/')

#Run the pipeline from inside the project folder.
#cwd= changes the directory only for the child process, so the kernel's working
#directory is never left modified if the cell is interrupted.
#check=True raises CalledProcessError when the workflow fails, so stale training
#outputs are not copied below.
command = 'python3 -m weasel run model-final'
subprocess.run(command, shell=True, cwd='ner_assays', check=True,
               capture_output=False, text=True) #set capture_output to True to hide execution outputs

#Copy outputs to the main model folder
opath = os.path.join(mpath,'training')
shutil.copytree('ner_assays/training', opath, dirs_exist_ok=True)

[38;5;4mℹ Running workflow 'model-final'[0m
[1m
Running command: /Users/adasme/miniconda3/envs/nerenv/bin/python3 -m spacy download en_core_web_md
Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[1m
[38;5;4mℹ Skipping 'preprocess-final': nothing changed[0m
[1m
[38;5;4mℹ Skipping 'train-final': nothing changed[0m
[1m
[38;5;4mℹ Skipping 'package': nothing changed[0m


'Model/training'