## Repeated Cross Validation for NER method extraction from assay description
This notebook drives the main pipeline for repeated cross-validation (training and testing) of the NER models.
* Reads the annotated dataset
* Splits the annotated data into chunks according to the cross-validation fold settings
* Generates the jsonl input files
* Executes the pipeline by training and testing across all chunks of data

The main pipeline is contained in the folder 'ner_assays' and it's set up to accept the input files:
* assays_eval.jsonl
* assays_training.jsonl

After running on the input data, it generates the following outputs:
* model-best
* model-last
* metrics.json

While executing CV, the input files of each chunk are copied to the pipeline folder. The pipeline is executed with those files, and the generated outputs are copied back out to the specified path.

For live testing of a specific model, move that model's output files to the output folder and then execute the pipeline commands.

### Importing modules

In [2]:
import pandas as pd
import json, spacy, os
from sklearn.model_selection import RepeatedKFold
from pathlib import Path
import shutil
import subprocess
import altair as alt

In [None]:
# Settings for display (if needed)
pd.set_option('display.max_colwidth', None)  # Set to None to display the full column width
pd.set_option('display.max_rows', None)      # Set to None to display all rows

# Make sure the spaCy English model can be loaded, downloading it on first use
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Model not found. Downloading 'en_core_web_sm'")
    # Use spaCy's own download API instead of shelling out to a bare `python`,
    # which may resolve to a different interpreter than the one running this
    # notebook and whose failure would go unnoticed.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

### Method to obtain jsonl input files

In [None]:
#Method to generate jsonl file with tabulated data for model training
def generate_jsonl(df, f, nlp=None):  # dataframe, output file, optional tokenizer
    """Write one JSONL annotation record per row of ``df`` to ``f``.

    Each record holds the lower-cased ``description`` text, its tokens
    (text, character start/end, token id) and at most one ``METHOD`` span
    locating the lower-cased ``method`` value inside the text.

    Args:
        df: DataFrame with a ``description`` column and a ``method`` column;
            a missing (NaN) ``method`` yields a record without spans.
        f: Writable text file object; one JSON document is written per line.
        nlp: Optional callable that tokenizes a string into items exposing
            ``.text`` and ``.idx``. When omitted, the spaCy model
            ``en_core_web_sm`` is loaded (same as before). Passing an already
            loaded pipeline avoids reloading the model on every call.
    """
    import zlib  # local import: only needed for the deterministic input hash

    if nlp is None:
        nlp = spacy.load("en_core_web_sm")  # Adjust the model name if needed

    data = []
    for _, row in df.iterrows():
        sentence = row['description'].lower()

        # Tokenization
        doc = nlp(sentence)
        token_data = [
            {"text": token.text, "start": token.idx,
             "end": token.idx + len(token.text), "id": i}
            for i, token in enumerate(doc)
        ]

        # Lookup tables: character offset -> id of the token that starts/ends there
        token_by_start = {tok["start"]: tok["id"] for tok in token_data}
        token_by_end = {tok["end"]: tok["id"] for tok in token_data}

        # Entity labeling
        spans = []
        if not pd.isna(row['method']):
            entity = row['method'].lower()
            # The first substring hit may fall inside a token (e.g. "ria" in
            # "variant"), so keep searching until an occurrence lines up with
            # token boundaries. Without this, a misaligned hit raised IndexError.
            start_idx = sentence.find(entity) if entity else -1
            while start_idx != -1:
                end_idx = start_idx + len(entity)
                if start_idx in token_by_start and end_idx in token_by_end:
                    spans = [{"start": start_idx, "end": end_idx,
                              "token_start": token_by_start[start_idx],
                              "token_end": token_by_end[end_idx],
                              "label": "METHOD"}]
                    break
                start_idx = sentence.find(entity, start_idx + 1)

        # JSONL entry
        entry = {
            "text": sentence,
            "meta": {},  # Add metadata if needed
            # CRC32 rather than built-in hash(): str hashes are salted per
            # process, which made the generated files differ between runs.
            "_input_hash": zlib.crc32(sentence.encode("utf-8")),
            "_task_hash": -1,  # Placeholder
            "tokens": token_data,
            "spans": spans,
            "answer": "accept"  # Replace if needed
        }
        data.append(entry)

    # Writing the JSONL file: one JSON document per line
    for item in data:
        json.dump(item, f)
        f.write('\n')

### Model training: Binding assays and functional assays

##### Main annotated dataset

In [None]:
# Load the annotated assays table (tab-separated file)
dataset = pd.read_csv(filepath_or_buffer='data/assays_data.csv', delimiter='\t')

##### Setting up the Cross-Validation

In [None]:
# Output folders: per-chunk model inputs/outputs and aggregated results
mpath = "Model_cv"
rpath = "Results"
for out_dir in (mpath, rpath):
    os.makedirs(out_dir, exist_ok=True)

# 5 splits repeated 5 times with a fixed seed -> 25 train/test partitions
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=20172023)
for fold, (train_index, test_index) in enumerate(rkf.split(dataset)):
    train_df = dataset.iloc[train_index]
    test_df = dataset.iloc[test_index]

    # One folder per partition: chunk0, chunk1, ...
    path = os.path.join(mpath, f'chunk{fold}')
    os.makedirs(path, exist_ok=True)

    # Write the JSONL inputs: evaluation (held-out) data first, then training data
    for filename, split_df in (('assays_eval.jsonl', test_df),
                               ('assays_training.jsonl', train_df)):
        with open(os.path.join(path, filename), 'w') as f:
            generate_jsonl(split_df, f)

##### Executing Cross-Validation: Move inputs to pipeline, run pipeline, take outputs

In [None]:
import sys  # local to this cell: launches the pipeline with the notebook's interpreter

pipeline_dir = 'ner_assays'
assets_dir = os.path.join(pipeline_dir, 'assets')
os.makedirs(assets_dir, exist_ok=True)  # copying into a missing folder would fail

for f in Path(mpath).iterdir():
    if not f.is_dir():
        continue
    chunk = f.name

    # Copy the current input files to the pipeline path
    for input_name in ('assays_eval.jsonl', 'assays_training.jsonl'):
        shutil.copy(os.path.join(f, input_name), assets_dir)

    # Run the pipeline inside its own folder. `cwd=` replaces the
    # os.chdir()/os.chdir('../') pair, so the notebook's working directory is
    # never left changed if something raises in between.
    result = subprocess.run(
        [sys.executable, '-m', 'weasel', 'run', 'model-cv'],
        cwd=pipeline_dir,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # Stop here: otherwise the outputs of a previous chunk still sitting in
        # the pipeline folder would be copied as if they belonged to this one.
        raise RuntimeError(
            "Pipeline failed for {} (exit code {}):\n{}".format(
                chunk, result.returncode, result.stderr
            )
        )

    # Copy the outputs to the chunk folder
    opath = os.path.join(f, 'training')
    shutil.copytree(os.path.join(pipeline_dir, 'training'), opath, dirs_exist_ok=True)

##### Cross-Validation metrics

In [None]:
# Gather precision / recall / F-score from the metrics file of every chunk folder
metrics_all = []
for chunk_dir in Path(mpath).iterdir():
    if not chunk_dir.is_dir():
        continue
    with open(os.path.join(chunk_dir, 'training/metrics.json'), 'r') as file:
        scores = json.load(file)
    record = {name: scores[name] for name in ('ents_p', 'ents_r', 'ents_f')}
    record['chunk'] = chunk_dir.name
    metrics_all.append(record)

# Sort by score, round to two decimals and use the chunk name as index so the
# column means are computed over the metric columns only
metrics_df = pd.DataFrame(metrics_all)
metrics_df = metrics_df.sort_values(by=['ents_f', 'ents_p', 'ents_r'])
metrics_df = metrics_df.round(2).set_index('chunk')

# Append a 'mean' row holding the mean of every metric
metrics_df.loc['mean'] = metrics_df.mean().round(2)

# Save as a tab-separated file, with 'chunk' restored as a regular column
cv_metrics_file = os.path.join(rpath, 'cv_metrics.tsv')
metrics_df.reset_index().to_csv(cv_metrics_file, sep='\t', index=False)

In [3]:
# Plotting data for cross validation
mpath = "Model_cv"
rpath = "Results"

# Parse metrics
metrics = pd.read_csv(os.path.join(rpath, 'cv_metrics.tsv'), sep='\t')
# Keep only per-chunk rows. The file may hold a lowercase 'mean' row (as first
# written) or 'Mean'/'SD' rows (after the summary cell below has overwritten
# it), so summary rows are dropped case-insensitively to keep re-runs of this
# cell from plotting them as if they were chunks.
is_summary_row = metrics['chunk'].astype(str).str.lower().isin(['mean', 'sd'])
metrics = metrics[~is_summary_row].rename(
    columns={'ents_f': 'F-score', 'ents_p': 'Precision', 'ents_r': 'Recall'}
)
display(metrics)

# Melt the DataFrame to long format: one row per (chunk, metric) pair
df_melted = metrics.melt('chunk', var_name='Metric', value_name='value')

# Compute min and max values across all metrics (used as the y-axis domain)
min_value = df_melted['value'].min()
max_value = df_melted['value'].max()

# One colour per metric, in the order F-score, Precision, Recall:
# light blue, dark blue, dark green
metric_colors = ["#9ecae1", "#045a8d", "#006d2c"]

# Plotting metrics
chart = alt.Chart(df_melted).mark_line(point=True).encode(
    x=alt.X('chunk:O', sort=None),
    y=alt.Y('value:Q', scale=alt.Scale(domain=[min_value, max_value])),
    color=alt.Color(
        'Metric:N',
        scale=alt.Scale(domain=['F-score', 'Precision', 'Recall'], range=metric_colors),
    ),
    tooltip=['chunk', 'Metric', 'value']
)

# Save the chart next to cv_metrics.tsv, as a png image and as interactive html
for output_filename in ("cross_validation_metrics.png", "cross_validation_metrics.html"):
    output_path = os.path.join(rpath, output_filename)
    chart.save(output_path)


Unnamed: 0,chunk,Precision,Recall,F-score
0,chunk8,0.87,0.93,0.9
1,chunk12,0.89,0.94,0.91
2,chunk6,0.92,0.91,0.91
3,chunk23,0.89,0.94,0.92
4,chunk19,0.91,0.93,0.92
5,chunk22,0.94,0.91,0.93
6,chunk1,0.9,0.96,0.93
7,chunk17,0.94,0.92,0.93
8,chunk14,0.95,0.91,0.93
9,chunk11,0.95,0.92,0.93


In [None]:
# Split the label column (first) from the metric columns (all remaining ones)
first_column = metrics.iloc[:, :1]
numerical_columns_df = metrics.iloc[:, 1:]

# Mean and standard deviation of every metric column, rounded to two decimals
mean_row_data = numerical_columns_df.mean().round(2)
std_row_data = numerical_columns_df.std().round(2)

# Build both summary rows at once, labelled "Mean" and "SD" in the 'chunk' column
summary_rows = pd.DataFrame([
    {'chunk': 'Mean', **mean_row_data.to_dict()},
    {'chunk': 'SD', **std_row_data.to_dict()},
])

# Append the summary rows below the per-chunk metrics
df_final = pd.concat([metrics, summary_rows], ignore_index=True)

print("DataFrame with Mean and SD rows (ignoring first column, 'Mean'/'SD' in first column):")
print(df_final)

# Overwrites the metrics file with the table including the Mean and SD rows
df_final.to_csv(os.path.join(rpath, 'cv_metrics.tsv'), sep='\t', index=False)

DataFrame with Mean and SD rows (ignoring first column, 'Mean'/'SD' in first column):
      chunk  Precision  Recall  F-score
0    chunk8       0.87    0.93     0.90
1   chunk12       0.89    0.94     0.91
2    chunk6       0.92    0.91     0.91
3   chunk23       0.89    0.94     0.92
4   chunk19       0.91    0.93     0.92
5   chunk22       0.94    0.91     0.93
6    chunk1       0.90    0.96     0.93
7   chunk17       0.94    0.92     0.93
8   chunk14       0.95    0.91     0.93
9   chunk11       0.95    0.92     0.93
10   chunk0       0.94    0.94     0.94
11  chunk13       0.91    0.97     0.94
12   chunk2       0.92    0.96     0.94
13  chunk24       0.95    0.93     0.94
14   chunk3       0.93    0.96     0.95
15  chunk21       0.94    0.95     0.95
16   chunk4       0.97    0.93     0.95
17   chunk7       0.95    0.95     0.95
18   chunk9       0.93    0.97     0.95
19  chunk15       0.93    0.97     0.95
20   chunk5       0.93    0.98     0.95
21  chunk16       0.95    0.96    