# **LoupeBrowser annotated output parser**
> **Author:** Cevi Bainton\
> **Date:** 3/22/2024

This notebook is forked from `loupebrowser_parser.ipynb` made by Kacper Maciejewski for the `WSI-ST_framework` project.

### Load libraries and specify settings

In [44]:
import os
import pandas as pd

In [45]:
# Folder that receives one parsed CSV per slide
SAVE_DIR_PATH = "intermediate_data/classification/cleaned_classification_wenwen"

# Name of the label category (column) exported from LoupeBrowser.
# With 'Graph-based' (first version of the Malak annotations) the notebook also
# pulls ST_cluster out of the "Cluster_" prefix that precedes every cluster label,
# so previously exported data keeps working.
# CATEGORY = "Graph-based"  # use for malak data
CATEGORY = "Wenwen annotations"  # use for wenwen data

# Numeric clinical cluster -> human-readable clinical label
CLINICAL_LABELS = {0: "normal", 1: "cancer"}

In [46]:
# Folder holding the LoupeBrowser CSV exports, relative to this notebook.
# Built with os.path.join so the notebook runs on Windows and POSIX alike
# (a raw string with backslashes only resolves on Windows).
_ANNOTATION_ROOT = os.path.join('..', 'original_data', 'Pathologist_annotations_LoupeBrowser')
# INPUT_PATH = os.path.join(_ANNOTATION_ROOT, 'malak_exported_annotation_csvs')
INPUT_PATH = os.path.join(_ANNOTATION_ROOT, 'wenwen_exported_annotation_csvs')

# Maps slide id (form ##X: number-number-letter) -> path of its exported CSV
CSV_INPUTS = {}

# sorted() keeps the processing order reproducible; os.listdir order is arbitrary
for lb_path in sorted(os.listdir(INPUT_PATH)):
    full_path = os.path.join(INPUT_PATH, lb_path)

    # Skip sub-folders and hidden entries (e.g. .ipynb_checkpoints, .DS_Store)
    if lb_path.startswith('.') or not os.path.isfile(full_path):
        continue

    try:
        if CATEGORY == 'Graph-based':
            # Slide id sits at fixed positions of the file name itself
            slide_token = lb_path
        else:
            # Slide id sits at fixed positions of the second '-'-separated field
            slide_token = lb_path.split(sep='-')[1]
        # Characters 1-2 are the slide number, character 4 is the slide letter
        short_name = slide_token[1:3] + slide_token[4]
    except IndexError:
        raise ValueError(
            f"Cannot extract a slide id from file name '{lb_path}' in {INPUT_PATH}"
        ) from None

    CSV_INPUTS[short_name] = full_path


# File paths of the LoupeBrowser annotation outputs to parse, keyed by slide id
LB_PATH = CSV_INPUTS

In [57]:
CLASSIFIER_PATH = "label_clean_classifiers.csv"
CLASSIFIER_COLUMNS = ["annotations", "new_label", "mapping"]

# --- Load the existing classifier table, if there is one ---------------------
classifier_exists = os.path.exists(CLASSIFIER_PATH)
if classifier_exists:
    # Preset with the annotations that were already classified
    starting_classifier = pd.read_csv(CLASSIFIER_PATH)
    # read_csv parses the text "nan" as a missing value; cast back to str so it
    # compares equal to the string labels collected from the exports below
    starting_classifier["annotations"] = starting_classifier["annotations"].astype("str")
    starting_classifier = starting_classifier.filter(CLASSIFIER_COLUMNS, axis=1)
    all_classifications = starting_classifier.loc[:, "annotations"].tolist()
else:
    # First run: start from an empty table so the completeness check below still
    # works (previously `starting_classifier` was undefined here -> NameError)
    starting_classifier = pd.DataFrame(columns=CLASSIFIER_COLUMNS)
    all_classifications = []

# --- Collect every unique (lower-cased) annotation across all exports --------
seen_labels = set(all_classifications)  # O(1) membership; the list keeps order
for file, path in LB_PATH.items():
    annotations = pd.read_csv(path)
    if CATEGORY == "Graph-based":
        # Strip the whole "Cluster <number>" prefix. `\d+` (not `\d`) so that
        # clusters >= 10 yield the same label text as the parsing cell, which
        # removes "Cluster " and then all leading digits.
        raw_labels = (
            annotations[CATEGORY]
            .astype("str")
            .str.replace(r'^Cluster\s+\d+', '', regex=True)
        )
    else:
        # Second column holds the label when not using Malak graph-based exports.
        # NOTE(review): the parsing cell additionally strips leading digits from
        # these labels before the lookup; labels starting with a digit would not
        # match - confirm none exist.
        raw_labels = annotations.iloc[:, 1].astype("str")

    for label in raw_labels.str.lower().unique():
        if label not in seen_labels:
            if label == 'out':
                # Report which export introduced the 'out' label
                print(path)
            seen_labels.add(label)
            all_classifications.append(label)


# --- Write a blank template on the very first run ----------------------------
# Careful: this only checks that the classifier file exists, not that it covers
# every annotation; coverage is verified right below.
if not classifier_exists:
    preclass_translator = pd.DataFrame({'annotations': all_classifications})
    preclass_translator.to_csv(f"PREMADE_{CLASSIFIER_PATH}")

# --- Flag annotations that have no mapping yet --------------------------------
known_annotations = set(starting_classifier["annotations"])
unmapped_labels = [
    label for label in all_classifications
    if label not in known_annotations and not pd.isna(label)
]

# Are your mappings incomplete
incomplete_mapping = bool(unmapped_labels)

if incomplete_mapping:
    # Build all placeholder rows at once instead of concatenating inside a loop
    placeholder_rows = pd.DataFrame(
        [[label, "NOTSET", -2] for label in unmapped_labels],
        columns=CLASSIFIER_COLUMNS,
    )
    if starting_classifier.empty:
        starting_classifier = placeholder_rows
    else:
        starting_classifier = pd.concat(
            [starting_classifier, placeholder_rows], axis=0, ignore_index=True
        )
    starting_classifier.to_csv(f"INCOMPLETE_{CLASSIFIER_PATH}")
    

Add your own translations in `PREMADE_label_clean_classifiers.csv` and save the completed version as `label_clean_classifiers.csv`. 
Columns will be:
0. Pandas rownames, no header
1. `annotations`: old annotation; string no quotes
2. `new_label` : new annotation [`normal`, `cancer`, `DCIS`]; string no quotes
3. `mapping` : number of classification [`normal` --> 0, `cancer` --> 1, `DCIS` --> 2]; int

_Any other cell will be treated as an `NA` later on._

My assumptions:
* Anything with a question mark or a mix of labels is treated as the more severe version
* Calcification is normal

My annotations:

0. Normal
1. Cancer
2. DCIS

For the analysis below, DCIS and Cancer will be grouped

In [58]:
class_translator = pd.read_csv("label_clean_classifiers.csv")

# Every annotation must have been given a real label. Compare the column VALUES:
# `'NOTSET' in series` tests the index labels and therefore never fails.
unset_rows = class_translator.loc[:, "new_label"].eq("NOTSET")
assert not unset_rows.any(), (
    f"{int(unset_rows.sum())} annotation(s) are still labelled NOTSET "
    "in label_clean_classifiers.csv"
)

# old annotation -> cleaned label
class_translator_dict = {}
# cleaned label -> numeric clinical class (first occurrence of a label wins)
class_mapper_dict = {}

for annotation, new_label, mapping in zip(
    class_translator["annotations"],
    class_translator["new_label"],
    class_translator["mapping"],
):
    class_translator_dict[annotation] = new_label
    if new_label not in class_mapper_dict:
        class_mapper_dict[new_label] = int(mapping)


# When True, labels are renamed/classified from the dictionaries above
# instead of prompting the user
NO_USER_INPUT = True

~~NOTE: _The following annotations don't seem to work: 70c, 74a, 74b, 84b, 86a._~~

### Parse and save all the files

In [59]:
def _cluster_to_int(value):
    """Turn one raw ST_cluster cell into an integer cluster id.

    Plain Python floats are treated as missing and become -1 (be careful: this
    is what removes NaNs), digit strings are converted with int(), and any other
    value is returned unchanged.
    """
    if type(value) == float:
        return -1
    if type(value) == str:
        return int(value)
    return value


# Make sure the destination exists; to_csv does not create missing folders
os.makedirs(SAVE_DIR_PATH, exist_ok=True)

# Iterate over files to parse
for file_name, path in LB_PATH.items():

    # Read the file into a dataframe
    print(f"Parsing {file_name}...")
    file = pd.read_csv(path)

    # Split cluster strings into their numbers and names
    if CATEGORY == "Graph-based":
        file[CATEGORY] = file[CATEGORY].str.replace(r'^Cluster\s+', '', regex=True)
        file["ST_cluster"] = file[CATEGORY].str.extract(r'(\d+)')
        file["ST_label"] = file[CATEGORY].str.replace(r'^\d+', '', regex=True)
    else:
        # No cluster information in this export; -1 marks "no cluster"
        file["ST_cluster"] = -1
        file["ST_label"] = file[CATEGORY].str.replace(r'^\d+', '', regex=True)
    file["ST_cluster"] = file["ST_cluster"].map(_cluster_to_int)
    file = file.drop(columns=[CATEGORY])

    # Make everything lowercase
    file["ST_label"] = file["ST_label"].str.lower()

    # Iterate over all labels and correct their names
    labels = file['ST_label'].unique()
    print(f"Labels found: {labels}")

    # Rows are always selected on the ORIGINAL labels, so a freshly written
    # label that equals a later old label is not renamed a second time.
    original_labels = file["ST_label"].copy()
    renamed_labels = []
    for old_label in labels:
        if NO_USER_INPUT:
            if old_label == "":
                print("somethings up")
                new_label = " "
            else:
                try:
                    new_label = class_translator_dict[old_label]
                except KeyError:
                    raise KeyError(
                        f"No translation for label {old_label!r} (slide {file_name}); "
                        "add it to label_clean_classifiers.csv"
                    ) from None
        else:
            new_label = input(f"Rename '{old_label}'")

        if new_label:
            # Missing labels never compare equal to themselves, so match them with isna()
            if pd.isna(old_label):
                rows = original_labels.isna()
            else:
                rows = original_labels == old_label
            file.loc[rows, "ST_label"] = new_label
            renamed_labels.append(new_label)
        else:
            # Empty answer: keep the label as it is
            renamed_labels.append(old_label)

    # Iterate over new labels and classify them into clinical categories
    labels = list(set(renamed_labels))
    print(f"Labels to classify: {labels} with classifiers: {CLINICAL_LABELS}")
    if NO_USER_INPUT:
        mapping = class_mapper_dict
    else:
        mapping = {}
        for label in labels:
            while True:
                clinical = input(f"Classify clinically '{label}' with numeric label")
                try:
                    clinical_id = int(clinical)
                except ValueError:
                    print("Enter numerical value!")
                    continue
                if clinical_id in CLINICAL_LABELS:
                    mapping[label] = clinical_id
                    break
                # Tell the user why the prompt is repeated
                print(f"Value must be one of {list(CLINICAL_LABELS)}!")

    # Map user input into new columns.
    # Classes without an entry in CLINICAL_LABELS get a missing clinical_label.
    file['clinical_cluster'] = pd.to_numeric(file['ST_label'].map(mapping))
    file['clinical_label'] = file['clinical_cluster'].map(CLINICAL_LABELS)

    print(len(file.columns))
    # Save parsed document (written without index and without a header row)
    file.to_csv(os.path.join(SAVE_DIR_PATH, f"{file_name}.csv"), index=False, header=False)

Parsing 33A...
Labels found: ['stroma' 'fat' nan 'benign breast']
Labels to classify: ['normal', 'no_annotation', ' normal'] with classifiers: {0: 'normal', 1: 'cancer'}
5
Parsing 33B...
Labels found: ['fat' nan 'stroma' 'benign breast']
Labels to classify: ['normal', 'no_annotation', ' normal'] with classifiers: {0: 'normal', 1: 'cancer'}
5
Parsing 33C...
Labels found: ['tumor' 'fat' nan 'immune cells']
Labels to classify: ['normal', 'no_annotation', 'cancer'] with classifiers: {0: 'normal', 1: 'cancer'}
5
Parsing 33D...
Labels found: ['tumor' nan 'fat' 'stroma' 'immune cells']
Labels to classify: ['no_annotation', 'normal', 'cancer'] with classifiers: {0: 'normal', 1: 'cancer'}
5
Parsing 34A...
Labels found: [nan 'benign breast' 'stroma' 'out' 'fat']
Labels to classify: ['no_annotation', ' normal', 'normal', 'unclear'] with classifiers: {0: 'normal', 1: 'cancer'}
5
Parsing 34B...
Labels found: ['stroma' 'benign breast' nan 'out' 'fat']
Labels to classify: ['normal', ' normal', 'uncle