In [3]:
import pandas as pd
from os import chdir

# NOTE(review): hard-coded absolute path; the notebook only runs where this directory exists.
chdir('/Users/cbainton/Desktop/ST_project')

# Raw string: "\S" is not a valid escape sequence, so a plain literal emits an
# invalid-escape warning on modern Python. The r-prefix yields the exact same characters.
# NOTE(review): the backslash is kept unchanged; on macOS/Linux it is part of the
# file name rather than a path separator. Confirm this matches the file on disk.
patient_samples_path = r"original_data\ST-samples_info_for_image_analysis.csv"

In [4]:

# Read the sample-info CSV located at `patient_samples_path` into a DataFrame.
patient_samples_df = pd.read_csv(filepath_or_buffer=patient_samples_path)

Let's clean up our column names, and only include the columns that aren't empty.
Each slide has associated information:
* `biopsy_sample_id` : the identification used to refer to the patient, the annotated identity of the biopsy, and its technical replicate.
* `slide_id` : The ID of the slide describing the library that was used. Four biopsies were done on each slide, so they each have id `LIBRARY-SLIDENUMBER-X` where `X` is the subquadrant of the slide. This is sometimes referred to as "sample id" for convenience in this analysis, though it is only one way to identify the sample.
* `expected_classification` : The expected identity of the tissue when taken with the biopsy.
* `annotated_classification` : The identity of the tissue identified by a pathologist. While each slide has spot-level annotations, this is the overall identity the tissue seemed to have (e.g. `Tumor` means _clearly contains_ tumor, not that it is a homogeneous tumor). Note that not all slides are annotated with high confidence.

In [5]:
# Drop the "#" and "H&E image ID" columns (empty / redundant).
patient_samples_df = patient_samples_df.drop(columns=["#", "H&E image ID"])

# Rename columns as outlined above.
# NOTE(review): the trailing spaces inside some keys presumably mirror the raw
# CSV headers; do not trim them without checking the file.
patient_samples_df = patient_samples_df.rename(columns={
    "Sample/patient ID ": "biopsy_sample_id",
    "Library ID ": "slide_id",
    "Tissue type (control/healthy or tumor/cancer )": "expected_classification",
    "Pathologist(Malak) annotation of the tissue section": "annotated_classification",
})

# Keep just the rows with sample entries: ids that start with "S" followed by a digit.
# Missing ids count as non-matching (na=False). The raw string avoids the invalid
# "\d" escape that a plain string literal would contain.
is_sample_row = patient_samples_df['biopsy_sample_id'].str.match(r"^S\d", na=False)
patient_samples_df = patient_samples_df.loc[is_sample_row]

# Show the two identifier columns side by side.
patient_samples_df.loc[:, ['biopsy_sample_id', 'slide_id']]

Unnamed: 0,biopsy_sample_id,slide_id
0,S8-C-1,V10F03-033-A
1,S8-C-2,V10F03-033-B
2,S8-T-1,V10F03-033-C
3,S8-T-2,V10F03-033-D
4,S7-C-1,V10F03-034-A
5,S7-C-2,V10F03-034-B
6,S7-T-1,V10F03-034-C
7,S7-T-2,V10F03-034-D
8,S61-C-1,V10F03-035-A
9,S61-C-2,V10F03-035-B


We can see that `S2-T-2` has no associated slide, and thus we will discard it.

In [6]:
# Discard every sample whose `slide_id` is missing.
patient_samples_df = patient_samples_df.dropna(subset=['slide_id'])

Unfortunately, in the rest of the analysis, the slide ids take the form `V####-0##_X` and not `V####-0##-X`. We will convert the labels below to avoid this confusion.

In [7]:
# Change - --> _ : the hyphen that follows a "0" plus two digits becomes an underscore.
# Raw strings are used for every pattern so that "\d" and "\s" reach the regex
# engine without relying on Python's deprecated handling of invalid escapes.
patient_samples_df['slide_id'] = patient_samples_df['slide_id'].str.replace(
    pat=r"(?P<one>0\d\d)-",
    repl=lambda found: found.group("one") + "_",
    regex=True,
)
# Clean out whitespace in ids
patient_samples_df['slide_id'] = patient_samples_df['slide_id'].str.replace(
    pat=r"\s", repl="", regex=True
)
patient_samples_df['biopsy_sample_id'] = patient_samples_df['biopsy_sample_id'].str.replace(
    pat=r"\s", repl="", regex=True
)

Finally, we will need to simplify the `annotated_classification`. As we can see, the annotations come with varying degrees of confidence. We shall map them to a small set of labels using the function below.


In [8]:
def simpler_annotations(ann_status):
    """Collapse a raw annotation value into one of four coarse labels.

    Parameters
    ----------
    ann_status : str or float
        The raw annotation. Any float is treated as a missing value (NaN).

    Returns
    -------
    str
        * the input itself when it is exactly ``"Normal"`` or ``"Tumor"``;
        * ``"Tumor"`` for ``"contains tumors"``;
        * ``"Unclear"`` for a float or for the "image unclear" message;
        * ``"Poor annotations"`` for anything else.
    """
    if ann_status in ("Normal", "Tumor"):
        return ann_status
    if ann_status == "contains tumors":
        return "Tumor"
    # This captures our NaN. isinstance (rather than an exact type comparison)
    # also recognises float subclasses such as numpy.float64.
    if isinstance(ann_status, float) or ann_status == "image unclear, the pathologist couldn't annotate any part":
        return "Unclear"
    return "Poor annotations"


# Replace each raw annotation with the label returned by `simpler_annotations`.
simplified_labels = patient_samples_df['annotated_classification'].map(simpler_annotations)
patient_samples_df['annotated_classification'] = simplified_labels

Combine this with the other data paths below.

In [26]:
basic_config = pd.read_csv("classify/main_config_old.csv")
# Has just sample_id,spaceranger_path,fullres_path,true_annotation_path
basic_config = basic_config.drop(columns="patient")

# Remove these columns when they are already present in the config, so that
# after the merge they come from `patient_samples_df` only.
stale_columns = ["biopsy_sample_id", "annotated_classification", "slide_id"]
basic_config = basic_config.drop(columns=stale_columns, errors="ignore")

# Fraction of slide ids that have a matching `sample_id` in the config.
print(patient_samples_df['slide_id'].isin(basic_config['sample_id']).mean())

full_config = basic_config.merge(
    right=patient_samples_df,
    left_on='sample_id',
    right_on='slide_id',
    how='outer',
)

# The outer merge leaves NaN ids on config rows that have no matching sample.
# The vectorised .str operations propagate NaN on those rows, whereas plain
# Python slicing / splitting would raise on a float NaN.
# simple_slide: characters 8-9 plus character 11 of the slide id.
slide_ids = full_config['slide_id']
full_config['simple_slide'] = slide_ids.str[8:10] + slide_ids.str[11]
# simple_biopsy: the biopsy id with every "-" removed.
full_config['simple_biopsy'] = full_config['biopsy_sample_id'].str.replace("-", "", regex=False)

# NOTE(review): the index is written as an extra unnamed first column; kept
# unchanged in case downstream scripts rely on the current file layout.
full_config.to_csv('main_config.csv')
# TODO: remove sample_id nomenclature from scripts and remove column

1.0
