# Pathology Report Segmentation 
---


### Introduction

The primary goal of this notebook is to break down pathology reports into sections and sub-sections. These consist of:

- Report header
- Specimens submitted
- Clinical diagnosis
- Pathological diagnosis
    - Synoptic information
    - Free-text
- Addendums

#### Annotations
Once reports are broken down into their general subsections, specific attributes of the report are captured. These include:
- Date of procedure (DOP)
- Source accession number (e.g., the specimen in the current report was originally examined in a previous report)
- DOP derived from a combination of pathology reports, surgeries, and IR reports (to backfill the DOP above)

### Import libraries

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Run all expression nodes in a cell interactively (i.e. display their output),
# instead of IPython's default of displaying only the last expression.
InteractiveShell.ast_node_interactivity = "all"

from pathology_report_segmentation.segmentation import InitCleanPathology
from pathology_report_segmentation.segmentation import ParseSurgicalPathology
from pathology_report_segmentation.segmentation import ParseMolecularPathology
from pathology_report_segmentation.segmentation import PathologyParseSpecSubmitted
from pathology_report_segmentation.segmentation import ParseSurgicalPathologySpecimens
from pathology_report_segmentation.annotations import PathologyExtractAccession
from pathology_report_segmentation.annotations import PathologyExtractDOP
from pathology_report_segmentation.annotations import CombineAccessionDOPImpact
from pathology_report_segmentation.annotations import PathologyImpactDOPAnno

from msk_cdm.data_classes.legacy import CDMProcessingVariables as c_dar
from msk_cdm.data_processing import set_debug_console


In [None]:
# Extend debug window
# NOTE(review): set_debug_console is imported from msk_cdm.data_processing; it
# presumably widens the console/DataFrame display settings -- confirm there.
set_debug_console()

-----------------------
## Create MSK-IMPACT pathology annotation summary table

### Filenames

In [None]:
import os

# Minio env file, passed to every module below as `fname_minio_env`.
# The default is the original developer-specific path; set the MINIO_ENV_FILE
# environment variable to use a different file without editing this cell.
fname_minio_env = os.environ.get(
    'MINIO_ENV_FILE',
    '/Users/cfong2/Documents/github/cdm/msk_cdm/minio_env.txt'
)

# Non-pathology report files (inputs to the final DOP annotation step)
fname_surgeries = c_dar.fname_surg
fname_ir = c_dar.fname_ir

# Pathology report extraction files
fname_path_idb = c_dar.fname_pathology  # input to InitCleanPathology
fname_path_clean = c_dar.fname_path_clean  # InitCleanPathology save target
fname_path_surgical_reports_parsed = c_dar.fname_darwin_path_surgical
fname_molecular_pathology_reports = c_dar.fname_darwin_path_molecular
fname_path_surgical_reports_by_parts = c_dar.fname_darwin_path_col_spec_sub
fname_out_pathology_specimens_parsed = c_dar.fname_darwin_path_clean_parsed_specimen

# Annotation files
fname_path_accession = c_dar.fname_path_accessions  # PathologyExtractAccession output
fname_path_dop = c_dar.fname_spec_part_dop  # PathologyExtractDOP output
fname_path_summary_sparse = c_dar.fname_combine_dop_accession  # CombineAccessionDOPImpact save target
fname_path_summary_final = c_dar.fname_dop_anno  # PathologyImpactDOPAnno save target


#### Logic for Running Modules

In [None]:
# Segmentation modules
# Each flag gates the matching `if run_...:` cells further down; False skips that step.
run_path_clean = False
run_parse_surg = False
run_parse_dmp = False
run_parse_spec_sub = False
run_parse_path_dx = False

# Annotation modules
run_parse_accession = False
run_parse_dop = False
run_CombineAccessionDOPImpact = False
# Only the final DOP annotation step is enabled. Its input, fname_path_summary_sparse,
# is the `fname_save` target of the CombineAccessionDOPImpact step -- presumably that
# file must already exist from an earlier run (TODO confirm).
run_dop_extra_anno = True

#### Column names

In [None]:
## Constants
# Column labels handed to the specimen-part parsing and annotation modules below.
col_label_access_num = "ACCESSION_NUMBER"
col_label_spec_num = "SPECIMEN_NUMBER"
col_spec_sub = "SPECIMEN_SUBMITTED"
col_spec_sub_list = "SPECIMEN_SUBMISSION_LIST"

# Identifier columns passed as `list_cols_id` to PathologyParseSpecSubmitted.
list_cols_id = ["MRN", col_label_access_num]



## Run Code for header segmentation
**Run the initial cleaning of the pathology table that comes directly from the IDB query**


### InitCleanPathology
**Initial clean and data standardization**

In [None]:
# Rebuild the cleaned pathology table from the table pulled by the IDB query.
# Input: fname_path_idb; save target: fname_path_clean.
if run_path_clean:
    print("Running InitCleanPathology...")
    obj_path = InitCleanPathology(
        fname_minio_env=fname_minio_env,
        fname=fname_path_idb,  # Change name
        fname_save=fname_path_clean,
    )

In [None]:
if run_path_clean:
    # NOTE(review): `.head()` is applied here and again on the next line, so only the
    # first rows are kept in df_obj_path -- the other preview cells store the
    # un-truncated return value. Confirm this truncation is intentional.
    df_obj_path = obj_path.return_df().head()
    # Preview; a trailing semicolon is IPython's way of suppressing echoed output.
    df_obj_path.head();

### ParseSurgicalPathology
**Parse main sections of surgical pathology reports**

In [None]:
# Parse the main sections of the surgical pathology notes (surgical_pathology_parsing.py).
# Input: the cleaned pathology table; save target: fname_path_surgical_reports_parsed.
if run_parse_surg:
    print("Running ParseSurgicalPathology...")
    obj_path_parse = ParseSurgicalPathology(
        fname_minio_env=fname_minio_env,
        fname_path_clean=fname_path_clean,
        fname_save=fname_path_surgical_reports_parsed,
    )

In [None]:
if run_parse_surg:
    # Keep the value returned by return_df_summary() for inspection in later cells.
    df_obj_path_parse = obj_path_parse.return_df_summary()
    # Preview; a trailing semicolon is IPython's way of suppressing echoed output.
    df_obj_path_parse.head();



### ParseMolecularPathology
**Parse main sections of molecular pathology (MSK-IMPACT) reports**


In [None]:
# Parse the main sections of the molecular pathology notes.
# Input: the cleaned pathology table; save target: fname_molecular_pathology_reports.
if run_parse_dmp:
    print("Running ParseMolecularPathology...")
    obj_parse_dmp = ParseMolecularPathology(
        fname_minio_env=fname_minio_env,
        fname_path_clean=fname_path_clean,
        fname_save=fname_molecular_pathology_reports,
    )

In [None]:
if run_parse_dmp:
    # Keep the value returned by return_df_summary() for inspection in later cells.
    df_obj_parse_dmp = obj_parse_dmp.return_df_summary()
    # Preview; a trailing semicolon is IPython's way of suppressing echoed output.
    df_obj_parse_dmp.head();


### PathologyParseSpecSubmitted
**Parses specimen submitted column into individual parts**


In [None]:
# Split the specimen-submitted column into individual parts.
# Reads fname_path_clean using column `col_spec_sub_list`; save target:
# fname_path_surgical_reports_by_parts.
if run_parse_spec_sub:
    print("Running PathologyParseSpecSubmitted...")
    obj_spec_sub = PathologyParseSpecSubmitted(
        fname_minio_env=fname_minio_env,
        fname_path_parsed=fname_path_clean,
        col_spec_sub=col_spec_sub_list,
        list_cols_id=list_cols_id,
        fname_save=fname_path_surgical_reports_by_parts,
    )

In [None]:
if run_parse_spec_sub:
    # Keep the value returned by return_df() for inspection in later cells.
    df_spec_sub = obj_spec_sub.return_df()
    # Ten-row preview; a trailing semicolon is IPython's way of suppressing echoed output.
    df_spec_sub.head(10);

### ParseSurgicalPathologySpecimens
**Parses specimen submitted SECTION in surgical path reports into individual parts**


In [None]:
# Parse the specimens-submitted section of the surgical pathology reports into parts.
# The input is set to fname_path_surgical_reports_parsed (the ParseSurgicalPathology
# save target). This cell previously passed `fname_darwin_pathology_parsed`, a name
# that is never defined in this notebook, so enabling the flag raised NameError.
# NOTE(review): confirm the parsed surgical-report table is the intended input.
if run_parse_path_dx:
    print('Running ParseSurgicalPathologySpecimens...')
    obj_parse = ParseSurgicalPathologySpecimens(
        fname_minio_env=fname_minio_env,
        fname_darwin_pathology_parsed=fname_path_surgical_reports_parsed,
        fname_save=fname_out_pathology_specimens_parsed
    )

In [None]:
if run_parse_path_dx:
    # Keep the value returned by return_df_parsed_spec() for inspection in later cells.
    df_surg_path_parsed_spec = obj_parse.return_df_parsed_spec()
    # Preview; a trailing semicolon is IPython's way of suppressing echoed output.
    df_surg_path_parsed_spec.head();



## Extract Annotations from Segmented Pathology report 

### PathologyExtractAccession
**Automatically abstracts source accession number for specimen submitted parts**
- Source file: `pathology_extract_accession.py`
- Input: Dataframe of specimens submitted. Each row is a part number of a report (Typically: `table_pathology_col_spec_sub.csv`)

- Output: Dataframe of accession and part number embedded in free-text (Typically: `path_accessions.csv`)

In [None]:
# Extract source accession numbers from the specimen-submitted parts.
# Input: fname_path_surgical_reports_by_parts; output: fname_path_accession.
if run_parse_accession:
    print("Running PathologyExtractAccession...")
    obj_ext_accession = PathologyExtractAccession(
        fname_minio_env=fname_minio_env,
        fname=fname_path_surgical_reports_by_parts,
        col_label_access_num=col_label_access_num,
        col_label_spec_num=col_label_spec_num,
        col_spec_sub=col_spec_sub,
        fname_out=fname_path_accession,
    )

In [None]:
if run_parse_accession:
    # Keep the value returned by return_df() for inspection in later cells.
    df_accessions = obj_ext_accession.return_df()
    # Preview only the rows whose SOURCE_ACCESSION_NUMBER_0 is non-null; a trailing
    # semicolon is IPython's way of suppressing echoed output.
    df_accessions[df_accessions['SOURCE_ACCESSION_NUMBER_0'].notnull()].head();

### PathologyExtractDOP
**Extraction of DOP of the specimen part from specimen submitted sections**
- Source file: `pathology_extract_dop.py`
- Input: Dataframe of specimens submitted. Each row is a part number of a report (Typically: `table_pathology_col_spec_sub.csv`)

- Output: Dataframe of date of procedure for each part number embedded in free-text (Typically: `pathology_spec_part_dop.csv`)

In [None]:
# Create annotation for date of procedure (DOP) for all pathology reports/specimen part, if indicated
if run_parse_dop:
    print('Running PathologyExtractDOP...')
    obj_dop = PathologyExtractDOP(
        fname_minio_env=fname_minio_env,
        fname=fname_path_surgical_reports_by_parts,
        col_label_access_num=col_label_access_num,
        col_label_spec_num=col_label_spec_num,
        col_spec_sub=col_spec_sub,
        list_accession=None,
        fname_out=fname_path_dop
    )

In [None]:
if run_parse_dop:
    # Keep the value returned by return_df() for inspection in later cells.
    df_obj_dop = obj_dop.return_df()
    # Shape of the subset of rows whose DATE_OF_PROCEDURE_SURGICAL is non-null.
    df_obj_dop[df_obj_dop['DATE_OF_PROCEDURE_SURGICAL'].notnull()].shape

### CombineAccessionDOPImpact
**To generate a summary table of source accession and DOP for a given pathology report.**
- Source file: `pathology_extract_dop_impact_wrapper.py`
- Input: 
    - fname_accession=`path_accessions.csv`
    - fname_dop=`pathology_spec_part_dop.csv`
    - fname_path=`table_pathology_clean.csv`
- Output: `pathology_dop_impact_summary.csv`

In [None]:
# Build the table of M accessions of IMPACT samples, source accession numbers,
# and dates of reports and procedures.
# Calls pathology_extract_dop_impact_wrapper.py
if run_CombineAccessionDOPImpact:
    print("Running CombineAccessionDOPImpact...")
    obj_p = CombineAccessionDOPImpact(
        fname_minio_env=fname_minio_env,
        fname_accession=fname_path_accession,
        fname_dop=fname_path_dop,
        fname_path=fname_path_clean,
        fname_save=fname_path_summary_sparse,
    )

In [None]:
if run_CombineAccessionDOPImpact:
    # Keep the value returned by return_df() for inspection in later cells.
    df_dop_accession_combined_sparse = obj_p.return_df()
    # Preview; a trailing semicolon is IPython's way of suppressing echoed output.
    df_dop_accession_combined_sparse.head();


### PathologyImpactDOPAnno
**To fill in missing DOPs by comparing dates of source surgical pathology reports with surgical and IR dates and labelling positive comparisons as the DOP.**


In [None]:
# Add annotations for surgical reports that fall on the same day as a surgery/IR event.
# Calls pathology_impact_summary_dop_annotator.py
# Inputs: the sparse DOP/accession summary plus the surgery and IR files;
# save target: fname_path_summary_final.
if run_dop_extra_anno:
    # The progress message names the module that actually runs in this cell
    # (it previously repeated the CombineAccessionDOPImpact message).
    print('Running PathologyImpactDOPAnno...')
    obj_dop_anno = PathologyImpactDOPAnno(
        fname_minio_env=fname_minio_env,
        fname_path_summary=fname_path_summary_sparse,
        fname_surgery=fname_surgeries,
        fname_ir=fname_ir,
        fname_save=fname_path_summary_final
    )

In [None]:
if run_dop_extra_anno:
    # Keep the value returned by return_summary() for inspection.
    df_f = obj_dop_anno.return_summary()
    # Preview of the final annotated summary (no trailing semicolon on this one).
    df_f.head()

In [None]:
# Echo the flag so the notebook output records whether the DOP annotation step was enabled.
run_dop_extra_anno