# Nature Protocol Manuscript Conversion
Code to convert the xQTL pipeline notebooks into the format required for the Nature Protocols paper.  


In [310]:
import json
import re
import yaml


## Setup

In [311]:
def append_to_markdown(output_markdown_file, content):
    """Append ``content`` to a markdown file, followed by a blank line.

    Args:
        output_markdown_file: Path of the markdown file to append to. The
            file is created if it does not exist yet.
        content: Text to write.
    """
    # Pin the encoding so that non-ASCII text is written the same way on
    # every platform instead of depending on the locale's default codec.
    with open(output_markdown_file, 'a', encoding='utf-8') as md_file:
        md_file.write(content + '\n\n')


In [312]:
# Notebook that defines the section layout of the manuscript.
manuscript_format_notebook = "example_manuscript.ipynb"
# Markdown file that the conversion writes its result to.
output_markdown_file = "output_markdown.md"

In [313]:
# Captions of the table-of-contents parts that become major sections.
major_sections_keep = [
    'Reference data',
    'Molecular Phenotypes',
    'Data Pre-processing',
]

# Miniprotocol notebooks to include; entries outside this list are filtered
# out when the section dictionaries are built.
miniprotocol_keep = [
    '../../code/reference_data/reference_data.ipynb',
    '../../code/molecular_phenotypes/bulk_expression.ipynb',
    '../../code/molecular_phenotypes/splicing.ipynb',
    '../../code/data_preprocessing/genotype_preprocessing.ipynb',
    '../../code/data_preprocessing/phenotype_preprocessing.ipynb',
    '../../code/data_preprocessing/covariate_preprocessing.ipynb',
]

In [314]:

# Miniprotocols that are only summarised in the manuscript: the generated
# text points the reader to the protocol website instead of listing modules.
miniprotocol_simplify = ['../../code/reference_data/reference_data.ipynb']

In [315]:

# Path to the table-of-contents YAML file
yaml_file_path = "../_toc.yml"

# Parse the table of contents. The encoding is given explicitly so the read
# does not depend on the platform's locale default.
with open(yaml_file_path, "r", encoding="utf-8") as file:
    yaml_data = yaml.safe_load(file)

In [316]:
# Map each kept major-section caption to the list of its miniprotocol
# notebooks. The list entries are the keys of 'miniprotocol_dict' below.
major_section_dict = {}
for part in yaml_data['parts']:
    caption = part['caption']
    # skip parts of the table of contents that are not in the manuscript
    if caption not in major_sections_keep:
        continue
    chapter_paths = [f"../../{chapter['file']}" for chapter in part['chapters']]
    major_section_dict[caption] = [path for path in chapter_paths if path in miniprotocol_keep]
major_section_dict

###used to be in this format:
#major_section_dict =  {
#    "Molecular Phenotype Quantification": [
#        f"{WRKDIR}/bulk_expression/bulk_expression.ipynb",
#        f"{WRKDIR}/splicing/splicing.ipynb"
#    ],
#    "Data Pre-Processing":[
#        f"{WRKDIR}/data_preprocessing/covariate/covariate_preprocessing.ipynb",
#        ],
#    "QTL Association Analysis":[],
#    "Integrative Analysis":[]
#}

{'Reference data': ['../../code/reference_data/reference_data.ipynb'],
 'Molecular Phenotypes': ['../../code/molecular_phenotypes/bulk_expression.ipynb',
  '../../code/molecular_phenotypes/splicing.ipynb'],
 'Data Pre-processing': ['../../code/data_preprocessing/genotype_preprocessing.ipynb',
  '../../code/data_preprocessing/phenotype_preprocessing.ipynb',
  '../../code/data_preprocessing/covariate_preprocessing.ipynb']}

In [317]:
# Map each kept miniprotocol notebook to the list of its module notebooks
# (an empty list when the chapter has no 'sections' entry).
miniprotocol_dict = {}
for part in yaml_data['parts']:
    for chapter in part['chapters']:
        miniprotocol = f"../../{chapter['file']}"
        # skip miniprotocols that are not part of the manuscript
        if miniprotocol not in miniprotocol_keep:
            continue
        miniprotocol_dict[miniprotocol] = [
            f"../../{module['file']}" for module in chapter.get('sections', [])
        ]
miniprotocol_dict
###used to be in this format:
#miniprotocol_dict = {
#    f"{WRKDIR}/bulk_expression/bulk_expression.ipynb":[
#        f"{WRKDIR}/bulk_expression/RNA_calling.ipynb",
#        f"{WRKDIR}/bulk_expression/bulk_expression_QC.ipynb",
#        f"{WRKDIR}/bulk_expression/bulk_expression_normalization.ipynb"
#    ],
#    f"{WRKDIR}/splicing/splicing.ipynb":[
#        f"{WRKDIR}/splicing/splicing_calling.ipynb",
#        f"{WRKDIR}/splicing/splicing_normalization.ipynb"
#    ],
#    f"{WRKDIR}/data_preprocessing/covariate/covariate_preprocessing.ipynb":[
#        f"{WRKDIR}/data_preprocessing/covariate/covariate_formatting.ipynb",
#        f"{WRKDIR}/data_preprocessing/covariate/covariate_hidden_factor.ipynb"
#    ]
#}        

{'../../code/reference_data/reference_data.ipynb': ['../../code/reference_data/reference_data_preparation.ipynb',
  '../../code/reference_data/generalized_TADB.ipynb',
  '../../code/reference_data/ld_prune_reference.ipynb'],
 '../../code/molecular_phenotypes/bulk_expression.ipynb': ['../../code/molecular_phenotypes/calling/RNA_calling.ipynb',
  '../../code/molecular_phenotypes/QC/bulk_expression_QC.ipynb',
  '../../code/molecular_phenotypes/QC/bulk_expression_normalization.ipynb'],
 '../../code/molecular_phenotypes/splicing.ipynb': ['../../code/molecular_phenotypes/calling/splicing_calling.ipynb',
  '../../code/molecular_phenotypes/QC/splicing_normalization.ipynb'],
 '../../code/data_preprocessing/genotype_preprocessing.ipynb': ['../../code/data_preprocessing/genotype/VCF_QC.ipynb',
  '../../code/data_preprocessing/genotype/GWAS_QC.ipynb',
  '../../code/data_preprocessing/genotype/PCA.ipynb',
  '../../code/data_preprocessing/genotype/GRM.ipynb',
  '../../code/data_preprocessing/genotyp

In [318]:
# read the miniprotocol notebook and get the title (should be first cell)
def get_miniprot_notebook_title(notebook_str):
    """Return the title of a miniprotocol notebook.

    The title is taken from the first markdown cell whose first source line
    starts with "# ". Only the leading heading marker is removed: the space
    that followed it and any trailing newline are kept, and '#' characters
    that occur inside the title are left alone. The title is also printed.

    Args:
        notebook_str: Path to the notebook (``.ipynb`` JSON) file.

    Returns:
        The title text, or ``None`` when no markdown cell starts with "# ".
    """
    with open(notebook_str, 'r', encoding='utf-8') as miniprot_content:
        miniprot_notebook = json.load(miniprot_content)

    for cell in miniprot_notebook["cells"]:
        if cell["cell_type"] != "markdown" or not cell["source"]:
            continue
        first_line = cell["source"][0]
        if first_line.startswith("# "):
            # Strip just the heading marker; a blanket replace of "#" would
            # also eat characters that belong to the title itself.
            miniprot_title = first_line[1:]
            print(miniprot_title)
            return miniprot_title
    return None

In [319]:
# Procedure steps whose sos command is copied into the manuscript in full,
# parameters included. Every other step is shortened for readability.
#   key   -> module notebook path (a value of 'miniprotocol_dict' above)
#   value -> names of the steps to keep in full (as named in the notebooks)
full_code_steps = {
    "../../code/molecular_phenotypes/calling/RNA_calling.ipynb": [
        "Perform data quality summary via `fastqc`",
        "Cut adaptor (Optional)",
    ],
}

## Experimental Design Conversion

In [320]:
# build the experimental design text: one heading per major section, one
# lettered heading per miniprotocol, then the module descriptions
def content_for_exp_design():
    """Return the markdown for the "Experimental Design" subsection.

    For every major section a numbered heading is written, followed by a
    lettered heading per miniprotocol. Unless the miniprotocol is listed in
    ``miniprotocol_simplify``, each of its module notebooks is scanned and the
    markdown cells that follow a "## Description" cell are copied, up to the
    next markdown cell starting with "##".
    """
    lines = []

    for section_number, major_section in enumerate(major_section_dict, start=1):
        lines.append(f"#### {major_section} (Step {section_number})")

        for letter_index, miniprot in enumerate(major_section_dict[major_section], start=1):
            letter = chr(ord('@') + letter_index)
            lines.append(f"##### {letter}. {get_miniprot_notebook_title(miniprot)}")

            # simplified miniprotocols only get a pointer to the website
            if miniprot in miniprotocol_simplify:
                lines.append("Please refer to the protocol website for more information on this miniprotocol.")
                continue

            for module in miniprotocol_dict[miniprot]:
                with open(module, 'r') as module_content:
                    module_notebook = json.load(module_content)

                # True while we are between "## Description" and the next "##" cell
                in_description = False
                for cell in module_notebook["cells"]:
                    if cell["cell_type"] != "markdown" or not cell["source"]:
                        continue
                    first_line = cell["source"][0]
                    if first_line.startswith("## Description"):
                        in_description = True
                    elif first_line.startswith("##"):
                        in_description = False
                    elif in_description:
                        lines.append("\n" + "\n".join(cell["source"]))

    return "\n".join(lines)

## Procedure Conversion

In [321]:
def content_for_procedure():
    """Return the markdown for the "Procedure" section.

    Writes a numbered heading per major section and a lettered heading per
    miniprotocol. For miniprotocols that are not simplified, the part of each
    module notebook that follows its "## Minimal Working Example" cell is
    copied: "### " step headings, "Timing" lines and code cells. Code cells
    are shortened with ``simplify_sos_code`` unless the current step is
    selected by ``should_keep_sos_params``.
    """
    lines = [
        "> CRITICAL  \n  To improve readability, the code outlined here highlights the notebook to run for each step and not the necessary parameters in some cases. Please refer to the protocol website for information on what parameters to include."
    ]

    for section_number, major_section in enumerate(major_section_dict, start=1):
        lines.append(f"### {section_number}. {major_section}")

        for letter_index, miniprot in enumerate(major_section_dict[major_section], start=1):
            letter = chr(ord('@') + letter_index)
            lines.append(f"#### {letter}. {get_miniprot_notebook_title(miniprot)}")

            # simplified miniprotocols only get a pointer to the website
            if miniprot in miniprotocol_simplify:
                lines.append("Please refer to the protocol website for more information on this miniprotocol.")
                continue

            for module in miniprotocol_dict[miniprot]:
                with open(module, 'r') as module_content:
                    module_notebook = json.load(module_content)

                in_mwe = False            # inside the Minimal Working Example part
                keep_code_params = False  # current step keeps its full sos command

                for cell in module_notebook["cells"]:
                    cell_type = cell["cell_type"]

                    if cell_type == "code":
                        if in_mwe:
                            if keep_code_params:
                                lines.append('\n```\n' + ''.join(cell["source"]) + '\n```\n\n')
                            else:
                                lines.append(simplify_sos_code(cell["source"]))
                        continue

                    if cell_type != "markdown" or not cell["source"]:
                        continue

                    first_line = cell["source"][0]
                    if in_mwe:
                        if first_line.startswith("### "):
                            # a step heading: decide how its code is rendered
                            keep_code_params = should_keep_sos_params(first_line, module)
                            lines.append(first_line.replace("###", "#####"))
                        elif first_line.startswith("Timing"):
                            lines.append(first_line.replace(":", ""))

                    # any "## " heading ends the example, unless it is the
                    # heading that starts it
                    if first_line.startswith("## "):
                        in_mwe = first_line.startswith("## Minimal Working Example")

    return "\n".join(lines)

In [322]:
# checks the full_code_steps dictionary to determine if the step_name is in there
def should_keep_sos_params(step_name, module):
    """Tell whether a procedure step keeps its full sos command.

    Args:
        step_name: Step heading as found in the notebook, e.g.
            "### i. Cut adaptor (Optional)". The text after the first "." is
            used as the step label; a heading without a "." is used as a
            whole, minus its leading "#" marker.
        module: Path of the module notebook, looked up in ``full_code_steps``.

    Returns:
        True when the label is contained in one of the step names registered
        for ``module``; False otherwise, including when the label is empty or
        the module is not registered.
    """
    _, dot, remainder = step_name.partition('.')
    # Headings without a numbering prefix used to raise IndexError here.
    label = remainder.strip() if dot else step_name.lstrip('#').strip()
    # An empty label is a substring of every step name, so reject it
    # explicitly instead of letting it match everything.
    if not label:
        return False
    return any(label in step for step in full_code_steps.get(module, ()))

In [323]:
# shorten an sos code block to the lines naming the notebook and the container
def simplify_sos_code(code):
    """Return a fenced code block holding only the essential sos lines.

    Args:
        code: Iterable of source lines of a code cell.

    Returns:
        A markdown fenced block made of the lines that contain "ipynb" or
        "--container", in their original order.
    """
    kept_lines = [line for line in code if "ipynb" in line or "--container" in line]
    return '\n```\n' + ''.join(kept_lines) + '\n```\n\n'

## Timing Conversion

In [324]:
# find the timing text of a miniprotocol notebook
def _first_miniprotocol_timing(notebook):
    """Return the timing text of a loaded notebook, or ``None``.

    Looks for markdown cells whose first line starts with
    "#### Miniprotocol Timing" and returns the first line in such a cell that
    starts with "Timing", with the word "Timing" removed and the trailing
    line break stripped.
    """
    for cell in notebook["cells"]:
        if cell["cell_type"] != "markdown" or not cell["source"]:
            continue
        if not cell["source"][0].startswith("#### Miniprotocol Timing"):
            continue
        for line in cell["source"]:
            if line.startswith("Timing"):
                # A line break inside a table cell would split the row.
                return line.replace('Timing', '').rstrip("\r\n")
    return None


# get the content for the timing section of the manuscript by going through each miniprotocol
def content_for_timing():
    """Return the markdown table for the "Timing" section.

    The table has one row per miniprotocol for which a timing was found. The
    major-section name is written on the first emitted row of each section;
    later rows of the same section leave that column blank.
    """
    return_content = [
        "| Step(Major Section) | Substep(Miniprotocol) | Time|",
        "|------|-----|----|",
    ]

    for major_section in major_section_dict:
        row_prefix = f"|{major_section}"
        for miniprot in major_section_dict[major_section]:
            miniprot_title = f"{get_miniprot_notebook_title(miniprot)}".replace("\n", "")

            with open(miniprot, 'r', encoding='utf-8') as miniprot_content:
                miniprot_notebook = json.load(miniprot_content)

            timing = _first_miniprotocol_timing(miniprot_notebook)
            if timing is None:
                # No row is written; keep the section label for the next
                # miniprotocol so that it is not lost from the table.
                continue

            return_content.append(f"{row_prefix}|{miniprot_title}|{timing}|")
            row_prefix = "| "

    return "\n".join(return_content)

## Anticipated Results Conversion

In [325]:
# get the content for the anticipated results section of the manuscript by going through each miniprotocol
def content_for_anticipated_results():
    """Return the markdown for the "Anticipated Results" section.

    For every miniprotocol a lettered heading is written (letters restart in
    each major section), followed by the first line of the markdown cell that
    comes after the notebook's "## Anticipated Results" cell.
    """
    return_content = []

    for major_section in major_section_dict:
        for miniprot_step, miniprot in enumerate(major_section_dict[major_section], start=1):
            miniprot_title = f"#### {chr(ord('@')+miniprot_step)}. {get_miniprot_notebook_title(miniprot)}"
            return_content.append(miniprot_title)

            with open(miniprot, 'r', encoding='utf-8') as miniprot_content:
                miniprot_notebook = json.load(miniprot_content)

            # Reset for every notebook: if a notebook ends right after its
            # "## Anticipated Results" heading, the flag must not carry over
            # and pull in the first cell of the next miniprotocol.
            on_results = False
            for cell in miniprot_notebook["cells"]:
                if cell["cell_type"] != "markdown" or not cell["source"]:
                    continue
                content = cell["source"][0]
                if on_results:
                    # NOTE(review): only the first line of the cell is copied;
                    # confirm that results are always written on one line.
                    return_content.append(content)
                    on_results = False
                if content.startswith("## Anticipated Results"):
                    on_results = True

    return "\n".join(return_content)

## References Conversion

In [326]:
def content_for_references():
    """Return a numbered reference list built from the experimental design text.

    Citations are expected to look like
    ``[cf. Signal et al (2022)](https://doi.org/10.1186/s12859-022-04572-7)``.
    Each one becomes "<author> et al. <year>. <link>"; duplicates are dropped
    and the order of first appearance is kept. A citation in which no author
    or no four-digit year can be found is listed with its original text
    instead of aborting the conversion.
    """
    # for now just check through the experimental design text
    exp_design = content_for_exp_design()

    # group 1: citation text after "[cf", group 2: link target
    pattern = r'\[cf(.*?)\]\((.*?)\)'

    # references formatted as "author year doi"; a list keeps the order of
    # first appearance while duplicates are filtered out
    ref_list = []
    for ref_text, doi in re.findall(pattern, exp_design):
        year = re.search(r'\b\d{4}\b', ref_text)
        # first whitespace-delimited word made of letters; hyphens and
        # apostrophes inside the word are accepted so that such surnames are
        # not skipped in favour of the following "et"
        author = re.search(r"\s([^\W\d_]+(?:[-'][^\W\d_]+)*)\s", ref_text)

        if year and author:
            for_ref_list = f"{author.group(1)} et al. {year.group(0)}. {doi}"
        else:
            # keep the citation as written rather than failing on it
            for_ref_list = f"{ref_text.strip(' .')}. {doi}"

        if for_ref_list not in ref_list:
            ref_list.append(for_ref_list)

    return "\n".join(
        f"{ref_num}. {ref} " for ref_num, ref in enumerate(ref_list, start=1)
    )

    
    

# Do the conversion
This reads through `example_manuscript.ipynb` and the miniprotocol and module notebooks to create a markdown file.

In [327]:
with open(manuscript_format_notebook, 'r') as manuscript_format:
    notebook = json.load(manuscript_format)

# Start from an empty markdown file before appending new content
open(output_markdown_file, 'w').close()

# "## " sections whose body is generated from the miniprotocol and module
# notebooks, keyed by the prefix of the section heading
section_generators = {
    "## Procedure": content_for_procedure,
    "## Timing": content_for_timing,
    "## Anticipated Results": content_for_anticipated_results,
    "## References": content_for_references,
}

# True while we are inside the procedure section, whose "### " subsections in
# example_manuscript.ipynb are skipped because they are added programmatically
in_procedure = False

for cell in notebook["cells"]:
    if cell["cell_type"] != "markdown" or not cell["source"]:
        continue

    content = cell["source"][0]

    if content.startswith("## "):
        # one of the main sections (Title, Abstract, Procedure, etc...)
        append_to_markdown(output_markdown_file, ''.join(cell["source"]) + '\n\n')
        # any new main section ends the procedure, unless it starts it
        in_procedure = content.startswith("## Procedure")
        for heading_prefix, generate_content in section_generators.items():
            if content.startswith(heading_prefix):
                append_to_markdown(output_markdown_file, generate_content())
                break
    elif content.startswith("### ") and not in_procedure:
        # other sub sections are copied as they are
        append_to_markdown(output_markdown_file, ''.join(cell["source"]) + '\n\n')
        # experimental design subsection of Introduction
        if content.startswith("### Experimental Design"):
            append_to_markdown(output_markdown_file, content_for_exp_design())


 Reference Data

 RNA-seq expression

 Alternative splicing from RNA-seq data

 Genotype data preprocessing

 Phenotype data preprocessing

 Covariate Data Preprocessing

 Reference Data

 RNA-seq expression

 Alternative splicing from RNA-seq data

 Genotype data preprocessing

 Phenotype data preprocessing

 Covariate Data Preprocessing

 Reference Data

 RNA-seq expression

 Alternative splicing from RNA-seq data

 Genotype data preprocessing

 Phenotype data preprocessing

 Covariate Data Preprocessing

 Reference Data

 RNA-seq expression

 Alternative splicing from RNA-seq data

 Genotype data preprocessing

 Phenotype data preprocessing

 Covariate Data Preprocessing

 Reference Data

 RNA-seq expression

 Alternative splicing from RNA-seq data

 Genotype data preprocessing

 Phenotype data preprocessing

 Covariate Data Preprocessing

