# Getting R Companion html and converting to markdown text

Meant to be run in sessions launched from [here](https://github.com/fomightez/rcomp_testenv). (This can also be run [here](https://github.com/fomightez/muscle-binder) where I added the recent version of pandoc on Feb 3, 2019.) That version added conversion to markdown.

This is based on the approach worked out in `developing automating getting circos html and converting to markdown text.ipynb`.

In [None]:
# Install Beautiful Soup; a later cell imports it as `BS` to parse the site's navigation panel.
%pip install beautifulsoup4

In [None]:
# Index page of the R Companion site; its navigation panel is mined below for the list of pages.
r_companion_index_url = "https://rcompanion.org/rcompanion/index.html"

In [None]:
# Prefix put in front of relative hrefs from the navigation panel to build full page URLs.
site_prefix = "https://rcompanion.org/rcompanion/"

import os
import sys
import urllib.request
from bs4 import BeautifulSoup as BS

def extract_name_of_the_html(url, add_html_extension):
    '''
    Derive a local file name from a page URL such as
    "https://rcompanion.org/rcompanion/index.html".

    The name is whatever follows the final "/" in `url`. When
    `add_html_extension` is True, `.html` is appended to that name.
    A resulting name of `index.html` is swapped for `rcomp_index.html`
    so the saved page does not clobber the Jupyter environment file
    that has the same name.

    Returns the file name.
    '''
    name = url.rsplit("/", 1)[-1]
    if add_html_extension:
        name = name + ".html"
    return "rcomp_index.html" if name == 'index.html' else name

def get_html_and_save(url):
    '''
    Take a url for a web page get the html and stores the text.
    Returns the html code too
    
    based on https://stackoverflow.com/a/30890016/8508004
    '''
    global the_html # so can save using `%store` the variable needs to be global
    global fn_save_name # so can save using `%store` the variable needs to be global
    hh = urllib.request.urlopen(url)
    hbytes = hh.read()

    the_html = hbytes.decode("utf8")
    #print (the_html[:200])
    hh.close()
    
    fn_save_name = extract_name_of_the_html(url, add_html_extension=False)
    
    %store the_html > {fn_save_name}
    
    return the_html 


# Build a mapping of full page URL -> link text from the navigation panel
# of the index page.
pages_and_titles_dict = {}
index_html = get_html_and_save(r_companion_index_url)
# mine from the Contents panel on the left, the list of the pages
nav_code = index_html.split("<!-- Begin Navigation -->")[1].split("<!-- End Navigation -->")[0]
# NOTE(review): `contents_code` is computed but the parsing below works on
# `nav_code`; kept in case another cell relies on the name.
contents_code = nav_code.split("<ul>Introduction")[1].split('<div id="adskyscraper">')[0]
#print(nav_code )

# ul and li tags based on https://stackoverflow.com/a/17246983/8508004
# NOTE(review): no parser is named here, so Beautiful Soup picks one itself;
# left as-is so the parse result does not change.
soup = BS(nav_code)
for ultag in soup.find_all('ul'):
    for litag in ultag.find_all('li'):
        for link in litag.find_all('a'):
            # getting href based on https://python.gotrained.com/beautifulsoup-extracting-urls/
            href = link.get('href')
            if href is None:
                # an anchor without an href is not a page link; skipping it
                # avoids an AttributeError on `None.startswith`
                continue
            if href.startswith("http://rcompanion.org/"):
                full_link = href
            else:
                full_link = f"{site_prefix}{href}"
            pages_and_titles_dict[full_link] = link.text.strip()
pages_and_titles_dict

Remove the last one that simply leads to another set of pages and no R code. Plus don't get index from there again since already dealt with what is needed. (Note above, renamed the `index` file from RComp so that it doesn't clobber the Jupyter environment notebook with the same name.)

In [None]:
# Drop the index page (already dealt with) and the handbook link, which
# leads to another set of pages with no R code.
remove_specifically = [r_companion_index_url, "http://rcompanion.org/handbook/"]
for p in remove_specifically:
    # `pop` with a default keeps this cell re-runnable: a second run, or a
    # link that is no longer in the navigation, must not raise KeyError.
    pages_and_titles_dict.pop(p, None)

## Now get the html for each, set up to deal with weird characters, prepare the code blocks and results, and make preliminary markdown from each page.

In [None]:
# Page URLs to fetch, in the order they were collected from the navigation panel.
urls_to_get = list(pages_and_titles_dict)

In [None]:
import os
import sys
import urllib.request

def move_code_and_results_tags_off_same_line(the_html):
    '''
    Put a results paragraph on its own line when it shares a line with a
    code paragraph.

    The HTML includes occurrences like this:
    ```html
    <p class=c1>library(DescTools)<br><br>BreslowDayTest(Tabla)</p><p class=c2><br><br>Breslow-Day Test 
    for Homogeneity of the Odds Ratios<br><br>X-squared = 4.4517, df = 4, p-value = 
    0.3483<br></p>
    ```

    It is easier to prepare code fences if such an occurrence is broken up
    into two lines, so any line containing both `<p class=c1>` and
    `<p class=c2>` is broken just before the first `<p class=c2>`.

    Takes the HTML as a string and returns the adjusted HTML; every line of
    the result, including the last one, is terminated with a newline.
    '''
    code_start_tag = "<p class=c1>"
    results_start_tag = "<p class=c2>"
    pieces = []
    for line in the_html.split("\n"):
        if code_start_tag in line and results_start_tag in line:
            # maxsplit=1 keeps everything after the first results tag; an
            # unbounded split would drop the text following any further
            # `<p class=c2>` on the same line.
            before, after = line.split(results_start_tag, 1)
            pieces.append(before + "\n" + results_start_tag + after)
        else:
            pieces.append(line)
    return "\n".join(pieces) + "\n"

def prepare_for_fencing_code(the_html):
    '''
    Mark where code blocks start and end in the HTML of an R Companion page.

    Pandoc drops the class attributes when it converts to markdown, so the
    code blocks have to be marked at the HTML stage with plain-text
    placeholders that survive the conversion. They do not have to be the
    final signals; backticks and the code signal can be added later.

    RULES:
    A block opens (when none is open) on a line that starts with
    `<p class=c1>`, for example:
        <p class=c1><span style='color:#006600'>###
        <p class=c1>hist(Data$ Angle,    <br>

    An open block closes on a line that starts with any of:
        <p class=c2>   <p class=j1>   <p class=j2>
        <h3>   <h4>   <h5>   <p class=MsoNormal>

    Lines that are solely one of these never open a block:
        <p class=c1>&nbsp;</p>
        <p class=c1 style='margin-left:0in'>&nbsp;</p>
        <p class=c1 align=center style='text-align:center'>#     #     #</p>
        <p class=c1 align=center style='text-align:center'>&nbsp;</p>
        <p class=c1><span style='color:red'>

    Blank lines are removed. Protecting the spaces in comment lines made of
    dashes is left to `prepare_for_coloring_results()`.

    Takes the HTML as a string and returns the HTML with a placeholder line
    inserted before each opening line and before each closing line.
    '''
    # code and results that share a line are separated first, so the
    # line-by-line rules below can see each tag at the start of a line
    the_html = move_code_and_results_tags_off_same_line(the_html)

    never_openers = (
        "<p class=c1>&nbsp;</p>",
        "<p class=c1 style='margin-left:0in'>&nbsp;</p>",
        "<p class=c1 align=center style='text-align:center'>#     #     #</p>",
        "<p class=c1 align=center style='text-align:center'>&nbsp;</p>",
        "<p class=c1><span style='color:red'>",  # for b_09.html and d_05.html only
    )
    block_closers = (
        "<p class=c2>",
        "<p class=j1>",
        "<p class=j2>",
        "<h3>",
        "<h4>",
        "<h5>",
        "<p class=MsoNormal>",
    )
    fence_open = "RCOMPxCODExFENCExSTARTSxHERE"
    fence_close = "RCOMPxCODExFENCExENDSxHERE"

    inside_code = False
    chunks = []
    for line in the_html.split("\n"):
        trimmed = line.strip()
        if inside_code:
            if trimmed.startswith(block_closers):
                chunks.append(fence_close + "\n" + line + "\n")
                inside_code = False
            elif trimmed != "":
                chunks.append(line + "\n")
        elif line not in never_openers and trimmed.startswith("<p class=c1>"):
            inside_code = True
            chunks.append(fence_open + "\n" + line + "\n")
        elif trimmed != "":
            chunks.append(line + "\n")

    return "".join(chunks)

def avoid_line_breaks_caused_by_comments(the_html):
    '''
    Join comment text in code blocks that runs onto the following line.

    Comment symbols in code blocks seem to cause line breaks to be added where there shouldn't be. They should end with `<br>` or `</p>`.
    I think if remove the line break and add temporary space handler as an indicator, it should work and
    the indicator will be removed in later processing.

    Takes the HTML as a string, already marked with the code fence
    placeholders, and returns the adjusted HTML. Only the text between a
    start placeholder and the first end placeholder after it is edited; the
    text following that end placeholder is appended unchanged.
    '''
    start_placeholder_tag = "RCOMPxCODExFENCExSTARTSxHERE"
    end_placeholder_tag = "RCOMPxCODExFENCExENDSxHERE"
    space_placeholder = u'ASxSPACE'
    split_on_start = the_html.split(start_placeholder_tag)
    new_html = ""
    for p in split_on_start:
        if end_placeholder_tag in p:
            code_and_not_code_parts = p.split(end_placeholder_tag)
            # NOTE(review): the signature is `re.sub(pattern, repl, string, count=0, flags=0)`,
            # so the `re.S` passed as the fourth positional argument in the calls below
            # lands in `count`, not `flags`: `.` does not match newlines and the number
            # of replacements per call is capped. The output was tuned against this
            # behavior, so re-validate the generated markdown before changing it.
            code_part = re.sub('(#.*)\n(.*)<br>',r'\1ASxSPACE\2',code_and_not_code_parts[0],re.S) # This has a drawback in that it can
            # insert a space at the start of the next line where the comment doesn't continue on so I added a check for such
            # cases below by first no extra, in appropriate space inserted at start of a line
            code_part = re.sub('(#.*)\n(.*)</span></p>',r'\1ASxSPACE\2',code_part,re.S) # `</span>` in front of `</p>` is important or it will act on same line again to append on line that follows and ends in `</p>`
            code_part = re.sub('(#.*)\n(.*)</span>!</p>',r'\1ASxSPACE\2!',code_part,re.S) # in at least one place (on `b_01.html`) there is a case where exclamation point between `</span>` & `</p>` where comment runs onto next line; this fixes that
            code_part = re.sub('<br>ASxSPACE</span>',r'<br>\nASxSPACE<br>\n',code_part,re.S)
            # NOTE(review): the replacement below emits `--------br>` with no `<` in
            # front of `br>`; confirm whether `<br>` was intended.
            code_part = re.sub(r'--------\n</span><br>\n',r'--------br>\nASxSPACE<br>\n',code_part,re.S)
            #If inappropriate space entered at start of a line by action of `re.sub('(#.*)\n(.*)<br>',r'\1ASxSPACE\2'`
            # remove it, in a simplistic attempt. Example where was happening was in front of  `M2  = 71` in ` d_01`
            # first, collect start of original html block and new.==> UPDATE Simplistic approach stopped
            # working after I changed something and so it became more complex.
            num_letters = 2 #Number of the letters of the first line to use to compare
            # The triple-quoted string that follows is an earlier, newline-based version
            # of the check, disabled by turning it into a bare string expression.
            ''' <--# COMENTING THIS SECTION OUT FOR NOW b/c it makes spaces go away from in front of numbers in second cod block on page d_01. Better way to check there was no real space in front before?
            original_line_starts = [l[:len(space_placeholder)+num_letters] for l in code_and_not_code_parts[0].split("\n")]
            new_line_starts = [l[:len(space_placeholder)+num_letters] for l in code_part.split("\n")]
            change_in_line_number = max([len(original_line_starts),len(new_line_starts)])-min([len(original_line_starts),len(new_line_starts)])
            #if any of the new ones are same as the old ones with the space_placeholder
            # in front, then remove the `space_placeholder`. Can be more than one so
            # need to loop on each. Also the index of each shouldn't be able to change
            # by more than the difference in number of lines. (This is not perfect
            # because problem if start of lines very similar but should cover most simple cases.)
            # Originally was enumerating on the original line starts, but then 
            # it became a problem to determine which new line to edit. So switched
            # to enumerating the new line starts and then comparing to subset as if they had been changed
            new_line_parts = code_part.split("\n")
            for indx,new_start_pt in enumerate(new_line_starts):
                if change_in_line_number:
                    if change_in_line_number > 3 and indx < (len(new_line_starts)/2):
                        change_in_line_number = 3 # don't make lines to consider above and below expected index too many
                    start_pt = indx-change_in_line_number
                    if start_pt < 0:
                        start_pt = 0
                    lines_starts_to_consider = original_line_starts[start_pt:indx+change_in_line_number]
                else:
                    lines_starts_to_consider = [original_line_starts[indx]]
                #lines_starts_to_consider=[space_placeholder+x[:(len(x)-len(space_placeholder))] for x in lines_starts_to_consider]
                lines_starts_to_consider=[x[:num_letters]for x in lines_starts_to_consider] # just want first two letters because that is all
                # that can match when remove space holder
                if new_start_pt[:num_letters] in lines_starts_to_consider and (
                    new_start_pt[:len(space_placeholder)] == space_placeholder):
                    new_line_parts[indx] = new_line_parts[indx][len(space_placeholder):] # remove step
            code_part = "\n".join(new_line_parts)
            '''
            # do that with splitting on `<br>` too because example where was happening was in front of  `M2  = 71` in ` d_01
            # actually occurs after `<br>`
            # Compare the start of each `<br>`-separated piece before and after the
            # substitutions; a piece that now begins with the placeholder followed by
            # the letters that began a nearby original piece has the placeholder removed.
            original_line_starts = [l[:len(space_placeholder)+num_letters + 1] for l in code_and_not_code_parts[0].split("<br>")] #+ 1 to account for linebreak
            new_line_starts = [l[:len(space_placeholder)+num_letters] for l in code_part.split("<br>")]
            change_in_line_number = max([len(original_line_starts),len(new_line_starts)])-min([len(original_line_starts),len(new_line_starts)])
            new_line_parts = code_part.split("<br>")
            for indx,new_start_pt in enumerate(new_line_starts):
                if change_in_line_number:
                    if change_in_line_number > 3 and indx < (len(new_line_starts)/2):
                        change_in_line_number = 3 # don't make lines to consider above and below expected index too many
                    start_pt = indx-change_in_line_number
                    if start_pt < 0:
                        start_pt = 0
                    lines_starts_to_consider = original_line_starts[start_pt:indx+change_in_line_number+1]
                else:
                    lines_starts_to_consider = [original_line_starts[indx]]
                #lines_starts_to_consider=[space_placeholder+x[:(len(x)-len(space_placeholder))] for x in lines_starts_to_consider]
                # remove first character because after <br>, it normally is a line break
                lines_starts_to_consider=[x[1:]for x in lines_starts_to_consider]
                lines_starts_to_consider=[x[:num_letters]for x in lines_starts_to_consider] # just want first two letters because that is all
                # that can match when remove space holder
                # So if it is a space holder at start and same letters as original start
                #if "M2" in lines_starts_to_consider:
                #   print(new_start_pt[:len(space_placeholder)] == space_placeholder)
                if new_start_pt[len(
                    space_placeholder):len(space_placeholder)+num_letters] in lines_starts_to_consider and (
                    new_start_pt[:len(space_placeholder)] == space_placeholder):
                    new_line_parts[indx] = new_line_parts[indx][len(space_placeholder):] # remove step
            code_part = "<br>".join(new_line_parts)
            # Reassemble: fenced code part, then whatever followed the end placeholder.
            new_html += (start_placeholder_tag  + "\n" +
                                    code_part + "\n" + end_placeholder_tag  + "\n" + 
                                    code_and_not_code_parts[1] )
        else:
            # no end placeholder in this piece, so there is no code part to edit
            new_html += p
    
    return new_html


def prepare_for_coloring_results(the_html):
    '''
    Mark result paragraphs so they can be colored later.

    Takes HTML as a string from an R comp page & based on the c2 class tag,
    adds placeholders that can be used for coloring results later so they
    appear like they do in the HTML.

    A paragraph that starts with `<p class=c2>` becomes a single output line:
    start placeholder, the paragraph text with spaces replaced by a space
    placeholder, end placeholder. A paragraph that continues over several
    lines is joined up to the line that ends with `</p>`, with a space
    placeholder between the joined lines. `[` and `]` on the line carrying
    the opening tag are swapped for bracket placeholders so real brackets
    can be told apart later. Lines that are exactly `<p class=c2>&nbsp;</p>`
    are dropped; every other line is passed through unchanged.

    It won't be perfect because it is simplistic for now & won't handle
    every multi-line case well, such as the one below, but better than
    nothing and most seem to match the simplistic model:
    <p class=c2><img width=480 height=255
    src="images/c_01_03.jpg"></p>

    Returns the modified HTML string.
    '''
    lines_to_ignore = ["<p class=c2>&nbsp;</p>"]
    results_class_tag = '<p class=c2>'
    results_class__end_tag = '</p>'
    start_placeholder_tag = "RCOMPxRESULTxLINExSTART"
    end_placeholder_tag = "RCOMPxRESULTxLINExEND"
    bracket_start_plchlder = "BRACKETxSTARTxPLCHLDR"#replace real brackets early so can be sure source later
    bracket_end_plchlder = "BRACKETxENDxPLCHLDR"#replace real brackets early so can be sure source later
    space_placeholder = "ASxSPACE" # Want to substitute now so not causing a new line when PANDOC converts to markdown
    pieces = []
    continues2next_line = False
    for line in the_html.split("\n"):
        # NOTE: an earlier statement `line.replace(" ", space_placeholder)` for lines
        # containing "#" discarded its result (str.replace returns a new string), so
        # it never did anything. It is removed rather than "fixed": assigning the
        # result would also rewrite the spaces inside tags such as
        # `<p class=c1><span style='color:#006600'>`.
        if line in lines_to_ignore:
            continue
        # Slice the stripped text: the tag tests are made on the stripped line, so
        # slicing the raw line would cut at the wrong offsets whenever the line has
        # leading or trailing whitespace.
        stripped = line.strip()
        if continues2next_line:
            if stripped.endswith(results_class__end_tag):
                body = stripped[:-len(results_class__end_tag)]
                pieces.append(body.strip().replace(" ", space_placeholder) + end_placeholder_tag + "\n")
                continues2next_line = False
            else:
                pieces.append(stripped.replace(" ", space_placeholder) + space_placeholder)
        elif stripped.startswith(results_class_tag):
            #replace real brackets early so can be sure source later
            stripped = stripped.replace("[", bracket_start_plchlder).replace("]", bracket_end_plchlder)
            if stripped.endswith(results_class__end_tag):
                body = stripped[len(results_class_tag):-len(results_class__end_tag)]
                pieces.append(start_placeholder_tag + body.strip().replace(" ", space_placeholder)
                              + end_placeholder_tag + "\n")
            else:
                body = stripped[len(results_class_tag):]
                pieces.append(start_placeholder_tag + body.strip().replace(" ", space_placeholder)
                              + space_placeholder)
                continues2next_line = True
        else:
            pieces.append(line + "\n")
    return "".join(pieces)

def extract_name_of_the_html(url, add_html_extension):
    '''
    Build a file name from a page URL such as
    "https://rcompanion.org/rcompanion/index.html".

    The name is the text after the last "/" in `url`; `.html` is appended
    to it when `add_html_extension` is True.

    Returns the file name.
    '''
    *_, last_segment = url.split("/")
    return last_segment + ".html" if add_html_extension else last_segment

def get_html_and_save(url):
    '''
    Fetch the page at `url` and decode it as UTF-8.

    Returns a tuple of the HTML text and the file name to save it under
    (the last segment of the URL). Nothing is written to disk here; the
    caller does the saving.
    (Turns out `%store` magics didn't work in the function?!)

    based on https://stackoverflow.com/a/30890016/8508004
    '''
    # `with` closes the response even when reading or decoding raises.
    with urllib.request.urlopen(url) as hh:
        the_html = hh.read().decode("utf8")
    fn_save_name = extract_name_of_the_html(url, add_html_extension=False)

    #%store the_html > {fn_save_name} #seems cannot use this in a function?;
    # probably because it needs to be a global and here it would be local
    # variable it would be trying to save.

    return the_html,fn_save_name
import re
htmls_collected = []
markdowns_made = []
for url in urls_to_get:
    the_html,fn_save_name = get_html_and_save(url)
    if '<h1>Repeated G–tests of Goodness-of-Fit' in the_html and "<p class=c1><span style='color:red'>" not in the_html:
        sys.stderr.write("It seems the issue on b_09.html with `<p class=c1><span style='color:red'>` has been fixed."
            " Please remove the appropriate line from `lines_to_ignore` in `prepare_for_fencing_code()`  if fixed in all.")
    if '<h1>One-way Anova<' in the_html and "<p class=c1><span style='color:red'>" not in the_html:
        sys.stderr.write("It seems the issue on d_05.html with `<p class=c1><span style='color:red'>` has been fixed."
            " Please remove the appropriate line from `lines_to_ignore` in `prepare_for_fencing_code()` if fixed in all.")
    the_html = the_html.replace("<p class=c1><span style='color:red'>","<p class=c2>")
    space_placeholder = u'ASxSPACE'
    the_html = the_html.replace(u'\xa0', space_placeholder)
    the_html = the_html.replace(u'&nbsp;', "nonXbreakingXspace")
    # First 'normalize' the text. Was hopting change the weird `\xa0` that I am seeing as 'space' characters
    # in the tables INTO ACTUAL SPACES; based on https://stackoverflow.com/a/34669482/8508004 . But 
    # `the_html = unicodedata.normalize("NFKD", the_html)` was just seeming to remove them.
    import unicodedata
    the_html = unicodedata.normalize("NFKD", the_html) #`NFKD` removed the the weird `\xa0` characters but didn't replace with space; however, `the_html = the_html.replace(u'\xa0', the_html)` kept getting killed; `NFC` & `NFD` didn't touch the `\xa0` chars; NFKC seems to remove them and a lot of other actual content??!?
    the_html = prepare_for_fencing_code(the_html)
    the_html = avoid_line_breaks_caused_by_comments(the_html)
    the_html = prepare_for_coloring_results(the_html)
    space_placeholder_t = u'TEMPxHLD'
    the_html = the_html.replace(u' ', space_placeholder_t) #temporarily mask spaces so pandoc doesn't add line breaks between spaces and numbers
    %store the_html > {fn_save_name}
    htmls_collected.append(fn_save_name)
    markdown_name = fn_save_name.rsplit(".html")[0] + ".md"
    !pandoc -s -f html -t markdown {fn_save_name} -o {markdown_name}
    #remove the temporarily masked spaces
    with open(markdown_name, 'r') as input:
        the_md=input.read()
    the_md = the_md.replace(space_placeholder_t,u' ')
    the_md = the_md.replace("nonXbreakingXspace",u' ')
    %store the_md > {markdown_name}
    sys.stderr.write("'{}' has been generated.\n".format(markdown_name))
    markdowns_made.append(markdown_name)

In [None]:
# unescape the escaping of symbols and punctuation in what are to be code blocks
def unescape_punc_symbols(s, md_name):
    r'''
    Undo the backslash escaping that ends up in the text of a code block.

    Takes a string and replaces the characters as described below:
    \" goes to "
    \# goes to #
    \$ goes to $
    \~ goes to ~
    \> goes to >
    \< goes to <
    \' goes to '
    \_ goes to _
    \- goes to -
    \* goes to *
    \| goes to |
    \^ goes to ^
    \[ goes to [
    \] goes to ]
    `[TEXT]{style="color:#006600"}` goes to ` TEXT ` (the groups found are
    joined with single spaces)
    `, pp.` followed by a line end has the line end replaced by a space
    after `pp. NUMBER` plus one more character, the next character is
    dropped (meant for the doubled dash in page ranges)
    \END_OF_LINE goes to just END_OF_LINE

    NOTE(review): an earlier version of this description also listed
    "-- in lines with less than 10 dashes go to single dashes"; nothing in
    this function does that.

    `md_name` is the name of the markdown file so special handling can be
    carried out: for `b_09.md` the color-span removal is skipped and only
    `[#` is changed to `#`, because that page's code features a lot of
    brackets and too much would be deleted otherwise.

    Returns the new string; every line of it, including the last, ends
    with a newline.
    '''
    # Same order as the table above. The backslash is spelled '\\' so that
    # no literal depends on Python leaving an unknown escape alone.
    for symbol in ('"', '#', '$', '~', '>', '<', "'", '_', '-', '*', '|', '^', '[', ']'):
        s = s.replace('\\' + symbol, symbol)
    # Use bruteforce because too hard to combine all needed it seems
    if "b_09.md" != md_name:
        tag = ']{style="color:#006600"}'
        total_occurences = s.count(tag)
        rebuilt = []
        for indx, p in enumerate(s.split(tag)):
            if indx < total_occurences:
                p += tag  # put back the tag that `split` removed
            if tag in p:
                # multiple groups means a list of tuples, see https://howchoo.com/g/zdvmogrlngz/python-regexes-findall-search-and-match
                o = re.findall(r'(.*?)\[(.*?)\]{style=\"color:#006600\"}(.*?)', p, re.S)
                rebuilt.append(" ".join(item for groups in o for item in groups))
            else:
                rebuilt.append(p)
        s = "".join(rebuilt)
    else:
        s = s.replace(r'[#',r'#')
    # as prep for fixing `--` for page numbering, remove extra end of line pandoc puts after `pp.`
    s = re.sub(r', pp.\n', ', pp. ', s)
    # raw strings so `\d` is a regex escape rather than an invalid string escape
    s = re.sub(r'(pp. \d+.).', r'\1', s)
    # Remove a backslash at the end of a line by going line by line.
    return "".join(
        (line[:-1] if line.endswith("\\") else line) + "\n"
        for line in s.split("\n")
    )

def unescape_code_portions(md_text):
    r'''
    Takes a markdown file name, opens the markdown, and fixes the
    punctuation and symbols inside the marked code blocks so they aren't
    backslash escaped.

    `md_text` is the name (path) of the markdown file, despite what the
    parameter name suggests. It is also handed to `unescape_punc_symbols`
    so that function can apply its per-file special handling.

    Only the text between a start placeholder and the first end
    placeholder after it is unescaped; see `unescape_punc_symbols` for the
    rules (\" goes to ", \# goes to #, \_ goes to _, and so on). Text
    outside those spans is kept as it is.

    Returns the new markdown text; the file itself is not modified.
    '''
    start_placeholder_tag = "RCOMPxCODExFENCExSTARTSxHERE"
    end_placeholder_tag = "RCOMPxCODExFENCExENDSxHERE"
    # Read the file named by the parameter. The body used to read a global
    # called `md`, which only worked when the calling cell happened to use
    # that exact loop variable name.
    with open(md_text, 'r') as md_file:
        all_md = md_file.read()
    pieces = []
    for b in all_md.split(start_placeholder_tag):
        if end_placeholder_tag in b:
            parts_of_block = b.split(end_placeholder_tag)
            # for the quote handling, I needed https://stackoverflow.com/a/6718322/8508004. See the `unescape_punc_symbols` function.
            pieces.append(
                start_placeholder_tag
                + unescape_punc_symbols(parts_of_block[0], md_text)
                + end_placeholder_tag
                + " ".join(parts_of_block[1:])
            )
        else:
            pieces.append(b)
    return "".join(pieces)



import os
import sys
import re

for md in markdowns_made:
    new_md = unescape_code_portions(md)
    %store new_md > temp.txt 
    !mv temp.txt {md}
    sys.stderr.write(f"Code blocks unescaped in {md}.")

Collecting all the results
-----------------------------------

Run the next cell to gather and archive the produced markdown files.  
Also collect the two lists used here as json files so the contents can be used for automating filling in the markdown into Jupyter ipynb files.

In [None]:
archive_file_name = "FirstSetmarkdown_from_RCompanion.tar.gz"
import os
import sys
# store `urls_to_get` and `markdowns_made` as json since lighter-weight and more portable than pickling
# and the order of them wll correspond to the index I made so I can use them with papermill 
# in conjuction without needing to make a new dictionary.
RCompanion_urls_to_get_storedfn = "RCompanion_urls_to_get.json"
RCompanion_markdowns_made_storedfn = "RCompanion_markdowns_made.json"
import json
with open(RCompanion_urls_to_get_storedfn, 'w') as f:
    json.dump(urls_to_get, f)
with open(RCompanion_markdowns_made_storedfn, 'w') as f:
    json.dump(markdowns_made, f)
files_to_archive = markdowns_made + [RCompanion_urls_to_get_storedfn] + [RCompanion_markdowns_made_storedfn]
!tar czf {archive_file_name} {" ".join(files_to_archive)}
sys.stderr.write("***************************DONE***********************************\n"
    "'{}' generated. Download it.\n"
    "***************************DONE***********************************".format(archive_file_name))

In [None]:
#" ".join(markdowns_made)

Follow this up with `Generating R Companion notebooks from extracted markdown via notedown.ipynb`.