In [1]:
# default_exp wp2md

# wp2md

> Convert Wordpress to Markdown Files

In [2]:
#export
import re
from fastcore.utils import urljson, AttrDict, Path, first, test_eq, urlread, urlsave
from fastcore.script import call_parse, store_true, Param
from IPython.display import Markdown
from markdownify import markdownify as md

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
#export
def _getpost(url:str=None, post_id:int=None, baseurl:str=None):
    "Fetch a post as an `AttrDict`: from `url` when given, otherwise from `{baseurl}/{post_id}`"
    # A truthy `url` takes precedence; `baseurl`/`post_id` are only consulted as a fallback.
    endpoint = url or f'{baseurl}/{post_id}'
    return AttrDict(urljson(endpoint))

In [5]:
# Fetch a post by passing its full REST endpoint.
_post = _getpost('https://outerbounds.com/wp-json/wp/v2/posts/220')
assert _post

# Alternative call style: pass the posts collection URL and the post id separately.
_post = _getpost(post_id=247, baseurl='https://outerbounds.com/wp-json/wp/v2/posts')
assert _post

In [6]:
#export
# Matches the `<link rel="alternate" type="application/json" href="...">` tag and captures its href.
# The capture stops at the first quote or whitespace so it cannot run into a neighbouring attribute.
_re_api = re.compile(r'<link rel="alternate" type="application/json" href="([^"\s]+)"')

def url2api(url):
    """Get the wordpress api endpoint to retrieve a post from its public url.

    Downloads the page at `url` and returns the first JSON "alternate" link found in it.
    Raises `ValueError` when the page contains no such link.
    """
    api = first(_re_api.findall(urlread(url)))
    if not api: raise ValueError("Was not able to find the Wordpress API link in site.  Please ensure that the URL corresponds to a wordpress site.")
    return api

`url2api` allows you to retrieve the full api endpoint to get the contents of the wordpress article [as described here](https://developer.wordpress.org/rest-api/reference/posts/#retrieve-a-post) from any public-facing wordpress url! 

In [7]:
# The public article URL is expected to resolve to the REST endpoint of post 220.
test_eq(url2api('https://outerbounds.com/blog/notebooks-in-production-with-metaflow/'), 'https://outerbounds.com/wp-json/wp/v2/posts/220')

In [8]:
#export

# Captures the URL part of every markdown image of the form `![alt](url)`.
_re_img = re.compile(r'\!\[.*?\]\((\S+)\)')

class WP:
    "A wordpress post fetched from the REST api, renderable as markdown with front matter."
    def __init__(self, url:str=None, baseurl:str=None, post_id:int=None):
        "Load the post behind the public `url`, or post `post_id` from the posts route `baseurl`."
        if url: self.apiurl = url2api(url)
        elif baseurl is None or post_id is None:
            # Fail early with a clear message instead of an AttributeError on `None.endswith`.
            raise ValueError("Pass either `url`, or both `baseurl` and `post_id`.")
        else: self.apiurl=f"{baseurl if baseurl.endswith('/') else baseurl+'/'}{post_id}"
        self.post = _getpost(self.apiurl)
        # Maps each remote image URL to the local path it was saved under (filled by `save_images`).
        self.img_map = {}
        
    # Names rendered into the front matter, in this order; empty/missing values are skipped.
    _props = ['title', 'date', 'tags', 'draft', 'description', 'image', 'slug']
    
    @property
    def mdimages(self):
        "List of image URLs referenced in the post's markdown."
        return _re_img.findall(self.raw_markdown)
    
    def save_images(self, dest_path, nb_path):
        "Download every image into `dest_path` and record its path relative to `nb_path` in `img_map`."
        for i,img in enumerate(self.mdimages):
            dest=Path(dest_path)/f'{i}_img'
            file_pth = urlsave(img, dest=dest)
            self.img_map[img] = str(file_pth.relative_to(nb_path))
            
    def _replace_images(self, md):
        "Return `md` with each URL in `img_map` replaced by its local path."
        # Literal replacement: URLs routinely contain regex metacharacters (`?`, `+`, `.`),
        # and local paths may contain backslashes, so `re.sub` is not safe here.
        for remote,local in self.img_map.items():
            md = md.replace(remote, local)
        return md
              
    @property
    def draft(self) -> str: 
        # NOTE(review): the REST api exposes the publication state as `status`; the front matter
        # rendered in this notebook contains no `draft:` line, so this lookup apparently never
        # succeeds and falls through to `__getattr__`. Confirm the intended key before changing it.
        return str(self.post.draft != 'publish').lower()
    
    @property
    def title(self) -> str:
        "The rendered post title, or `None` when the post has no title."
        title = self.post.get('title', None)
        return title.get('rendered', None) if title else title
    
    @property
    def image(self) -> str:
        "First entry of the `large` featured image, or `None` when there is none."
        img = self.post.get('uagb_featured_image_src', None)
        return first(img.get('large', [])) if img else img
    
    @property
    def frontmatter(self) -> str:
        "YAML front matter built from the truthy values of `_props`."
        fm = '---\n'
        for p in self._props:
            attr = getattr(self, p, None)
            if attr:
                # Escape backslashes and double quotes so the double-quoted YAML scalar stays valid.
                val = str(attr).replace('\\', '\\\\').replace('"', '\\"')
                fm+=f'{p}: "{val}"\n'
        return fm+'---\n'
    
    def __getattr__(self, name):
        "Fall back to the post's fields for unknown attributes; missing fields give `None`."
        # Read `post` from `__dict__` directly: going through `self.post` would re-enter
        # `__getattr__` and recurse forever when `post` has not been set yet.
        post = self.__dict__.get('post')
        if post is None: raise AttributeError(name)
        return post.get(name, None)
    
    @property
    def raw_markdown(self) -> str:
        "The post body converted from rendered HTML to markdown, without front matter."
        return md(self.post.content['rendered'])
    
    @property
    def markdown(self) -> str: 
        "Return the markdown representation of the post: front matter followed by the body."
        return self.frontmatter + self._replace_images(self.raw_markdown)

    def tomd(self, dest_path:str=None, dest_file:str=None, download=True) -> None:
        "Write markdown representation of wordpress post"
        if not dest_path: dest_path = '.'
        if not dest_file: dest_file = self.slug+'.md'
        p = Path(dest_path)/dest_file
        # Images go into a sibling folder `_<file stem>_data`; paths are stored relative to `dest_path`.
        if download: self.save_images(p.parent/f'_{p.stem}_data', nb_path=dest_path)
        print(f'Writing: {p}')
        p.write_text(self.markdown)

Instantiating a `WP` object will allow you to get access to useful properties that can render as front matter.

In [9]:
# Build a `WP` object from the public article URL and show its front matter.
_post = WP('https://outerbounds.com/blog/notebooks-in-production-with-metaflow/')
print(_post.frontmatter)

---
title: "Notebooks In Production With Metaflow"
date: "2022-02-09T22:59:06"
image: "https://outerbounds.com/wp-content/uploads/2022/02/Screen-Shot-2022-02-09-at-12.45.20-pm-1024x525.png"
slug: "notebooks-in-production-with-metaflow"
---



In [10]:
# Expected front matter for the sample post; properties without a value are omitted.
_result="""---
title: "Notebooks In Production With Metaflow"
date: "2022-02-09T22:59:06"
image: "https://outerbounds.com/wp-content/uploads/2022/02/Screen-Shot-2022-02-09-at-12.45.20-pm-1024x525.png"
slug: "notebooks-in-production-with-metaflow"
---
"""
test_eq(_post.frontmatter, _result)

The markdown representation is also available as a property (below we print the first 1,500 characters):

In [11]:
# Show only the first 1,500 characters of the rendered markdown to keep the output short.
print(_post.markdown[:1500])

---
title: "Notebooks In Production With Metaflow"
date: "2022-02-09T22:59:06"
image: "https://outerbounds.com/wp-content/uploads/2022/02/Screen-Shot-2022-02-09-at-12.45.20-pm-1024x525.png"
slug: "notebooks-in-production-with-metaflow"
---


By Hamel Husain


*Learn how to use notebooks in production ML workflows with a new Metaflow feature*


When building production-ready machine learning systems, it is critical to monitor the health and performance of those systems with reports and visualizations. Furthermore, allowing for rapid debugging and interactive introspection is critical when workflows fail or do unexpected things. Jupyter notebooks have often been a preferred tool of data scientists for these tasks of visualization, exploration, debugging, and rapid iteration.  Ironically, many production systems do not integrate appropriately with notebooks, which can significantly frustrate progress on these tasks.


Indeed, in the field of data science tooling, one of the most [hotly-co

### Getting Hidden Posts

A wordpress post may not be public (i.e. it might have a status other than `publish`).  In that case, you will need two pieces of information to retrieve the markdown content for that post. 

1. The url for the api.  This is `<your_site>/wp-json/wp/v2/posts`, for example `https://outerbounds.com/wp-json/wp/v2/posts`.  Note: _This is [the api route to retrieve a single WP post](https://developer.wordpress.org/rest-api/reference/posts/#retrieve-a-post)._

2. The `post id` you wish to convert to markdown. The post id can be extracted from wordpress edit url, for example the id for `https://outerbounds.com/wp-admin/post.php?post=220&action=edit` is `220`.

For example, we can get the contents of a post which has an id of `220` as follows:

In [12]:
# Load the same post by api route and post id instead of its public URL.
_post = WP(baseurl='https://outerbounds.com/wp-json/wp/v2/posts', post_id=220)
test_eq(_post.title, 'Notebooks In Production With Metaflow')

### Downloading Images

`wp2md` also downloads images for you, and puts the images in a folder named `_<name_of_markdown_file>_data/`

In [13]:
# The sample post is expected to reference five images in its markdown.
assert len(_post.mdimages) == 5

In [14]:
# With no arguments `tomd` writes `<slug>.md` into the current directory and
# downloads the images into `_<slug>_data/` next to it.
_post.tomd()
_data_dir = Path('_notebooks-in-production-with-metaflow_data/')
assert _data_dir.exists()
_data_dir.ls()

Writing: notebooks-in-production-with-metaflow.md


(#5) [Path('_notebooks-in-production-with-metaflow_data/4_img'),Path('_notebooks-in-production-with-metaflow_data/2_img'),Path('_notebooks-in-production-with-metaflow_data/3_img'),Path('_notebooks-in-production-with-metaflow_data/1_img'),Path('_notebooks-in-production-with-metaflow_data/0_img')]

The paths to downloaded images are also automatically replaced in the markdown file:

In [15]:
# The remote image URL appears in the raw markdown but must be gone from the written file.
_md = Path('notebooks-in-production-with-metaflow.md').read_text()
_img_url=r'https://lh4.googleusercontent.com/-8XLZezB4E7s64BQcu-KTZO0VWh4VyKXpNhVwzSqPSgrAC_3qu62OZ-fleSr5mGRqPYTcEm5ed_ZKx8o6W0o2JIQea1kCemhuX5cZMMZRRtumGi0yf0mIp3DJWvDzKGpUR9GP8ug'
assert _img_url in _post.raw_markdown
assert _img_url not in _md

In [16]:
#hide
#notest
# Remove the image folder and markdown file written by the `tomd` demo above.
!rm -rf _notebooks-in-production-with-metaflow_data/
!rm -rf notebooks-in-production-with-metaflow.md

## CLI Utility For Wordpress To Markdown

In [17]:
#export
@call_parse
def wp2md(url_or_id:Param('the public URL of the WP article OR the post id', str),
          apiurl:Param('the base url for the wordpress api to retrieve posts for your site.', str)='https://outerbounds.com/wp-json/wp/v2/posts',
          dest_path:Param('The path to save the markdown file to', str)='.',
          dest_file:Param('Name of destination markdown file. If not given defaults to the slug indicated in wordpress', str)=None,
          no_download:Param('Pass this flag to NOT download any images locally', store_true)=False,
         ):
    "Convert A wordpress post into markdown file with front matter."
    # An all-numeric argument is treated as a post id under `apiurl`; anything else as a public URL.
    wp_args = dict(baseurl=apiurl, post_id=url_or_id) if url_or_id.isnumeric() else dict(url=url_or_id)
    # `no_download` is an opt-out flag, so it is inverted for `tomd`'s `download` switch.
    WP(**wp_args).tomd(dest_path=dest_path, dest_file=dest_file, download=not no_download)