# Parsing Orange Express WordPress post data from XML to CSV

The purpose of this script is to extract the post data from a WordPress `.xml` export and save it as a `.csv` file.

Note: the data file `oranjeexpress.WordPress.2020-12-06.xml` was exported from WordPress on Dec 6, 2020.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup 
import re

In [2]:
# Read the whole WordPress export into memory as one string.
# errors='ignore' makes the decoder silently drop any bytes that are not
# valid UTF-8 instead of raising, so malformed characters are lost.
with open('./oranjeexpress.WordPress.2020-12-06.xml', 'r', encoding='utf8', errors='ignore') as f: 
    data = f.read() 

In [None]:
# Parse the export with Beautiful Soup's XML parser (rather than an HTML one).
soup = BeautifulSoup(data, "xml")

A quick look at the XML structure of a single post:

```
<item>
    <title>{title}</title>
    <link>{long url link}</link>
    <pubDate>{published date}</pubDate>
    <dc:creator>{author, `<![CDATA[OliviaDung]]>`}</dc:creator>
    <guid isPermaLink="false">{short url link}</guid>
    <content:encoded>{content, formatted}</content:encoded>
    
    <wp:post_id>267</wp:post_id>
    <wp:post_date><![CDATA[2012-02-07 00:00:00]]></wp:post_date>
    <category domain="post_tag" nicename="%e8%8d%b7%e8%98%ad%e5%82%b3%e7%b5%b1"><![CDATA[傳統]]></category>
    <category domain="category" nicename="{omitted}"><![CDATA[吃喝 &amp; 玩樂]]></category>
</item>
```

In [None]:
def remove_tags(txt):
    """Return ``str(txt)`` with every ``<...>`` span deleted.

    The argument is converted with ``str()`` first, so non-string inputs
    (parsed tag objects, numbers, ``None``) are accepted as well; ``None``
    therefore comes back as the literal string ``'None'``.
    """
    as_text = str(txt)
    # Non-greedy match: each tag is removed on its own, the text between
    # two tags is kept.
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', as_text)

In [None]:
# Tags whose content is retrieved from each post. <category> is handled
# separately below because the wanted entries are selected by attribute.
metadata = ['title', 'dc:creator', 'wp:post_id', 'wp:post_date']
# Values of the <category domain="..."> attribute to collect, one column each.
categoryDomAttr = ['post_tag', 'category']

def filterMetaDataPerPost(post):
    """
    Extract the targeted metadata from a single post.

    Parameters
    ----------
    post
        One post element; it is converted with ``str()`` and re-parsed, so
        anything whose string form is the post's markup works.

    Returns
    -------
    dict
        One entry per name in ``metadata`` holding the first matching tag as
        a string with the markup stripped by ``remove_tags``, plus one entry
        per name in ``categoryDomAttr`` holding a list of the stripped
        ``<category>`` values with that ``domain``.  A tag that is absent
        yields the string ``'None'`` because ``remove_tags`` stringifies
        whatever ``find`` returns.
    """
    # Name the parser explicitly.  Without it Beautiful Soup picks the "best"
    # installed HTML parser and emits GuessedAtParserWarning, so results can
    # differ between machines.  lxml is the one it prefers and is already
    # required by the "xml" parser used for the full export.
    miniSoup = BeautifulSoup(str(post), 'lxml')
    postData = {key: remove_tags(miniSoup.find(key)) for key in metadata}

    # special fix for <category> tags since attributes are involved
    for attr in categoryDomAttr:
        # '&amp;amp;' is a doubly escaped ampersand; collapse it back to '&'.
        # NOTE(review): presumably produced by the str() -> re-parse round
        # trip above; other fields are not unescaped here -- confirm.
        postData[attr] = [remove_tags(x).replace('&amp;amp;', '&')
                          for x in miniSoup.find_all('category', {'domain': attr})]

    return postData

In [None]:
# get data!

# Each <item> element of the export is one post.
posts = soup.find_all('item')
# Reduce every post to its metadata dict, then tabulate with the post id as index.
postsCleaned = [filterMetaDataPerPost(entry) for entry in posts]
df = pd.DataFrame(postsCleaned).set_index('wp:post_id')

In [None]:
# Spot-check the result: display 5 randomly sampled rows of the DataFrame.
df.sample(5)

In [None]:
# Save the DataFrame (index included) to CSV.
# The 'utf_8_sig' codec prepends a UTF-8 byte-order mark to the file --
# presumably so spreadsheet tools detect the encoding; confirm.
# NOTE(review): the ./data directory must already exist; it is not created here.
df.to_csv('./data/oe-wp-posts.csv', encoding='utf_8_sig')