/
soup.py
85 lines (53 loc) · 2.27 KB
/
soup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
https://beautiful-soup-4.readthedocs.org/en/latest/
from bs4 import BeautifulSoup
#Create a soup object to traverse XML
soup = BeautifulSoup(open("wordpress.xml"), "xml")
items = soup.find_all('content:encoded')
#items = soup.find_all('item')
#thingers = items[45].contents
#content = items.find_all('content')
print(items)
# navigate through each item tag
for item in items:
post = item.find_all("content:encoded")
print(post)
"""
"""
Initial code drawn from: https://code.activestate.com/recipes/551792-convert-wordpress-export-file-to-multiple-html-fil/
"""
import string, os, sys, getopt
from xml.dom import minidom
from jekyll import post_create
import urllib
from unidecode import unidecode
# Convert a WordPress XML export ("wordpress.xml") into a list of post
# dictionaries, download every attachment referenced by the export, and hand
# each post to post_create().

# urlretrieve lives in urllib.request on Python 3 and directly in urllib on
# Python 2; resolve it once so the download step works on either version.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve


def _first_child(item, tag_name):
    """Return the first child node of the first <tag_name> element in *item*.

    Returns None when *item* contains no such element, or when the element
    exists but is empty (has no child nodes).
    """
    elements = item.getElementsByTagName(tag_name)
    if not elements:
        return None
    return elements[0].firstChild


def _download_attachment(url_node, img_dir='root/downloads/'):
    """Download the file whose URL is stored in *url_node* into *img_dir*.

    The local file name is the part of the URL after its last "/".
    *img_dir* is created first if it does not exist yet.
    """
    img_url = unidecode(url_node.data)
    file_name = img_url[img_url.rfind("/") + 1:]
    if not os.path.isdir(img_dir):
        os.makedirs(img_dir)
    urlretrieve(img_url, img_dir + file_name)


dom = minidom.parse("wordpress.xml")
blog = []  # list that will contain all posts

for node in dom.getElementsByTagName('item'):
    # Only work on posts: an <item> counts as a post when both its <title>
    # and its <content:encoded> elements are non-empty. Everything else is
    # metadata (or an attachment, handled below).
    title_node = _first_child(node, 'title')
    text_node = _first_child(node, 'content:encoded')

    if title_node is not None and text_node is not None:
        # pubDate, dc:creator and wp:post_id are required for a post; a
        # missing or empty one fails right here rather than producing a
        # half-filled post.
        post = {
            "title": title_node.data,
            "date": _first_child(node, 'pubDate').data,
            "author": _first_child(node, 'dc:creator').data,
            "id": _first_child(node, 'wp:post_id').data,
            "text": text_node.data,
        }

        # Get the categories, dropping <category> elements whose
        # "nicename" attribute is absent or empty.
        nicenames = (
            subnode.getAttribute('nicename')
            for subnode in node.getElementsByTagName('category')
        )
        post["categories"] = [name for name in nicenames if name != '']

        # Add post to the list of all posts
        blog.append(post)
    else:
        # Not a post: if the item carries an attachment URL, fetch the file.
        # An empty <wp:attachment_url> element has nothing to download.
        attachment_node = _first_child(node, 'wp:attachment_url')
        if attachment_node is not None:
            _download_attachment(attachment_node)

# NOTE(review): post_create comes from the local jekyll module; presumably it
# writes one Jekyll post per dictionary -- confirm against that module.
for page in blog:
    post_create(page)