diff --git a/app/dashboard/routes/importer/sources/wordpress/index.js b/app/dashboard/routes/importer/sources/wordpress/index.js index fe7824278d7..ab8868210cb 100644 --- a/app/dashboard/routes/importer/sources/wordpress/index.js +++ b/app/dashboard/routes/importer/sources/wordpress/index.js @@ -7,23 +7,25 @@ var log = require("single-line-log").stdout; var Item = require("./item"); if (require.main === module) { - var args = process.argv.slice(2); - var outputDirectory = args.pop(); - var sourceFiles = args; + var options = {}; + var outputDirectory = process.argv[3]; + var sourceFile = process.argv[2]; - if (!outputDirectory || !sourceFiles.length) { + options.filter = process.argv[4]; + + if (!outputDirectory || !sourceFile.length) { console.log( - "Please pass XML export file(s) to convert and directory to output result:" + "Please pass XML export file to convert and directory to output result:" ); return console.log( - "node index.js export.xml [other-export.xml] output-directory" + "node index.js export.xml output-directory [filter-item-by-title]" ); } - console.log(colors.dim("Starting Wordpress import from"), sourceFiles); + console.log(colors.dim("Starting Wordpress import from"), sourceFile); console.log(colors.dim("Output directory:"), outputDirectory); - main(sourceFiles, outputDirectory, function(err) { + main(sourceFile, outputDirectory, options, function(err) { if (err) throw err; console.log(); @@ -32,45 +34,45 @@ if (require.main === module) { }); } -function main(sourceFiles, outputDirectory, callback) { +function main(sourceFile, outputDirectory, options, callback) { fs.emptyDirSync(outputDirectory); - async.eachSeries( - sourceFiles, - function(sourceFile, next) { - fs.readFile(sourceFile, "utf-8", function(err, xml) { - if (err) return next(err); + fs.readFile(sourceFile, "utf-8", function(err, xml) { + if (err) return callback(err); - parseXML(xml, function(err, result) { - if (err) return next(err); + parseXML(xml, function(err, result) { + 
if (err) return callback(err); - console.log(); - console.log(result.rss.channel[0].title[0]); - console.log(colors.dim("Site URL:"), result.rss.channel[0].link[0]); - console.log( - colors.dim("Export Version"), - result.rss.channel[0]["wp:wxr_version"][0] - ); + console.log(); + console.log(result.rss.channel[0].title[0]); + console.log(colors.dim("Site URL:"), result.rss.channel[0].link[0]); + console.log( + colors.dim("Export Version"), + result.rss.channel[0]["wp:wxr_version"][0] + ); - // If you want to see other properties available, - // log this to STDOUT - // console.log(result.rss.channel); + // If you want to see other properties available, + // log this to STDOUT + // console.log(result.rss.channel); - var totalItems = result.rss.channel[0].item.length; - - async.eachOfSeries( - result.rss.channel[0].item, - function(item, index, done) { - log(colors.dim(++index + "/" + totalItems), item.title[0]); - Item(item, outputDirectory, done); - }, - next - ); + if (options.filter) { + result.rss.channel[0].item = result.rss.channel[0].item.filter(function(item){ + return item.title[0].toLowerCase().indexOf(options.filter) > -1; }); - }); - }, - callback - ); + } + + var totalItems = result.rss.channel[0].item.length; + + async.eachOfSeries( + result.rss.channel[0].item, + function(item, index, done) { + log(colors.dim(++index + "/" + totalItems), item.title[0]); + Item(item, outputDirectory, done); + }, + callback + ); + }); + }); } module.exports = main; diff --git a/app/dashboard/routes/importer/sources/wordpress/item/convert_to_markdown.js b/app/dashboard/routes/importer/sources/wordpress/item/convert_to_markdown.js index 397b51e78c3..46455bf0b62 100644 --- a/app/dashboard/routes/importer/sources/wordpress/item/convert_to_markdown.js +++ b/app/dashboard/routes/importer/sources/wordpress/item/convert_to_markdown.js @@ -1,6 +1,23 @@ +var Turndown = require("turndown"); +var turndown = new Turndown(); +var debug = 
require('debug')('blot:importer:wordpress:markdown'); + module.exports = function(entry, callback) { - - entry.content = require("../../../helper").to_markdown(entry.html); - + debug(); + debug(); + debug('Input HTML:'); + debug(); + debug(entry.html); + + entry.content = turndown.turndown(entry.html); + + entry.content = entry.content.trim(); + + debug(); + debug(); + debug('Result:'); + debug(); + debug(entry.content); + callback(null, entry); }; diff --git a/app/dashboard/routes/importer/sources/wordpress/item/extract_entry.js b/app/dashboard/routes/importer/sources/wordpress/item/extract_entry.js index 8dcf524878c..87c89d54635 100644 --- a/app/dashboard/routes/importer/sources/wordpress/item/extract_entry.js +++ b/app/dashboard/routes/importer/sources/wordpress/item/extract_entry.js @@ -3,6 +3,8 @@ var moment = require("moment"); var join = require("path").join; var helper = require("../../../helper"); var determine_path = helper.determine_path; +var tidy = require("./tidy"); + module.exports = function(item, output_directory) { return function(callback) { var entry = {}; @@ -29,9 +31,11 @@ module.exports = function(item, output_directory) { determine_path(entry.title, entry.page, entry.draft, entry.dateStamp) ); - entry.tags = item.category.map(function(category) { - return category._; - }); + if (item.category) { + entry.tags = item.category.map(function(category) { + return category._; + }); + } callback(null, entry); }; diff --git a/app/dashboard/routes/importer/sources/wordpress/item/index.js b/app/dashboard/routes/importer/sources/wordpress/item/index.js index 51aa0a3bd99..c4a2acbc2fb 100644 --- a/app/dashboard/routes/importer/sources/wordpress/item/index.js +++ b/app/dashboard/routes/importer/sources/wordpress/item/index.js @@ -1,8 +1,8 @@ var async = require("async"); var helper = require("../../../helper"); var extract_entry = require("./extract_entry"); -var tidy_HTML = require("./tidy_HTML"); var convert_to_markdown = 
require("./convert_to_markdown"); +var tidy = require('./tidy'); module.exports = function(item, output_directory, callback) { @@ -18,7 +18,7 @@ module.exports = function(item, output_directory, callback) { async.waterfall( [ extract_entry(item, output_directory), - tidy_HTML, + tidy, helper.download_pdfs, helper.download_images, convert_to_markdown, diff --git a/app/dashboard/routes/importer/sources/wordpress/item/remove_inline_images.js b/app/dashboard/routes/importer/sources/wordpress/item/remove_inline_images.js new file mode 100644 index 00000000000..c416e8eb3b0 --- /dev/null +++ b/app/dashboard/routes/importer/sources/wordpress/item/remove_inline_images.js @@ -0,0 +1,28 @@ +var debug = require("debug")("blot:importer:wordpress:tidy_caption"); +var cheerio = require("cheerio"); + +module.exports = function(html) { + var $ = cheerio.load(html); + + debug($.html()); + + $("p") + .filter(function() { + return $(this).find("img").length; + }) + .each(function(i, p) { + if (!$(this).text()) return; + + $(this) + .find("a img") + .each(function(i, aWithImg) { + $('

' + $.html(aWithImg) + '

').insertBefore(p); + $(aWithImg).remove(); + }); + + }); + + debug($.html()); + + return $.html(); +}; diff --git a/app/dashboard/routes/importer/sources/wordpress/item/tidy.js b/app/dashboard/routes/importer/sources/wordpress/item/tidy.js new file mode 100644 index 00000000000..6dbc1ca505e --- /dev/null +++ b/app/dashboard/routes/importer/sources/wordpress/item/tidy.js @@ -0,0 +1,90 @@ +var insert_video_embeds = require("../../../helper").insert_video_embeds; +var debug = require("debug")("blot:importer:wordpress:tidy"); +var remove_inline_images = require("./remove_inline_images.js"); + +module.exports = function(entry, callback) { + var html = entry.html; + + html = fix_missing_p_tags(html); + html = remove_caption(html); + html = remove_embed(html); + html = remove_inline_images(html); + html = insert_video_embeds(html); + + entry.html = html; + + return callback(null, entry); +}; + +function remove_caption(html) { + while (html.indexOf("[caption") > -1) { + var opening_index = html.indexOf("[caption"); + var remainder = html.slice(opening_index); + var closing_index = remainder.indexOf("]"); + + html = + html.slice(0, opening_index) + + html.slice(opening_index + closing_index + 1); + html = html.split("[/caption]").join(""); + } + + return html; +} + +function remove_embed(html) { + while (html.indexOf("[embed") > -1) { + var opening_index = html.indexOf("[embed"); + var remainder = html.slice(opening_index); + var closing_index = remainder.indexOf("]"); + + html = + html.slice(0, opening_index) + + html.slice(opening_index + closing_index + 1); + html = html.split("[/embed]").join(""); + } + + return html; +} + +function fix_missing_p_tags(html) { + // HTML created by Windows users contains \r instead of newlines + // which breaks the following code + html = html.split("\r").join("\n"); + + // Check for the closing tag instead of the + // opening tag to avoid matching

and

etc... + var has_p_tag = html.indexOf("

") > -1; + var doesnt_have_double_line_break = html.indexOf("\n\n") === -1; + + if (has_p_tag || doesnt_have_double_line_break) { + if (has_p_tag) + debug( + "Not interserting missing

tags into HTML because it already has p tags" + ); + if (doesnt_have_double_line_break) + debug( + "Not interserting missing

tags into HTML because it does not have double line breaks" + ); + + debug(JSON.stringify(html)); + + return html; + } + + // console.log('! Warning, replacing missing

tags.') + // console.log('---- BEFORE'); + // console.log(html); + // console.log('----'); + + html = html.split("\n\n"); + html = html.map(function(line) { + return "

" + line + "

"; + }); + html = html.join("\n\n"); + + // console.log('---- AFTER'); + // console.log(html); + // console.log('----'); + + return html; +} diff --git a/app/dashboard/routes/importer/sources/wordpress/item/tidy_HTML.js b/app/dashboard/routes/importer/sources/wordpress/item/tidy_HTML.js deleted file mode 100644 index 36f59da2053..00000000000 --- a/app/dashboard/routes/importer/sources/wordpress/item/tidy_HTML.js +++ /dev/null @@ -1,51 +0,0 @@ -var insert_video_embeds = require('../../../helper').insert_video_embeds; - -module.exports = function(entry, callback) { - - entry.html = fix_missing_p_tags(entry.html); - entry.html = remove_caption(entry.html); - entry.html = insert_video_embeds(entry.html); - - callback(null, entry); -}; - -function remove_caption(html) { - while (html.indexOf("[caption") > -1) { - var opening_index = html.indexOf("[caption"); - var remainder = html.slice(opening_index); - var closing_index = remainder.indexOf("]"); - - html = - html.slice(0, opening_index) + - html.slice(opening_index + closing_index + 1); - html = html.split("[/caption]").join(""); - } - - return html; -} - -function fix_missing_p_tags(html) { - // Check for the closing tag instead of the - // opening tag to avoid matching

and

etc... - var has_p_tag = html.indexOf("

") > -1; - var doesnt_have_double_line_break = html.indexOf("\n\n") === -1; - - if (has_p_tag || doesnt_have_double_line_break) return html; - - // console.log('! Warning, replacing missing

tags.') - // console.log('---- BEFORE'); - // console.log(html); - // console.log('----'); - - html = html.split("\n\n"); - html = html.map(function(line) { - return "

" + line + "

"; - }); - html = html.join("\n\n"); - - // console.log('---- AFTER'); - // console.log(html); - // console.log('----'); - - return html; -}