Skip to content

Commit

Permalink
Improved Wordpress importer
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmerfield committed Feb 14, 2019
1 parent c2ce7d8 commit 6eea118
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 99 deletions.
82 changes: 42 additions & 40 deletions app/dashboard/routes/importer/sources/wordpress/index.js
Expand Up @@ -7,23 +7,25 @@ var log = require("single-line-log").stdout;
var Item = require("./item");

if (require.main === module) {
var args = process.argv.slice(2);
var outputDirectory = args.pop();
var sourceFiles = args;
var options = {};
var outputDirectory = process.argv[3];
var sourceFile = process.argv[2];

if (!outputDirectory || !sourceFiles.length) {
options.filter = process.argv[4];

if (!outputDirectory || !sourceFile.length) {
console.log(
"Please pass XML export file(s) to convert and directory to output result:"
"Please pass XML export file to convert and directory to output result:"
);
return console.log(
"node index.js export.xml [other-export.xml] output-directory"
"node index.js export.xml output-directory [filter-item-by-title]"
);
}

console.log(colors.dim("Starting Wordpress import from"), sourceFiles);
console.log(colors.dim("Starting Wordpress import from"), sourceFile);
console.log(colors.dim("Output directory:"), outputDirectory);

main(sourceFiles, outputDirectory, function(err) {
main(sourceFile, outputDirectory, options, function(err) {
if (err) throw err;

console.log();
Expand All @@ -32,45 +34,45 @@ if (require.main === module) {
});
}

function main(sourceFiles, outputDirectory, callback) {
function main(sourceFile, outputDirectory, options, callback) {
fs.emptyDirSync(outputDirectory);

async.eachSeries(
sourceFiles,
function(sourceFile, next) {
fs.readFile(sourceFile, "utf-8", function(err, xml) {
if (err) return next(err);
fs.readFile(sourceFile, "utf-8", function(err, xml) {
if (err) return callback(err);

parseXML(xml, function(err, result) {
if (err) return next(err);
parseXML(xml, function(err, result) {
if (err) return callback(err);

console.log();
console.log(result.rss.channel[0].title[0]);
console.log(colors.dim("Site URL:"), result.rss.channel[0].link[0]);
console.log(
colors.dim("Export Version"),
result.rss.channel[0]["wp:wxr_version"][0]
);
console.log();
console.log(result.rss.channel[0].title[0]);
console.log(colors.dim("Site URL:"), result.rss.channel[0].link[0]);
console.log(
colors.dim("Export Version"),
result.rss.channel[0]["wp:wxr_version"][0]
);

// If you want to see other properties available,
// log this to STDOUT
// console.log(result.rss.channel);
// If you want to see other properties available,
// log this to STDOUT
// console.log(result.rss.channel);

var totalItems = result.rss.channel[0].item.length;

async.eachOfSeries(
result.rss.channel[0].item,
function(item, index, done) {
log(colors.dim(++index + "/" + totalItems), item.title[0]);
Item(item, outputDirectory, done);
},
next
);
if (options.filter) {
result.rss.channel[0].item = result.rss.channel[0].item.filter(function(item){
return item.title[0].toLowerCase().indexOf(options.filter) > -1;
});
});
},
callback
);
}

var totalItems = result.rss.channel[0].item.length;

async.eachOfSeries(
result.rss.channel[0].item,
function(item, index, done) {
log(colors.dim(++index + "/" + totalItems), item.title[0]);
Item(item, outputDirectory, done);
},
callback
);
});
});
}

module.exports = main;
@@ -1,6 +1,23 @@
var Turndown = require("turndown");
var turndown = new Turndown();
var debug = require('debug')('blot:importer:wordpress:markdown');

module.exports = function(entry, callback) {

entry.content = require("../../../helper").to_markdown(entry.html);

debug();
debug();
debug('Input HTML:');
debug();
debug(entry.html);

entry.content = turndown.turndown(entry.html);

entry.content = entry.content.trim();

debug();
debug();
debug('Result:');
debug();
debug(entry.content);

callback(null, entry);
};
Expand Up @@ -3,6 +3,8 @@ var moment = require("moment");
var join = require("path").join;
var helper = require("../../../helper");
var determine_path = helper.determine_path;
var tidy = require("./tidy");

module.exports = function(item, output_directory) {
return function(callback) {
var entry = {};
Expand All @@ -29,9 +31,11 @@ module.exports = function(item, output_directory) {
determine_path(entry.title, entry.page, entry.draft, entry.dateStamp)
);

entry.tags = item.category.map(function(category) {
return category._;
});
if (item.category) {
entry.tags = item.category.map(function(category) {
return category._;
});
}

callback(null, entry);
};
Expand Down
4 changes: 2 additions & 2 deletions app/dashboard/routes/importer/sources/wordpress/item/index.js
@@ -1,8 +1,8 @@
var async = require("async");
var helper = require("../../../helper");
var extract_entry = require("./extract_entry");
var tidy_HTML = require("./tidy_HTML");
var convert_to_markdown = require("./convert_to_markdown");
var tidy = require('./tidy');

module.exports = function(item, output_directory, callback) {

Expand All @@ -18,7 +18,7 @@ module.exports = function(item, output_directory, callback) {
async.waterfall(
[
extract_entry(item, output_directory),
tidy_HTML,
tidy,
helper.download_pdfs,
helper.download_images,
convert_to_markdown,
Expand Down
@@ -0,0 +1,28 @@
var debug = require("debug")("blot:importer:wordpress:tidy_caption");
var cheerio = require("cheerio");

module.exports = function(html) {
var $ = cheerio.load(html);

debug($.html());

$("p")
.filter(function() {
return $(this).find("img").length;
})
.each(function(i, p) {
if (!$(this).text()) return;

$(this)
.find("a img")
.each(function(i, aWithImg) {
$('<p>' + $.html(aWithImg) + '</p>').insertBefore(p);
$(aWithImg).remove();
});

});

debug($.html());

return $.html();
};
90 changes: 90 additions & 0 deletions app/dashboard/routes/importer/sources/wordpress/item/tidy.js
@@ -0,0 +1,90 @@
var insert_video_embeds = require("../../../helper").insert_video_embeds;
var debug = require("debug")("blot:importer:wordpress:tidy");
var remove_inline_images = require("./remove_inline_images.js");

module.exports = function(entry, callback) {
var html = entry.html;

html = fix_missing_p_tags(html);
html = remove_caption(html);
html = remove_embed(html);
html = remove_inline_images(html);
html = insert_video_embeds(html);

entry.html = html;

return callback(null, entry);
};

// Strips Wordpress [caption ...] shortcode tags from an HTML string while
// keeping whatever the shortcode wrapped.
//
// html - the HTML string to clean
//
// Returns the string with every complete "[caption ...]" opening tag removed.
// "[/caption]" closing tags are only stripped when at least one opening tag
// was found. An opening tag that is never terminated by "]" is left in place.
function remove_caption(html) {
  var opening_index = html.indexOf("[caption");

  while (opening_index > -1) {
    var closing_index = html.indexOf("]", opening_index);

    // No closing bracket after this opening tag means there is none after
    // any later opening tag either. Stop here, otherwise the string would
    // never change and the loop would never end.
    if (closing_index === -1) break;

    html = html.slice(0, opening_index) + html.slice(closing_index + 1);
    html = html.split("[/caption]").join("");

    // Search again from the start: removing closing tags can join text
    // anywhere in the string.
    opening_index = html.indexOf("[caption");
  }

  return html;
}

// Strips Wordpress [embed ...] shortcode tags from an HTML string while
// keeping whatever the shortcode wrapped (typically the embedded URL).
//
// html - the HTML string to clean
//
// Returns the string with every complete "[embed ...]" opening tag removed.
// "[/embed]" closing tags are only stripped when at least one opening tag
// was found. An opening tag that is never terminated by "]" is left in place.
function remove_embed(html) {
  var opening_index = html.indexOf("[embed");

  while (opening_index > -1) {
    var closing_index = html.indexOf("]", opening_index);

    // No closing bracket after this opening tag means there is none after
    // any later opening tag either. Stop here, otherwise the string would
    // never change and the loop would never end.
    if (closing_index === -1) break;

    html = html.slice(0, opening_index) + html.slice(closing_index + 1);
    html = html.split("[/embed]").join("");

    // Search again from the start: removing closing tags can join text
    // anywhere in the string.
    opening_index = html.indexOf("[embed");
  }

  return html;
}

// Wraps blank-line-separated chunks of text in <p> tags when the HTML has
// no paragraph tags of its own.
//
// html - the HTML string to inspect
//
// Returns the HTML with line endings normalized to "\n". If the input
// already contains a closing </p> tag, or contains no double line break,
// nothing else is changed. Otherwise every chunk separated by a double line
// break is wrapped in <p>...</p> and the chunks are re-joined with "\n\n".
function fix_missing_p_tags(html) {
  // Normalize Windows (\r\n) and lone carriage return (\r) line endings to a
  // single \n. Treating the \r of a \r\n pair as a line break of its own
  // would turn every single line break into a paragraph break and every
  // paragraph break into an empty paragraph.
  html = html.replace(/\r\n?/g, "\n");

  // Check for the closing tag instead of the
  // opening tag to avoid matching <p> and <p id="..."> etc...
  var has_p_tag = html.indexOf("</p>") > -1;
  var doesnt_have_double_line_break = html.indexOf("\n\n") === -1;

  if (has_p_tag || doesnt_have_double_line_break) {
    if (has_p_tag)
      debug(
        "Not interserting missing <p> tags into HTML because it already has p tags"
      );
    if (doesnt_have_double_line_break)
      debug(
        "Not interserting missing <p> tags into HTML because it does not have double line breaks"
      );

    debug(JSON.stringify(html));

    return html;
  }

  return html
    .split("\n\n")
    .map(function(chunk) {
      return "<p>" + chunk + "</p>";
    })
    .join("\n\n");
}
51 changes: 0 additions & 51 deletions app/dashboard/routes/importer/sources/wordpress/item/tidy_HTML.js

This file was deleted.

0 comments on commit 6eea118

Please sign in to comment.