Skip to content

Commit

Permalink
Detect encoding mismatch.
Browse files Browse the repository at this point in the history
  • Loading branch information
pipacs committed May 9, 2012
1 parent b2aa99a commit 0ff8709
Showing 1 changed file with 30 additions and 1 deletion.
31 changes: 30 additions & 1 deletion backend/book.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,12 @@ int Book::partFromUrl(const QString &url) {
}

void Book::fixEncodings() {
const qint64 MaxSample = 25 * 80;

foreach (QString key, content_.keys()) {
QString xmlEncoding;
QString htmlEncoding;

// Don't touch non-HTML content items
if (content_[key].mediaType != QString("application/xhtml+xml")) {
continue;
Expand All @@ -680,7 +685,31 @@ void Book::fixEncodings() {
qWarning() << "Book::fixEncodings: Part" << fileName << "doesn't exist";
continue;
}
// An unreadable part must not abort the whole scan: report it and move
// on to the next content item, exactly like the missing-file case.
if (!file.open(QIODevice::ReadOnly)) {
    qWarning() << "Book::fixEncodings: Could not open" << fileName;
    continue;
}
// Only the first MaxSample bytes are inspected. The sample is lower-cased
// so that both the pattern matching and the final comparison below are
// case-insensitive ("UTF-8" and "utf-8" compare equal).
QString header = QString::fromUtf8(file.read(MaxSample).data()).toLower();
file.close();

// Get XML encoding
// <?xml version="1.0" encoding="UTF-8"?>
QRegExp xmlMeta("<\\?xml\\s+.*encoding\\s*=\\s*[\"'](.*)[\"']\\s*\\?>");
// Non-greedy matching keeps the capture from running past the closing quote.
xmlMeta.setMinimal(true);
if (xmlMeta.indexIn(header) != -1) {
    xmlEncoding = xmlMeta.cap(1);
}

// Get HTML encoding
// <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
QRegExp htmlMeta("<meta\\s+http-equiv\\s*=\\s*[\"']content-type[\"']\\s*content\\s*=\\s*[\"'].*;\\s*charset=(.*)[\"']\\s*/?>");
htmlMeta.setMinimal(true);
if (htmlMeta.indexIn(header) != -1) {
    htmlEncoding = htmlMeta.cap(1);
}

// FIXME: A mismatch is only detected and logged here; nothing is fixed yet.
// A part without an HTML charset declaration is never reported. A part with
// an HTML charset but no XML declaration is reported, with an empty XML value.
if (!htmlEncoding.isEmpty() && (htmlEncoding != xmlEncoding)) {
    qWarning() << "Book::fixEncodings: Encoding mismatch in" << content_[key].href << ": XML" << xmlEncoding << "HTML" << htmlEncoding;
}
}
}

0 comments on commit 0ff8709

Please sign in to comment.