Skip to content

Commit

Permalink
Item14484: make html2text converter configurable
Browse files Browse the repository at this point in the history
also:

* properly create temp files using File::Temp
* improved XLSX conversion either using a perl script, xlsx2csv or soffice
* fixed PPT converter
  • Loading branch information
MichaelDaum committed Sep 18, 2017
1 parent 72b121e commit dfabdfc
Show file tree
Hide file tree
Showing 23 changed files with 408 additions and 134 deletions.
29 changes: 16 additions & 13 deletions data/System/StringifierContrib.txt
Expand Up @@ -10,27 +10,28 @@ by full-text search engines such as Foswiki:Extensions/KinoSearchContrib or Fosw

It supports all major office document formats such as:

* =.html=
* =.xml=
* =.doc=
* =.docx=
* =.xls=
* =.xlsx=
* =.ppt=
* =.pptx=
* =.pdf=
* =.odt=
* =.ott=
* =.html=
* =.odp=
* =.otp=
* =.ods=
* =.odt=
* =.otp=
* =.ots=
* =.sxw=
* =.ott=
* =.pdf=
* =.ppt=
* =.pptx=
* =.stc=
* =.sti=
* =.stw=
* =.sxc=
* =.stc=
* =.sxi=
* =.sti=
* =.sxw=
* =.txt=
* =.xls=
* =.xlsx=
* =.xml=

%TOPIC% is organized in plugins that serialize a document format by delegating the work to the appropriate backend.
For some formats there are alternative backends to choose from. For example a DOC file can be serialized
Expand Down Expand Up @@ -139,6 +140,8 @@ Foswiki:Development/UnitTests for more information on unit testing.
%$DEPENDENCIES%

---++ Change History
| 18 Sep 2017: | (5.00) make html-to-text converter pluggable |
| 31 Jan 2017: | (4.40) improved XLSX stringifier |
| 23 Jan 2017: | (4.30) added stringifier to index XLS using soffice |
| 18 Oct 2015: | (4.20) removed dependency on File::MMagic; now using extension-based mime detection |
| 01 Oct 2015: | (4.10) don't default to pass-through for non-supported document types; fixed unit tests |
Expand Down
14 changes: 8 additions & 6 deletions lib/Foswiki/Contrib/Stringifier.pm
Expand Up @@ -44,20 +44,22 @@ sub _getMimeType {
return $mimeType;
}



sub stringFor {
my ($class, $filename) = @_;
my ($class, $filename, $mime) = @_;

return unless -r $filename;
my $mime = _getMimeType($filename);
$mime = _getMimeType($filename) unless defined $mime;

#print STDERR "no mime for $filename\n" unless $mime;
if ($Foswiki::cfg{StringifierContrib}{Debug}) {
print STDERR "StringifierContrib - no mime for $filename\n" unless $mime;
}
return unless $mime;

my $impl = $class->handler_for($filename, $mime);

#print STDERR "file $filename is a $mime ... using ".($impl||'undef')."\n";
if ($Foswiki::cfg{StringifierContrib}{Debug}) {
print STDERR "file $filename is a $mime ... using ".($impl||'undef')."\n";
}
return unless $impl;

my $plugin = $impl->new();
Expand Down
13 changes: 7 additions & 6 deletions lib/Foswiki/Contrib/Stringifier/Base.pm
Expand Up @@ -39,7 +39,8 @@ __PACKAGE__->plugins;
}
}
sub handler_for {
my ($self, $filename, $mime) = @_;
my ($this, $filename, $mime) = @_;

if (exists $mime_handlers{$mime}) { return $mime_handlers{$mime} }
$filename = lc($filename);
for my $spec (keys %extension_handlers) {
Expand All @@ -52,7 +53,7 @@ __PACKAGE__->plugins;
# This is a service method that a subclass can use to decide
# whether it wants to register or not.
sub _programExists {
my ($self, $program) = @_;
my ($this, $program) = @_;

# work around a bug in old File::Which that doesn't like absolute paths
return $program if -f $program;
Expand All @@ -64,20 +65,20 @@ __PACKAGE__->plugins;

sub new {
my ($handler) = @_;
my $self = bless {}, $handler;
my $this = bless {}, $handler;

$self;
return $this;
}

sub decode {
my ( $self, $string, $charSet ) = @_;
my ( $this, $string, $charSet ) = @_;

$charSet ||= $Foswiki::cfg{Site}{CharSet};
return Encode::decode( $charSet, $string );
}

sub encode {
my ( $self, $string, $charSet ) = @_;
my ( $this, $string, $charSet ) = @_;

$charSet ||= $Foswiki::cfg{Site}{CharSet};
return Encode::encode( $charSet, $string );
Expand Down
8 changes: 4 additions & 4 deletions lib/Foswiki/Contrib/Stringifier/Plugins/DOC_abiword.pm
Expand Up @@ -19,7 +19,7 @@ use warnings;

use Foswiki::Contrib::Stringifier::Base ();
our @ISA = qw( Foswiki::Contrib::Stringifier::Base );
use File::Temp qw/tmpnam/;
use File::Temp ();

my $abiword = $Foswiki::cfg{StringifierContrib}{abiwordCmd} || 'abiword';

Expand All @@ -34,18 +34,18 @@ if (defined($Foswiki::cfg{StringifierContrib}{WordIndexer}) &&

sub stringForFile {
my ($self, $file) = @_;
my $tmp_file = tmpnam() . ".txt";
my $tmpFile = File::Temp->new(SUFFIX =>".txt");

my $cmd = $abiword . ' --to=%TMPFILE|F% %FILENAME|F%';
my ($output, $exit, $error) = Foswiki::Sandbox->sysCommand($cmd, TMPFILE => $tmp_file, FILENAME => $file);
my ($output, $exit, $error) = Foswiki::Sandbox->sysCommand($cmd, TMPFILE => $tmpFile->filename, FILENAME => $file);

if ($exit) {
print STDERR "ERROR: $abiword returned with code $exit - $error\n";
return "";
}

my $in;
open($in, $tmp_file) or return "";
open($in, $tmpFile) or return "";
local $/ = undef; # set to read to EOF
my $text = <$in>;
close($in);
Expand Down
1 change: 0 additions & 1 deletion lib/Foswiki/Contrib/Stringifier/Plugins/DOC_catdoc.pm
Expand Up @@ -21,7 +21,6 @@ use Foswiki::Contrib::Stringifier::Base ();
use Foswiki::Contrib::Stringifier ();

our @ISA = qw( Foswiki::Contrib::Stringifier::Base );
use File::Temp qw/tmpnam/;

my $catdoc = $Foswiki::cfg{StringifierContrib}{catdocCmd} || 'catdoc';

Expand Down
10 changes: 4 additions & 6 deletions lib/Foswiki/Contrib/Stringifier/Plugins/DOC_wv.pm
Expand Up @@ -21,7 +21,7 @@ use Foswiki::Contrib::Stringifier::Base ();
use Foswiki::Contrib::Stringifier ();

our @ISA = qw( Foswiki::Contrib::Stringifier::Base );
use File::Temp qw/tmpnam/;
use File::Temp ();

my $wvText = $Foswiki::cfg{StringifierContrib}{wvTextCmd} || 'wvText';

Expand All @@ -37,21 +37,19 @@ if (defined($Foswiki::cfg{StringifierContrib}{WordIndexer}) &&
sub stringForFile {
my ($self, $file) = @_;

my $tmp_file = tmpnam() . ".txt";
my $tmpFile = File::Temp->new(SUFFIX => ".txt");

my $cmd = $wvText . ' %FILENAME|F% %TMPFILE|F%';
my ($output, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $file, TMPFILE => $tmp_file);
my ($output, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $file, TMPFILE => $tmpFile->filename);

return '' unless ($exit == 0);

my $in;
open($in, $tmp_file) or return "";
open($in, $tmpFile) or return "";
local $/ = undef; # set to read to EOF
my $text = <$in>;
close($in);

unlink($tmp_file);

$text = $self->decode($text);
$text =~ s/^\s+|\s+$//g;

Expand Down
49 changes: 49 additions & 0 deletions lib/Foswiki/Contrib/Stringifier/Plugins/HTML_html2text.pm
@@ -0,0 +1,49 @@
# Copyright (C) 2009-2017 Foswiki Contributors
#
# For licensing info read LICENSE file in the Foswiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, published at
# http://www.gnu.org/copyleft/gpl.html

package Foswiki::Contrib::Stringifier::Plugins::HTML_html2text;

use strict;
use warnings;

use Foswiki::Contrib::Stringifier::Base ();
our @ISA = qw( Foswiki::Contrib::Stringifier::Base );

# Converter executable; {StringifierContrib}{html2textCmd} overrides the default.
my $html2textCmd = $Foswiki::cfg{StringifierContrib}{html2textCmd} || 'html2text';

# html2text acts as the default HTML backend: it registers when {HtmlIndexer}
# is unset or names it explicitly, provided _programExists accepts the command.
{
    my $indexer = $Foswiki::cfg{StringifierContrib}{HtmlIndexer};
    my $wanted  = !defined($indexer) || $indexer eq 'html2text';

    __PACKAGE__->register_handler("text/html", ".html")
        if $wanted && __PACKAGE__->_programExists($html2textCmd);
}

# Convert the HTML file at $filename to plain text by running the configured
# html2text command through Foswiki::Sandbox.
#
# Parameters:
#   $self     - stringifier plugin instance; its decode() method is used
#   $filename - path of the HTML file to convert
#
# Returns the converter output after decode(), with any XML declaration and
# leading/trailing whitespace removed. Returns '' when the file does not exist
# or the converter exits with a non-zero status.
sub stringForFile {
    my ($self, $filename) = @_;

    # nothing to convert if the file is missing
    return '' unless -e $filename;

    # A configured command that already carries the %FILENAME|F% placeholder is
    # used verbatim; otherwise the default options are appended.
    my $cmd = $html2textCmd;
    $cmd .= " -nobs -utf8 %FILENAME|F%" unless $cmd =~ /%FILENAME\|F%/;

    my ($text, $exit, $error) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

    # Do not index whatever a failed converter run left on stdout.
    if ($exit) {
        print STDERR "ERROR: $html2textCmd returned with code $exit - "
          . (defined $error ? $error : '') . "\n";
        return '';
    }

    $text = $self->decode($text);
    $text =~ s/<\?xml.*?\?>\s*//g;
    $text =~ s/^\s+|\s+$//g;

    return $text;
}

1;
49 changes: 49 additions & 0 deletions lib/Foswiki/Contrib/Stringifier/Plugins/HTML_links.pm
@@ -0,0 +1,49 @@
# Copyright (C) 2009-2017 Foswiki Contributors
#
# For licensing info read LICENSE file in the Foswiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, published at
# http://www.gnu.org/copyleft/gpl.html

package Foswiki::Contrib::Stringifier::Plugins::HTML_links;

use strict;
use warnings;

use Foswiki::Contrib::Stringifier::Base ();
our @ISA = qw( Foswiki::Contrib::Stringifier::Base );

# Converter executable; {StringifierContrib}{linksCmd} overrides the default.
my $linksCmd = $Foswiki::cfg{StringifierContrib}{linksCmd} || 'links';

# Opt-in backend: registers only when {HtmlIndexer} explicitly names 'links'
# and _programExists accepts the command.
{
    my $indexer = $Foswiki::cfg{StringifierContrib}{HtmlIndexer};
    my $wanted  = defined($indexer) && $indexer eq 'links';

    __PACKAGE__->register_handler("text/html", ".html")
        if $wanted && __PACKAGE__->_programExists($linksCmd);
}

# Convert the HTML file at $filename to plain text by running the configured
# links command through Foswiki::Sandbox.
#
# Parameters:
#   $self     - stringifier plugin instance; its decode() method is used
#   $filename - path of the HTML file to convert
#
# Returns the converter output after decode(), with any XML declaration and
# leading/trailing whitespace removed. Returns '' when the file does not exist
# or the converter exits with a non-zero status.
sub stringForFile {
    my ($self, $filename) = @_;

    # nothing to convert if the file is missing
    return '' unless -e $filename;

    # A configured command that already carries the %FILENAME|F% placeholder is
    # used verbatim; otherwise the default options are appended.
    my $cmd = $linksCmd;
    $cmd .= " -dump %FILENAME|F%" unless $cmd =~ /%FILENAME\|F%/;

    my ($text, $exit, $error) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

    # Do not index whatever a failed converter run left on stdout.
    if ($exit) {
        print STDERR "ERROR: $linksCmd returned with code $exit - "
          . (defined $error ? $error : '') . "\n";
        return '';
    }

    $text = $self->decode($text);
    $text =~ s/<\?xml.*?\?>\s*//g;
    $text =~ s/^\s+|\s+$//g;

    return $text;
}

1;
49 changes: 49 additions & 0 deletions lib/Foswiki/Contrib/Stringifier/Plugins/HTML_lynx.pm
@@ -0,0 +1,49 @@
# Copyright (C) 2009-2017 Foswiki Contributors
#
# For licensing info read LICENSE file in the Foswiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, published at
# http://www.gnu.org/copyleft/gpl.html

package Foswiki::Contrib::Stringifier::Plugins::HTML_lynx;

use strict;
use warnings;

use Foswiki::Contrib::Stringifier::Base ();
our @ISA = qw( Foswiki::Contrib::Stringifier::Base );

# Converter executable; {StringifierContrib}{lynxCmd} overrides the default.
my $lynxCmd = $Foswiki::cfg{StringifierContrib}{lynxCmd} || 'lynx';

# Opt-in backend: registers only when {HtmlIndexer} explicitly names 'lynx'
# and _programExists accepts the command.
{
    my $indexer = $Foswiki::cfg{StringifierContrib}{HtmlIndexer};
    my $wanted  = defined($indexer) && $indexer eq 'lynx';

    __PACKAGE__->register_handler("text/html", ".html")
        if $wanted && __PACKAGE__->_programExists($lynxCmd);
}

# Convert the HTML file at $filename to plain text by running the configured
# lynx command through Foswiki::Sandbox.
#
# Parameters:
#   $self     - stringifier plugin instance; its decode() method is used
#   $filename - path of the HTML file to convert
#
# Returns the converter output after decode(), with any XML declaration and
# leading/trailing whitespace removed. Returns '' when the file does not exist
# or the converter exits with a non-zero status.
sub stringForFile {
    my ($self, $filename) = @_;

    # nothing to convert if the file is missing
    return '' unless -e $filename;

    # A configured command that already carries the %FILENAME|F% placeholder is
    # used verbatim; otherwise the default options are appended.
    my $cmd = $lynxCmd;
    $cmd .= " -dump %FILENAME|F%" unless $cmd =~ /%FILENAME\|F%/;

    my ($text, $exit, $error) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

    # Do not index whatever a failed converter run left on stdout.
    if ($exit) {
        print STDERR "ERROR: $lynxCmd returned with code $exit - "
          . (defined $error ? $error : '') . "\n";
        return '';
    }

    $text = $self->decode($text);
    $text =~ s/<\?xml.*?\?>\s*//g;
    $text =~ s/^\s+|\s+$//g;

    return $text;
}

1;
Expand Up @@ -12,25 +12,31 @@
# GNU General Public License for more details, published at
# http://www.gnu.org/copyleft/gpl.html

package Foswiki::Contrib::Stringifier::Plugins::HTML;
package Foswiki::Contrib::Stringifier::Plugins::HTML_w3m;

use strict;
use warnings;

use Foswiki::Contrib::Stringifier::Base ();
our @ISA = qw( Foswiki::Contrib::Stringifier::Base );

my $html2text = $Foswiki::cfg{StringifierContrib}{htmltotextCmd} || 'html2text';
my $w3mCmd = $Foswiki::cfg{StringifierContrib}{w3mCmd} || 'w3m';

__PACKAGE__->register_handler("text/html", ".html");
if (defined($Foswiki::cfg{StringifierContrib}{HtmlIndexer}) && $Foswiki::cfg{StringifierContrib}{HtmlIndexer} eq 'w3m'
&& __PACKAGE__->_programExists($w3mCmd))
{
__PACKAGE__->register_handler("text/html", ".html");
}

sub stringForFile {
my ($self, $filename) = @_;

# check it is a text file
return '' unless ( -e $filename );

my $cmd = $html2text . ' -nobs %FILENAME|F%';
my $cmd = $w3mCmd;
$cmd .= " -dump %FILENAME|F%" unless $cmd =~ /%FILENAME\|F%/;

my ($text, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

$text = $self->decode($text);
Expand Down

0 comments on commit dfabdfc

Please sign in to comment.