Skip to content

Commit

Permalink
Item2454: simplified transcoding
Browse files Browse the repository at this point in the history
git-svn-id: http://svn.foswiki.org/trunk/StringifierContrib@9701 0b4bb1d4-4e5a-0410-9cc4-b2b747904278
  • Loading branch information
MichaelDaum authored and MichaelDaum committed Oct 24, 2010
1 parent 8eb2cef commit a2a72fa
Show file tree
Hide file tree
Showing 24 changed files with 170 additions and 220 deletions.
5 changes: 3 additions & 2 deletions lib/Foswiki/Contrib/StringifierContrib.pm
Expand Up @@ -33,12 +33,13 @@ $magic = File::MMagic->new();

# Class method: returns the stringified content of a file.
#
# Parameters:
#   $class    - invocant; used to look up the handler via handler_for()
#   $filename - path of the file to stringify
#   $encoding - accepted for callers that pass it; not used in this body
#
# Returns whatever the selected handler's stringForFile() returns, or
# nothing (bare return) when the file is not readable.
sub stringFor {
    my ($class, $filename, $encoding) = @_;

    # Unreadable files produce no result at all.
    -r $filename or return;

    # The detected MIME type is handed to handler_for() together with the
    # file name; a fresh instance of the handler it returns does the work.
    my $mime    = $magic->checktype_filename($filename);
    my $handler = $class->handler_for($filename, $mime);
    my $self    = $handler->new();

    return $self->stringForFile($filename);
}

Expand Down
12 changes: 12 additions & 0 deletions lib/Foswiki/Contrib/StringifierContrib/Base.pm
Expand Up @@ -14,6 +14,8 @@

package Foswiki::Contrib::StringifierContrib::Base;
use strict;
use utf8;
use Encode ();

use Module::Pluggable (require => 1, search_path => [qw/Foswiki::Contrib::StringifierContrib::Plugins/]);

Expand Down Expand Up @@ -89,4 +91,14 @@ sub rmtree {
return 1;
}

# Converts extracted text to iso-8859-15 bytes and trims surrounding whitespace.
#
# Parameters:
#   $self - invocant (not used by this method)
#   $text - the text to convert; may be undef
#
# Returns the text with leading and trailing whitespace removed. Only strings
# that carry Perl's internal UTF-8 flag are re-encoded to iso-8859-15; any
# other string is passed through with just the trimming applied. An undefined
# $text yields the empty string, matching the '' returned by the plugins'
# failure paths.
sub fromUtf8 {
    my ( $self, $text ) = @_;

    # Nothing to transcode or trim; do not hand undef back to the caller.
    return '' unless defined $text;

    # CHECK is 0, so characters that iso-8859-15 cannot represent are
    # replaced by a substitution character instead of raising an error.
    $text = Encode::encode( "iso-8859-15", $text, 0 ) if utf8::is_utf8($text);

    # Trim after encoding, so the whitespace test runs on the final string.
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;

    return $text;
}

1;
5 changes: 1 addition & 4 deletions lib/Foswiki/Contrib/StringifierContrib/DEPENDENCIES
@@ -1,15 +1,12 @@
File::MMagic,>0,cpan,Required
Module::Pluggable,>0,cpan,Required
HTML::TreeBuilder,>0,cpan,Required
Spreadsheet::ParseExcel,>0,cpan,Required for =.xls= files
Spreadsheet::XLSX,>0,cpan,Required for =.xlsx= files
CharsetDetector,>0,cpan,Required
Encode,>0,cpan,Required
Error,>0,cpan,Required
ppthtml,>0,c,Required for indexing =.ppt= files. Part of xlhtml
pdftotext,>0,c,Required for indexing =.pdf=. Part of xpdf-utils
antiword,>0,c,One of antiword, abiword or wvWare is required for =.doc= files
abiword,>0,c,One of antiword, abiword or wvWare is required for =.doc= files
wvWare,>0,c,One of antiword, abiword or wvWare is required for =.doc= files
docx2txt,>0,perl,Required for =.docx= files. Available from http://sourceforge.net/projects/docx2txt/
pptx2txt,>0,perl,Required for =.pptx= files. Available from http://sourceforge.net/projects/pptx2txt/
html2text,>0,c,Required for indexing html files
51 changes: 51 additions & 0 deletions lib/Foswiki/Contrib/StringifierContrib/MANIFEST
Expand Up @@ -13,6 +13,57 @@ lib/Foswiki/Contrib/StringifierContrib/Plugins/Text.pm 0644
lib/Foswiki/Contrib/StringifierContrib/Plugins/XLS.pm 0644
lib/Foswiki/Contrib/StringifierContrib/Plugins/XLSX.pm 0644
lib/Foswiki/Contrib/StringifierContrib.pm 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.doc 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.docx 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.html 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.pdf 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.ppt 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.pptx 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.txt 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.xls 0644
test/unit/StringifierContrib/attachement_examples/Im_a_png.xlsx 0644
test/unit/StringifierContrib/attachement_examples/Passworded_example.doc 0644
test/unit/StringifierContrib/attachement_examples/Passworded_example.docx 0644
test/unit/StringifierContrib/attachement_examples/Passworded_example.ppt 0644
test/unit/StringifierContrib/attachement_examples/Passworded_example.pptx 0644
test/unit/StringifierContrib/attachement_examples/Passworded_example.xls 0644
test/unit/StringifierContrib/attachement_examples/Passworded_example.xlsx 0644
test/unit/StringifierContrib/attachement_examples/Portuguese_example.xls 0644
test/unit/StringifierContrib/attachement_examples/Portuguese_example.xlsx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example2.doc 0644
test/unit/StringifierContrib/attachement_examples/Simple_example2.docx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.doc 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.docm 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.docx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.dotm 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.dotx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.html 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.pdf 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.potm 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.potx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.ppsx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.ppt 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.pptx 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.txt 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.xls 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.xlsb 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.xlsm 0644
test/unit/StringifierContrib/attachement_examples/Simple_example.xlsx 0644
test/unit/StringifierContrib/Doc_abiwordTests.pm 0644
test/unit/StringifierContrib/Doc_antiwordTests.pm 0644
test/unit/StringifierContrib/Doc_wvTests.pm 0644
test/unit/StringifierContrib/DocxTests.pm 0644
test/unit/StringifierContrib/HtmlTests.pm 0644
test/unit/StringifierContrib/PdfTests.pm 0644
test/unit/StringifierContrib/PptTests.pm 0644
test/unit/StringifierContrib/PptxTests.pm 0644
test/unit/StringifierContrib/StringifierContribSuite.pm 0644
test/unit/StringifierContrib/StringifyBaseTest.pm 0644
test/unit/StringifierContrib/tree_example/sub_tree/test_file.txt 0644
test/unit/StringifierContrib/tree_example/test_file.txt 0644
test/unit/StringifierContrib/TxtTests.pm 0644
test/unit/StringifierContrib/XlsTests.pm 0644
test/unit/StringifierContrib/XlsxTests.pm 0644
tools/docx2txt.pl 0755
tools/pptx2txt.pl 0755
tools/stringify 0755
Expand Down
23 changes: 3 additions & 20 deletions lib/Foswiki/Contrib/StringifierContrib/Plugins/DOCX.pm
Expand Up @@ -16,9 +16,6 @@
package Foswiki::Contrib::StringifierContrib::Plugins::DOCX;
use Foswiki::Contrib::StringifierContrib::Base;
our @ISA = qw( Foswiki::Contrib::StringifierContrib::Base );
use File::Temp qw/tmpnam/;
use Encode;
use CharsetDetector;

# Command used to run docx2txt: the {StringifierContrib}{docx2txtCmd}
# configuration value when it is set to a true value, otherwise 'docx2txt.pl'.
my $docx2txt = $Foswiki::cfg{StringifierContrib}{docx2txtCmd} || 'docx2txt.pl';

Expand All @@ -31,24 +28,10 @@ sub stringForFile {
my ($self, $filename) = @_;

my $cmd = $docx2txt . ' %FILENAME|F% -';
my ($output, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);
my ($text, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

return '' unless ($exit == 0);

# encode text
my $text = "";
foreach( split( "\n", $output ) ){
my $charset = CharsetDetector::detect1($_);
my $aux_text = "";
if ($charset =~ "utf") {
$aux_text = encode("iso-8859-15", decode($charset, $_));
$aux_text = $_ unless($aux_text);
} else {
$aux_text = $_;
}
$text .= "\n" . $aux_text;
}
return $text;
return $self->fromUtf8($text);
}

1;
Expand Up @@ -16,7 +16,6 @@ package Foswiki::Contrib::StringifierContrib::Plugins::DOC_abiword;
use Foswiki::Contrib::StringifierContrib::Base;
our @ISA = qw( Foswiki::Contrib::StringifierContrib::Base );
use File::Temp qw/tmpnam/;
use Encode;
use Foswiki;

# Command used to run abiword: the {StringifierContrib}{abiwordCmd}
# configuration value when it is set to a true value, otherwise 'abiword'.
my $abiword = $Foswiki::cfg{StringifierContrib}{abiwordCmd} || 'abiword';
Expand Down
20 changes: 4 additions & 16 deletions lib/Foswiki/Contrib/StringifierContrib/Plugins/DOC_antiword.pm
Expand Up @@ -15,9 +15,6 @@
package Foswiki::Contrib::StringifierContrib::Plugins::DOC_antiword;
use Foswiki::Contrib::StringifierContrib::Base;
our @ISA = qw( Foswiki::Contrib::StringifierContrib::Base );
use File::Temp qw/tmpnam/;
use Encode;
use CharsetDetector;

# Command used to run antiword: the {StringifierContrib}{antiwordCmd}
# configuration value when it is set to a true value, otherwise 'antiword'.
my $antiword = $Foswiki::cfg{StringifierContrib}{antiwordCmd} || 'antiword';

Expand All @@ -33,23 +30,14 @@ sub stringForFile {
my ($self, $file) = @_;

my $cmd = $antiword . ' %FILENAME|F%';
my ($output, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $file);
my ($text, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $file);

return '' unless ($exit == 0);

# encode text
my $text = "";
foreach( split( "\n", $output ) ){
my $charset = CharsetDetector::detect1($_);
my $aux_text = "";
if ($charset =~ "utf") {
$aux_text = encode("iso-8859-15", decode($charset, $_));
$aux_text = $_ unless($aux_text);
} else {
$aux_text = $_;
}
$text .= "\n" . $aux_text;
}
$text = $self->fromUtf8($text);
$text =~ s/\n\s*?\n/\n/g;

return $text;
}

Expand Down
1 change: 1 addition & 0 deletions lib/Foswiki/Contrib/StringifierContrib/Plugins/DOC_wv.pm
Expand Up @@ -45,6 +45,7 @@ sub stringForFile {
$text = Foswiki::Contrib::StringifierContrib->stringFor($tmp_file);

# Deletes temp files (main html and images)
unlink($tmp_file);
$self->rmtree($tmp_dir);

return $text;
Expand Down
56 changes: 8 additions & 48 deletions lib/Foswiki/Contrib/StringifierContrib/Plugins/HTML.pm
Expand Up @@ -16,8 +16,8 @@
package Foswiki::Contrib::StringifierContrib::Plugins::HTML;
use Foswiki::Contrib::StringifierContrib::Base;
our @ISA = qw( Foswiki::Contrib::StringifierContrib::Base );
use Encode;
use CharsetDetector;

# Command used to run html2text: the {StringifierContrib}{html2text}
# configuration value when it is set to a true value, otherwise 'html2text'.
# NOTE(review): the other plugins read keys ending in "Cmd" (docx2txtCmd,
# antiwordCmd, ...); confirm that {html2text} is the intended key name.
my $html2text = $Foswiki::cfg{StringifierContrib}{html2text} || 'html2text';

# Presumably registers this package as the handler for the text/html MIME
# type and the .html extension; register_handler is defined outside this file.
__PACKAGE__->register_handler("text/html", ".html");

Expand All @@ -26,53 +26,13 @@ sub stringForFile {

# check it is a text file
return '' unless ( -T $filename );

try {
use HTML::TreeBuilder;
} catch Error with {
return '';
}

my $tree = HTML::TreeBuilder->new;
open(my $fh, "<", $filename) || return "";

my $text = "";
while (<$fh>) {
my $aux_text = "";
my $charset = CharsetDetector::detect1($_);
if ($charset =~ "utf") {
$aux_text = encode("iso-8859-15", decode("utf-8", $_));
$aux_text = $_ unless (defined($aux_text));
} else {
$aux_text = $_;
}

$text .= $aux_text;
}
close($fh);

$tree->parse($text);

$text = "";
for($tree->look_down(_tag => "meta")) {
next if $_->attr("http-equiv");
next unless $_->attr("value");

$text .= $_->attr("value");
$text .= " ";
}
for (@{$tree->extract_links("a")}) {

$text .= $_->[0];
$text .= " ";
}

$text .= $tree->as_text;
$text = encode("iso-8859-15", $text);

$tree->delete();

return $text;
my $cmd = $html2text . ' -ascii %FILENAME|F%';
my ($text, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

# encode text
$text =~ s/<\?xml.*?\?>\s*//g;
return $self->fromUtf8($text);
}

1;
21 changes: 2 additions & 19 deletions lib/Foswiki/Contrib/StringifierContrib/Plugins/PPTX.pm
Expand Up @@ -16,9 +16,6 @@
package Foswiki::Contrib::StringifierContrib::Plugins::PPTX;
use Foswiki::Contrib::StringifierContrib::Base;
our @ISA = qw( Foswiki::Contrib::StringifierContrib::Base );
use File::Temp qw/tmpnam/;
use Encode;
use CharsetDetector;

# Command used to run pptx2txt: the {StringifierContrib}{pptx2txtCmd}
# configuration value when it is set to a true value, otherwise 'pptx2txt.pl'.
my $pptx2txt = $Foswiki::cfg{StringifierContrib}{pptx2txtCmd} || 'pptx2txt.pl';

Expand All @@ -31,24 +28,10 @@ sub stringForFile {
my ($self, $filename) = @_;

my $cmd = $pptx2txt . ' %FILENAME|F% -';
my ($output, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);
my ($text, $exit) = Foswiki::Sandbox->sysCommand($cmd, FILENAME => $filename);

return '' unless ($exit == 0);

# encode text
my $text = "";
foreach( split( "\n", $output ) ){
my $charset = CharsetDetector::detect1($_);
my $aux_text = "";
if ($charset =~ "utf") {
$aux_text = encode("iso-8859-15", decode($charset, $_));
$aux_text = $_ unless($aux_text);
} else {
$aux_text = $_;
}
$text .= "\n" . $aux_text;
}
return $text;
return $self->fromUtf8($text);
}

1;
33 changes: 10 additions & 23 deletions lib/Foswiki/Contrib/StringifierContrib/Plugins/Text.pm
Expand Up @@ -15,35 +15,22 @@
package Foswiki::Contrib::StringifierContrib::Plugins::Text;
use Foswiki::Contrib::StringifierContrib::Base;
our @ISA = qw( Foswiki::Contrib::StringifierContrib::Base );
use Encode;
use CharsetDetector;
use Encode ();

# Note: no handler registration is needed here, because this plugin is the default handler for stringification.

# Returns the content of a plain-text file, passed through fromUtf8().
#
# Parameters:
#   $self - handler instance; must provide fromUtf8()
#   $file - path of the file to read
#
# Returns '' when $file fails Perl's -T (text file) test, "" when the file
# cannot be opened, and otherwise the result of fromUtf8() applied to the
# whole file content.
sub stringForFile {
    my ( $self, $file ) = @_;

    # check it is a text file
    return '' unless ( -T $file );

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a pipe.
    open( my $in, '<', $file ) or return "";

    local $/ = undef;    # set to read to EOF
    my $text = <$in>;
    close($in);

    return $self->fromUtf8($text);
}
1;

0 comments on commit a2a72fa

Please sign in to comment.