Browse files

Initial release for CPAN

Original codebase by Sopan Shewale. Cleaned up for release
by Mike Doherty.
  • Loading branch information...
0 parents commit 3b03dee24e439416dd0c4b0ba7ec0600e890ab92 @doherty committed Nov 2, 2012
3 .gitignore
@@ -0,0 +1,3 @@
+.build
+_build
+Plucene-SearchEngine-Index-MSOffice-*
46 Build.PL
@@ -0,0 +1,46 @@
+
+use strict;
+use warnings;
+
+use Module::Build 0.3601;
+
+
+# Everything Module::Build needs to know about this distribution:
+# identity, authorship, licence and the three prerequisite phases.
+my %build_spec = (
+    module_name   => 'Plucene::SearchEngine::Index::MSOffice',
+    dist_name     => 'Plucene-SearchEngine-Index-MSOffice',
+    dist_version  => '0.001',
+    dist_abstract => 'a set of Plucene backends for indexing Microsoft Office formats',
+    dist_author   => [
+        'Sopan Shewale <sopan.shewale@gmail.com>',
+        'Mike Doherty <doherty@pythian.com>',
+    ],
+    license       => 'perl',
+
+    # Needed to run Build.PL itself.
+    configure_requires => {
+        'Module::Build' => '0.3601',
+    },
+
+    # Needed to build and test the distribution.
+    build_requires => {
+        'File::Find'    => 0,
+        'Module::Build' => '0.3601',
+        'Test::More'    => 0,
+    },
+
+    # Needed at run time.
+    requires => {
+        'File::Temp'                         => 0,
+        'IPC::Run3'                          => 0,
+        'Plucene::SearchEngine::Index::Base' => 0,
+        'Plucene::SearchEngine::Index::HTML' => 0,
+        'Plucene::SearchEngine::Index::Text' => 0,
+        'Spreadsheet::ParseExcel'            => 0,
+        'parent'                             => 0,
+        'perl'                               => '5.006',
+        'strict'                             => 0,
+        'warnings'                           => 0,
+    },
+    recommends => {},
+
+    recursive_test_files => 1,
+    script_files         => [],
+);
+
+# Construct the builder and write out the ./Build script in one step.
+Module::Build->new(%build_spec)->create_build_script;
5 Changes
@@ -0,0 +1,5 @@
+Revision history for Perl module {{$dist->name}}
+
+{{$NEXT}}
+ - First release to CPAN
+ - Original codebase by Sopan Shewale <sopan.shewale@gmail.com>
73 MANIFEST.SKIP
@@ -0,0 +1,73 @@
+
+#!start included /home/mike/perl5/perlbrew/perls/perl-5.16.1-threads/lib/5.16.1/ExtUtils/MANIFEST.SKIP
+# Avoid version control files.
+\bRCS\b
+\bCVS\b
+\bSCCS\b
+,v$
+\B\.svn\b
+\B\.git\b
+\B\.gitignore\b
+\b_darcs\b
+\B\.cvsignore$
+
+# Avoid VMS specific MakeMaker generated files
+\bDescrip.MMS$
+\bDESCRIP.MMS$
+\bdescrip.mms$
+
+# Avoid Makemaker generated and utility files.
+\bMANIFEST\.bak
+\bMakefile$
+\bblib/
+\bMakeMaker-\d
+\bpm_to_blib\.ts$
+\bpm_to_blib$
+\bblibdirs\.ts$ # 6.18 through 6.25 generated this
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\b_build/
+\bBuild.bat$
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
+
+# Avoid temp and backup files.
+~$
+\.old$
+\#$
+\b\.#
+\.bak$
+\.tmp$
+\.#
+\.rej$
+
+# Avoid OS-specific files/dirs
+# Mac OSX metadata
+\B\.DS_Store
+# Mac OSX SMB mount metadata files
+\B\._
+
+# Avoid Devel::Cover and Devel::CoverX::Covered files.
+\bcover_db\b
+\bcovered\b
+
+# Avoid MYMETA files
+^MYMETA\.
+#!end included /home/mike/perl5/perlbrew/perls/perl-5.16.1-threads/lib/5.16.1/ExtUtils/MANIFEST.SKIP
+
+# Avoid configuration metadata file
+^MYMETA\.
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\bBuild.bat$
+\b_build
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
+^MANIFEST\.SKIP
+
+# Avoid archives of this distribution
+\bPlucene-SearchEngine-Index-MSOffice-[\d\.\_]+
63 Makefile.PL
@@ -0,0 +1,63 @@
+
+use strict;
+use warnings;
+
+use 5.006;
+
+use ExtUtils::MakeMaker 6.30;
+
+
+
+# Distribution metadata and prerequisites passed to WriteMakefile().
+my %makefile_args = (
+    NAME     => 'Plucene::SearchEngine::Index::MSOffice',
+    DISTNAME => 'Plucene-SearchEngine-Index-MSOffice',
+    VERSION  => '0.001',
+    ABSTRACT => 'a set of Plucene backends for indexing Microsoft Office formats',
+    AUTHOR   => 'Sopan Shewale <sopan.shewale@gmail.com>, Mike Doherty <doherty@pythian.com>',
+    LICENSE  => 'perl',
+
+    CONFIGURE_REQUIRES => {
+        'Module::Build' => '0.3601',
+    },
+    BUILD_REQUIRES => {
+        'File::Find'    => 0,
+        'Module::Build' => '0.3601',
+        'Test::More'    => 0,
+    },
+    PREREQ_PM => {
+        'File::Temp'                         => 0,
+        'IPC::Run3'                          => 0,
+        'Plucene::SearchEngine::Index::Base' => 0,
+        'Plucene::SearchEngine::Index::HTML' => 0,
+        'Plucene::SearchEngine::Index::Text' => 0,
+        'Spreadsheet::ParseExcel'            => 0,
+        'parent'                             => 0,
+        'strict'                             => 0,
+        'warnings'                           => 0,
+    },
+
+    EXE_FILES => [],
+    test      => {
+        TESTS => 't/*.t',
+    },
+);
+
+# When the installed ExtUtils::MakeMaker is older than 6.56, drop
+# BUILD_REQUIRES and fold its entries into PREREQ_PM instead, keeping the
+# higher of the two version requirements for any module listed in both.
+if ( !eval { ExtUtils::MakeMaker->VERSION(6.56) } ) {
+    my $build_deps   = delete $makefile_args{BUILD_REQUIRES};
+    my $runtime_deps = $makefile_args{PREREQ_PM};
+    while ( my ( $module, $wanted ) = each %{$build_deps} ) {
+        next
+            if exists $runtime_deps->{$module}
+            && $wanted <= $runtime_deps->{$module};
+        $runtime_deps->{$module} = $wanted;
+    }
+}
+
+# Likewise, CONFIGURE_REQUIRES is only passed to MakeMaker 6.52 or newer.
+if ( !eval { ExtUtils::MakeMaker->VERSION(6.52) } ) {
+    delete $makefile_args{CONFIGURE_REQUIRES};
+}
+
+WriteMakefile(%makefile_args);
+
+
+
33 README
@@ -0,0 +1,33 @@
+NAME
+ Plucene::SearchEngine::Index::MSOffice - a set of Plucene backends for
+ indexing Microsoft Office formats
+
+VERSION
+ version 0.001
+
+DESCRIPTION
+ These plugins provide indexing capabilities for Microsoft Office formats
+ (doc, ppt, xls).
+
+AVAILABILITY
+ The latest version of this module is available from the Comprehensive
+ Perl Archive Network (CPAN). Visit <http://www.perl.com/CPAN/> to find a
+ CPAN site near you, or see
+ <https://metacpan.org/module/Plucene::SearchEngine::Index::MSOffice/>.
+
+BUGS AND LIMITATIONS
+ You can make new bug reports, and view existing ones, through the web
+ interface at <http://rt.cpan.org>.
+
+AUTHORS
+ * Sopan Shewale <sopan.shewale@gmail.com>
+
+ * Mike Doherty <doherty@pythian.com>
+
+COPYRIGHT AND LICENSE
+ This software is copyright (c) 2012 by Sopan Shewale
+ <sopan.shewale@gmail.com>.
+
+ This is free software; you can redistribute it and/or modify it under
+ the same terms as the Perl 5 programming language system itself.
+
34 README.mkdn
@@ -0,0 +1,34 @@
+# NAME
+
+Plucene::SearchEngine::Index::MSOffice - a set of Plucene backends for indexing Microsoft Office formats
+
+# VERSION
+
+version 0.001
+
+# DESCRIPTION
+
+These plugins provide indexing capabilities for Microsoft Office formats (doc, ppt, xls).
+
+# AVAILABILITY
+
+The latest version of this module is available from the Comprehensive Perl
+Archive Network (CPAN). Visit [http://www.perl.com/CPAN/](http://www.perl.com/CPAN/) to find a CPAN
+site near you, or see [https://metacpan.org/module/Plucene::SearchEngine::Index::MSOffice/](https://metacpan.org/module/Plucene::SearchEngine::Index::MSOffice/).
+
+# BUGS AND LIMITATIONS
+
+You can make new bug reports, and view existing ones, through the
+web interface at [http://rt.cpan.org](http://rt.cpan.org).
+
+# AUTHORS
+
+- Sopan Shewale <sopan.shewale@gmail.com>
+- Mike Doherty <doherty@pythian.com>
+
+# COPYRIGHT AND LICENSE
+
+This software is copyright (c) 2012 by Sopan Shewale <sopan.shewale@gmail.com>.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
8 dist.ini
@@ -0,0 +1,8 @@
+name = Plucene-SearchEngine-Index-MSOffice
+main_module = lib/Plucene/SearchEngine/Index/MSOffice.pod
+author = Sopan Shewale <sopan.shewale@gmail.com>
+author = Mike Doherty <doherty@pythian.com>
+license = Perl_5
+copyright_holder = Sopan Shewale <sopan.shewale@gmail.com>
+
+[@Author::DOHERTY]
39 lib/Plucene/SearchEngine/Index/DOC.pm
@@ -0,0 +1,39 @@
+package Plucene::SearchEngine::Index::DOC;
+use strict;
+use warnings;
+# VERSION
+# ABSTRACT: a Plucene backend for indexing Microsoft Word documents
+use parent qw(Plucene::SearchEngine::Index::Text);
+
+use IPC::Run3;
+use File::Temp;
+
+# Associate this backend with the 'application/doc' MIME type and the '.doc'
+# file extension (register_handler is inherited; presumably it records the
+# mapping used to pick a backend per file -- confirm against the Base class).
+# NOTE(review): the IANA-registered type for Word files is
+# 'application/msword'; confirm whether it should be registered as well.
+__PACKAGE__->register_handler('application/doc', '.doc');
+
+=head1 DESCRIPTION
+
+This backend analyzes a DOC file for its textual content (using C<antiword>).
+
+=head1 METHODS
+
+=head2 gather_data_from_file
+
+Overrides the method from L<Plucene::SearchEngine::Index::Text>
+to provide DOC parsing.
+
+=cut
+
+# Convert a Word document to plain text with antiword and pass the text on
+# to the parent (Text) backend for indexing.
+#
+# Arguments: $filename - path of the file to index.
+# Returns:   $self, or nothing when $filename does not end in ".doc".
+sub gather_data_from_file {
+    my ($self, $filename) = @_;
+    return unless $filename =~ m/\.doc$/;
+
+    # The File::Temp object deletes its file when it goes out of scope.
+    my $tmp_txt = File::Temp->new();
+    run3 ['antiword', $filename],
+        \undef,      # stdin is /dev/null
+        $tmp_txt,    # antiword's stdout goes to the temporary file
+        undef;       # inherit the parent's stderr
+
+    # Must be SUPER::, not a plain method call: a plain call re-enters this
+    # override, whose ".doc" guard rejects the temporary file name, so the
+    # converted text would never reach the indexer.
+    $self->SUPER::gather_data_from_file( $tmp_txt->filename );
+    return $self;
+}
+
+1;
7 lib/Plucene/SearchEngine/Index/MSOffice.pod
@@ -0,0 +1,7 @@
+# PODNAME: Plucene::SearchEngine::Index::MSOffice
+# ABSTRACT: a set of Plucene backends for indexing Microsoft Office formats
+# VERSION
+
+=head1 DESCRIPTION
+
+These plugins provide indexing capabilities for Microsoft Office formats (doc, ppt, xls).
56 lib/Plucene/SearchEngine/Index/PPT.pm
@@ -0,0 +1,56 @@
+package Plucene::SearchEngine::Index::PPT;
+use strict;
+use warnings;
+# VERSION
+# ABSTRACT: a Plucene backend for indexing Microsoft Powerpoint presentations
+use parent qw(Plucene::SearchEngine::Index::HTML);
+
+use IPC::Run3;
+use File::Temp;
+
+# Associate this backend with the 'text/ppt' MIME type and the '.ppt' file
+# extension (register_handler is inherited; presumably it records the
+# mapping used to pick a backend per file -- confirm against the Base class).
+# NOTE(review): the IANA-registered type for PowerPoint files is
+# 'application/vnd.ms-powerpoint'; confirm whether it should be registered.
+__PACKAGE__->register_handler('text/ppt', '.ppt');
+
+=head1 DESCRIPTION
+
+This backend analyzes a PPT file. The module uses a tool called
+C<ppthtml>, provided by the xlhtml package available from
+L<http://chicago.sourceforge.net/xlhtml/>, or from your operating
+system's package manager.
+
+=over 3
+
+=item text
+
+The text part of the PPT
+
+=item link
+
+A list of links in the HTML
+
+=back
+
+Additionally, any C<META> tags are turned into Plucene fields.
+
+=head1 METHODS
+
+=head2 gather_data_from_file
+
+Overrides the method from L<Plucene::SearchEngine::Index::HTML>
+to provide PPT parsing.
+
+=cut
+
+# Convert a PowerPoint file to HTML with ppthtml and pass the HTML on to the
+# parent (HTML) backend for indexing.
+#
+# Arguments: $filename - path of the file to index.
+# Returns:   $self, or nothing when $filename does not end in ".ppt".
+sub gather_data_from_file {
+    my ($self, $filename) = @_;
+    return unless $filename =~ m/\.ppt$/;
+
+    # The File::Temp object deletes its file when it goes out of scope.
+    # NOTE(review): the ".html" suffix is a precaution in case the parent
+    # HTML backend filters on the file extension -- confirm; it is harmless
+    # otherwise.
+    my $tmp_html = File::Temp->new( SUFFIX => '.html' );
+    run3 ['ppthtml', $filename],
+        \undef,       # stdin is /dev/null
+        $tmp_html,    # ppthtml's stdout goes to the temporary file
+        undef;        # inherit the parent's stderr
+
+    # Must be SUPER::, not a plain method call: a plain call re-enters this
+    # override, whose ".ppt" guard rejects the temporary file name, so the
+    # converted HTML would never reach the indexer.
+    $self->SUPER::gather_data_from_file( $tmp_html->filename );
+    return $self;
+}
+
+1;
94 lib/Plucene/SearchEngine/Index/XLS.pm
@@ -0,0 +1,94 @@
+package Plucene::SearchEngine::Index::XLS;
+use strict;
+use warnings;
+# VERSION
+# ABSTRACT: a Plucene backend for indexing Microsoft Excel spreadsheets
+
+use parent qw(Plucene::SearchEngine::Index::Base);
+
+# Associate this backend with the 'application/xls' MIME type and the '.xls'
+# file extension (register_handler is inherited; presumably it records the
+# mapping used to pick a backend per file -- confirm against the Base class).
+# NOTE(review): the IANA-registered type for Excel files is
+# 'application/vnd.ms-excel'; confirm whether it should be registered.
+__PACKAGE__->register_handler('application/xls', '.xls');
+use File::Temp qw/tmpnam/;
+use Spreadsheet::ParseExcel;
+
+
+=head1 NAME
+
+Plucene::SearchEngine::Index::XLS - a Plucene backend for indexing Microsoft Excel spreadsheets
+
+=head1 DESCRIPTION
+
+This backend converts the .xls file into a temporary text file, which is
+then indexed line by line in the same way as the Text.pm module does.
+
+
+=head1 METHODS
+
+=head2 gather_data_from_file
+
+Overrides the method from L<Plucene::SearchEngine::Index::Base>
+to provide XLS parsing.
+
+=cut
+
+# Dump an Excel workbook to a temporary text file and add every line of
+# that file to the index as an UnStored 'text' field.
+#
+# Arguments: $file - path of the file to index.
+# Returns:   $self, or nothing when $file does not end in ".xls".
+# Dies when the temporary text file cannot be opened for reading (and
+# propagates any failure raised by _exceltotext).
+sub gather_data_from_file {
+    my ($self, $file) = @_;
+    return unless $file =~ m/\.xls$/;
+
+    # File::Temp->new creates the file atomically (tmpnam() only returns a
+    # name, leaving a window for races) and removes it when $tmp_txt goes
+    # out of scope, including when one of the calls below dies.
+    my $tmp_txt = File::Temp->new();
+    my $txtfile = $tmp_txt->filename;
+    _exceltotext($file, $txtfile);
+
+    # Honour an explicit encoding when one has been recorded on the object.
+    my $mode = '<';
+    if (exists $self->{encoding}) {
+        my $encoding = $self->{encoding}{data}[0];
+        $mode = "<:encoding($encoding)";
+    }
+    open my $in, $mode, $txtfile
+        or die "Couldn't open $txtfile: $!";
+
+    while (my $line = <$in>) {
+        $self->add_data('text' => 'UnStored' => $line);
+    }
+    close $in;
+    return $self;
+}
+
+# Write the contents of an Excel workbook to a plain-text file.
+#
+# The output starts with "FILE :", "COUNT :" and (when known) "AUTHOR:"
+# header lines, followed by each worksheet's name and then the value of
+# every populated cell, one per line.
+#
+# Arguments: $excel  - path of the .xls file to read.
+#            $output - path of the text file to (over)write.
+# Dies when the workbook cannot be parsed or the output file cannot be
+# opened or closed.
+sub _exceltotext {
+    my ($excel, $output) = @_;
+
+    # Parse first, so a bad workbook produces a clear error message instead
+    # of an "undefined value as a HASH reference" failure further down.
+    my $parser = Spreadsheet::ParseExcel->new();
+    my $book   = $parser->Parse($excel)
+        or die "Not able to parse Excel file $excel";
+
+    open my $txt_out, '>', $output or die "Not able to open file : $!";
+
+    print {$txt_out} "FILE :",  $book->{File},       "\n";
+    print {$txt_out} "COUNT :", $book->{SheetCount}, "\n";
+    print {$txt_out} "AUTHOR:", $book->{Author},     "\n"
+        if defined $book->{Author};
+
+    for my $sheet_idx (0 .. ($book->{SheetCount} || 0) - 1) {
+        my $sheet = $book->{Worksheet}[$sheet_idx];
+
+        # Sheet names and cell values must go to $txt_out; printing them to
+        # an unopened bareword handle silently dropped all of the content.
+        print {$txt_out} $sheet->{Name}, "\n";
+
+        # Nothing to walk when the sheet has no recorded row/column extent.
+        next unless defined $sheet->{MaxRow} && defined $sheet->{MaxCol};
+
+        for my $row ($sheet->{MinRow} .. $sheet->{MaxRow}) {
+            for my $col ($sheet->{MinCol} .. $sheet->{MaxCol}) {
+                my $cell = $sheet->{Cells}[$row][$col];
+                print {$txt_out} $cell->Value, "\n" if $cell;
+            }
+        }
+    }
+
+    # Buffered write errors only surface at close time.
+    close $txt_out or die "Not able to close file : $!";
+}
+
+1;

0 comments on commit 3b03dee

Please sign in to comment.