Permalink
Cannot retrieve contributors at this time
767 lines (707 sloc)
19.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ###################################################################### | |
| # | |
| # EPrints::Index::Tokenizer | |
| # | |
| ###################################################################### | |
| # | |
| # | |
| ###################################################################### | |
| =pod | |
| =head1 NAME | |
| B<EPrints::Index::Tokenizer> - text indexing utility methods | |
| =head1 DESCRIPTION | |
| This module provides utility methods for processing free text into indexable things. | |
| =head1 METHODS | |
| =over 4 | |
| =cut | |
| package EPrints::Index::Tokenizer; | |
| use Unicode::Normalize qw( normalize ); | |
## Normalizes $utext for tokenizing.
##
## $utext may be a Unicode::String object or an ordinary Perl scalar.
## Returns a decoded Perl character string (not UTF-8 octets) in Unicode
## Normalization Form KC, with malformed UTF-8 sequences replaced.
sub _cast_string
{
    my( $utext ) = @_;

    # Nothing else in this file loads Encode, so load it on demand instead
    # of relying on another module having pulled it in first.
    require Encode;

    # get the text as UTF-8 octets
    if( ref($utext) eq "Unicode::String" )
    {
        $utext = $utext->utf8;
    }
    else
    {
        utf8::encode($utext);
    }

    # Decode the octets back into characters. With FB_DEFAULT, Encode
    # substitutes U+FFFD for malformed sequences instead of dying.
    # FB_DEFAULT is called with parentheses so that it is resolved as a
    # function at run time, after the require above.
    $utext = Encode::decode( "UTF-8", $utext, Encode::FB_DEFAULT() );

    # compatibility-decompose, then canonical-compose (NFKC)
    return normalize( 'KC', $utext );
}
######################################################################
=pod

=item @words = EPrints::Index::Tokenizer::split_words( $session, $utext )

Normalizes $utext and breaks it into a list of words. Every run of
characters that are neither word characters (C<\w>) nor apostrophes is
treated as a single separator. $session is accepted but not used.

=cut
######################################################################

sub split_words
{
    my( $session, $utext ) = @_;

    my $text = _cast_string( $utext );

    # anything that is not a word character or an apostrophe separates words
    my @words = split( /[^\w']+/, $text );

    return @words;
}
=item @terms = EPrints::Index::Tokenizer::split_search_value( $session, $value )

Passes $value through C<apply_mapping> and splits the result into a list
of search terms. Terms are separated by runs of characters that are not
word characters (C<\w>), apostrophes or asterisks, so an asterisk typed
into a search value stays part of its term.

=cut

sub split_search_value
{
    my( $session, $value ) = @_;

    # transliterate to ASCII equivalents first
    my $mapped = apply_mapping( $session, $value );

    my @terms = split( /[^\w'\*]+/, $mapped );

    return @terms;
}
######################################################################
=pod

=item $utext2 = EPrints::Index::Tokenizer::apply_mapping( $session, $utext )

Normalizes $utext and returns a copy in which every character that has an
entry in C<$EPrints::Index::FREETEXT_CHAR_MAPPING> is replaced by that
entry. Characters without an entry are copied through unchanged.

Words are passed through this before indexing so that accents such as
umlauts make no difference when searching.

=cut
######################################################################

sub apply_mapping
{
    my( $session, $utext ) = @_;

    my $table = $EPrints::Index::FREETEXT_CHAR_MAPPING;
    my $mapped = "";

    # walk the normalized text one character at a time
    foreach my $char ( split( //, _cast_string( $utext ) ) )
    {
        if( exists( $table->{$char} ) )
        {
            $mapped .= $table->{$char};
        }
        else
        {
            $mapped .= $char;
        }
    }

    return $mapped;
}
##############################################################################
# Mappings and character tables
##############################################################################

# Transliteration table for the word index. Each key is a single Unicode
# character and each value is the ASCII text (code points below 127) that
# apply_mapping() substitutes for it. For example the "ê" in "Fête" becomes
# "e", so "fete" and "fête" lead to the same indexed word.
#
# A replacement does not have to be a single character: it may be several
# characters ('ae', '1/2') or the empty string, which drops the character.
#
# Every key appears exactly once. A repeated key in a hash constructor
# silently overrides the earlier entry, which makes mistakes hard to spot.
#
# NOTE(review): U+00D8 (capital O with stroke) maps to 'O' while its
# lower-case form U+00F8 maps to 'oe'. Left as found -- confirm which
# transliteration is intended before changing either.

$EPrints::Index::FREETEXT_CHAR_MAPPING = {

    # --- Basic Latin / Latin-1 Supplement ---
    chr(0x0027) => "'",     # '
    chr(0x00a1) => '!',     # ¡
    chr(0x00a2) => 'c',     # ¢
    chr(0x00a3) => 'L',     # £
    chr(0x00a4) => 'o',     # ¤
    chr(0x00a5) => 'Y',     # ¥
    chr(0x00a6) => '|',     # ¦
    chr(0x00a7) => 'S',     # §
    chr(0x00a8) => '"',     # ¨
    chr(0x00a9) => '(c)',   # ©
    chr(0x00aa) => 'a',     # ª
    chr(0x00ab) => '<<',    # «
    chr(0x00ac) => '-',     # ¬
    chr(0x00ad) => '-',     # soft hyphen
    chr(0x00ae) => '(R)',   # ®
    chr(0x00af) => '-',     # ¯
    chr(0x00b0) => 'o',     # °
    chr(0x00b1) => '+-',    # ±
    chr(0x00b2) => '2',     # ²
    chr(0x00b3) => '3',     # ³
    chr(0x00b5) => 'u',     # µ
    chr(0x00b6) => 'q',     # ¶
    chr(0x00b7) => '.',     # ·
    chr(0x00b8) => ',',     # ¸
    chr(0x00b9) => '1',     # ¹
    chr(0x00ba) => 'o',     # º
    chr(0x00bb) => '>>',    # »
    chr(0x00bc) => '1/4',   # ¼
    chr(0x00bd) => '1/2',   # ½
    chr(0x00be) => '3/4',   # ¾
    chr(0x00bf) => '?',     # ¿
    chr(0x00c0) => 'A',     # À
    chr(0x00c1) => 'A',     # Á
    chr(0x00c2) => 'A',     # Â
    chr(0x00c3) => 'A',     # Ã
    chr(0x00c4) => 'AE',    # Ä
    chr(0x00c5) => 'A',     # Å
    chr(0x00c6) => 'AE',    # Æ
    chr(0x00c7) => 'C',     # Ç
    chr(0x00c8) => 'E',     # È
    chr(0x00c9) => 'E',     # É
    chr(0x00ca) => 'E',     # Ê
    chr(0x00cb) => 'E',     # Ë
    chr(0x00cc) => 'I',     # Ì
    chr(0x00cd) => 'I',     # Í
    chr(0x00ce) => 'I',     # Î
    chr(0x00cf) => 'I',     # Ï
    chr(0x00d0) => 'D',     # Ð
    chr(0x00d1) => 'N',     # Ñ
    chr(0x00d2) => 'O',     # Ò
    chr(0x00d3) => 'O',     # Ó
    chr(0x00d4) => 'O',     # Ô
    chr(0x00d5) => 'O',     # Õ
    chr(0x00d6) => 'OE',    # Ö
    chr(0x00d7) => 'x',     # ×
    chr(0x00d8) => 'O',     # Ø
    chr(0x00d9) => 'U',     # Ù
    chr(0x00da) => 'U',     # Ú
    chr(0x00db) => 'U',     # Û
    chr(0x00dc) => 'UE',    # Ü
    chr(0x00dd) => 'Y',     # Ý
    chr(0x00de) => 'TH',    # Þ
    chr(0x00df) => 'ss',    # ß
    chr(0x00e0) => 'a',     # à
    chr(0x00e1) => 'a',     # á
    chr(0x00e2) => 'a',     # â
    chr(0x00e3) => 'a',     # ã
    chr(0x00e4) => 'ae',    # ä
    chr(0x00e5) => 'a',     # å
    chr(0x00e6) => 'ae',    # æ
    chr(0x00e7) => 'c',     # ç
    chr(0x00e8) => 'e',     # è
    chr(0x00e9) => 'e',     # é
    chr(0x00ea) => 'e',     # ê
    chr(0x00eb) => 'e',     # ë
    chr(0x00ec) => 'i',     # ì
    chr(0x00ed) => 'i',     # í
    chr(0x00ee) => 'i',     # î
    chr(0x00ef) => 'i',     # ï
    chr(0x00f0) => 'd',     # ð
    chr(0x00f1) => 'n',     # ñ
    chr(0x00f2) => 'o',     # ò
    chr(0x00f3) => 'o',     # ó
    chr(0x00f4) => 'o',     # ô
    chr(0x00f5) => 'o',     # õ
    chr(0x00f6) => 'oe',    # ö
    chr(0x00f7) => '/',     # ÷
    chr(0x00f8) => 'oe',    # ø
    chr(0x00f9) => 'u',     # ù
    chr(0x00fa) => 'u',     # ú
    chr(0x00fb) => 'u',     # û
    chr(0x00fc) => 'ue',    # ü
    chr(0x00fd) => 'y',     # ý
    chr(0x00fe) => 'th',    # þ
    chr(0x00ff) => 'y',     # ÿ

    # --- Latin Extended-A (upper-case / lower-case pairs) ---
    chr(0x0100) => 'A',
    chr(0x0101) => 'a',
    chr(0x0102) => 'A',     # capital A with breve (was wrongly 'a')
    chr(0x0103) => 'a',
    chr(0x0104) => 'A',
    chr(0x0105) => 'a',
    chr(0x0106) => 'C',
    chr(0x0107) => 'c',
    chr(0x0108) => 'C',
    chr(0x0109) => 'c',
    chr(0x010A) => 'C',
    chr(0x010B) => 'c',
    chr(0x010C) => 'C',
    chr(0x010D) => 'c',
    chr(0x010E) => 'D',
    chr(0x010F) => 'd',
    chr(0x0110) => 'D',
    chr(0x0111) => 'd',
    chr(0x0112) => 'E',
    chr(0x0113) => 'e',
    chr(0x0114) => 'E',
    chr(0x0115) => 'e',
    chr(0x0116) => 'E',
    chr(0x0117) => 'e',
    chr(0x0118) => 'E',
    chr(0x0119) => 'e',
    chr(0x011A) => 'E',
    chr(0x011B) => 'e',
    chr(0x011C) => 'G',
    chr(0x011D) => 'g',
    chr(0x011E) => 'G',
    chr(0x011F) => 'g',
    chr(0x0120) => 'G',
    chr(0x0121) => 'g',
    chr(0x0122) => 'G',
    chr(0x0123) => 'g',
    chr(0x0124) => 'H',
    chr(0x0125) => 'h',
    chr(0x0126) => 'H',
    chr(0x0127) => 'h',
    chr(0x0128) => 'I',
    chr(0x0129) => 'i',
    chr(0x012A) => 'I',
    chr(0x012B) => 'i',
    chr(0x012C) => 'I',
    chr(0x012D) => 'i',
    chr(0x012E) => 'I',
    chr(0x012F) => 'i',
    chr(0x0130) => 'I',
    chr(0x0131) => 'i',
    chr(0x0132) => 'IJ',    # IJ ligature
    chr(0x0133) => 'ij',    # ij ligature
    chr(0x0134) => 'J',
    chr(0x0135) => 'j',
    chr(0x0136) => 'K',
    chr(0x0137) => 'k',
    chr(0x0138) => 'k',
    chr(0x0139) => 'L',
    chr(0x013A) => 'l',
    chr(0x013B) => 'L',
    chr(0x013C) => 'l',
    chr(0x013D) => 'L',
    chr(0x013E) => 'l',
    chr(0x013F) => 'L',
    chr(0x0140) => 'l',
    chr(0x0141) => 'L',
    chr(0x0142) => 'l',
    chr(0x0143) => 'N',
    chr(0x0144) => 'n',
    chr(0x0145) => 'N',
    chr(0x0146) => 'n',
    chr(0x0147) => 'N',
    chr(0x0148) => 'n',
    chr(0x0149) => 'n',
    chr(0x014A) => 'N',
    chr(0x014B) => 'n',
    chr(0x014C) => 'O',
    chr(0x014D) => 'o',
    chr(0x014E) => 'O',
    chr(0x014F) => 'o',
    chr(0x0150) => 'OE',    # O with double acute, treated like Ö
    chr(0x0151) => 'oe',    # o with double acute, treated like ö
    chr(0x0152) => 'OE',    # Œ
    chr(0x0153) => 'oe',    # œ
    chr(0x0154) => 'R',
    chr(0x0155) => 'r',
    chr(0x0156) => 'R',
    chr(0x0157) => 'r',
    chr(0x0158) => 'R',
    chr(0x0159) => 'r',
    chr(0x015A) => 'S',
    chr(0x015B) => 's',
    chr(0x015C) => 'S',
    chr(0x015D) => 's',
    chr(0x015E) => 'S',
    chr(0x015F) => 's',
    chr(0x0160) => 'S',
    chr(0x0161) => 's',
    chr(0x0162) => 'T',
    chr(0x0163) => 't',
    chr(0x0164) => 'T',
    chr(0x0165) => 't',
    chr(0x0166) => 'T',
    chr(0x0167) => 't',
    chr(0x0168) => 'U',
    chr(0x0169) => 'u',
    chr(0x016A) => 'U',
    chr(0x016B) => 'u',
    chr(0x016C) => 'U',
    chr(0x016D) => 'u',
    chr(0x016E) => 'U',
    chr(0x016F) => 'u',
    chr(0x0170) => 'UE',    # U with double acute, treated like Ü
    chr(0x0171) => 'ue',    # u with double acute, treated like ü
    chr(0x0172) => 'U',
    chr(0x0173) => 'u',
    chr(0x0174) => 'W',
    chr(0x0175) => 'w',
    chr(0x0176) => 'Y',
    chr(0x0177) => 'y',
    chr(0x0178) => 'Y',
    chr(0x0179) => 'Z',
    chr(0x017A) => 'z',
    chr(0x017B) => 'Z',
    chr(0x017C) => 'z',
    chr(0x017D) => 'Z',
    chr(0x017E) => 'z',

    # --- Combining diacritical marks U+0300 .. U+0362: dropped ---
    ( map { ( chr($_) => '' ) } 0x0300 .. 0x0362 ),

    # --- Greek ---
    chr(0x0391) => 'A',
    chr(0x03B1) => 'a',
    chr(0x0392) => 'B',
    chr(0x03B2) => 'b',
    chr(0x0393) => 'G',
    chr(0x03B3) => 'g',
    chr(0x0394) => 'D',
    chr(0x03B4) => 'd',
    chr(0x0395) => 'E',
    chr(0x03B5) => 'e',
    chr(0x0396) => 'Z',
    chr(0x03B6) => 'z',
    chr(0x0397) => 'E',
    chr(0x03B7) => 'e',
    chr(0x0398) => 'TH',
    chr(0x03B8) => 'th',
    chr(0x0399) => 'I',
    chr(0x03B9) => 'i',
    chr(0x039A) => 'K',
    chr(0x03BA) => 'k',
    chr(0x039B) => 'L',
    chr(0x03BB) => 'l',
    chr(0x039C) => 'M',
    chr(0x03BC) => 'm',
    chr(0x039D) => 'N',
    chr(0x03BD) => 'n',
    chr(0x039E) => 'X',
    chr(0x03BE) => 'x',
    chr(0x039F) => 'O',
    chr(0x03BF) => 'o',
    chr(0x03A0) => 'P',
    chr(0x03C0) => 'p',
    chr(0x03A1) => 'R',
    chr(0x03C1) => 'r',
    chr(0x03A3) => 'S',
    chr(0x03C3) => 's',
    chr(0x03A4) => 'T',
    chr(0x03C4) => 't',
    chr(0x03A5) => 'Y',
    chr(0x03C5) => 'y',
    chr(0x03A6) => 'Ph',
    chr(0x03C6) => 'ph',
    chr(0x03A7) => 'Ch',
    chr(0x03C7) => 'ch',
    chr(0x03A8) => 'Ps',
    chr(0x03C8) => 'ps',
    chr(0x03A9) => 'O',
    chr(0x03C9) => 'o',
    chr(0x03AA) => 'I',
    chr(0x03CA) => 'i',
    chr(0x03AB) => 'Y',
    chr(0x03CB) => 'y',
    chr(0x03AC) => 'a',
    chr(0x03AD) => 'e',
    chr(0x03AE) => 'e',
    chr(0x03AF) => 'i',
    chr(0x03B0) => 'y',
    chr(0x03CC) => 'o',
    chr(0x03CD) => 'y',
    chr(0x03CE) => 'o',
    chr(0x0386) => 'A',
    chr(0x0389) => 'E',
    chr(0x038A) => 'I',
    chr(0x038C) => 'O',
    chr(0x038E) => 'Y',
    chr(0x038F) => 'O',
    chr(0x0390) => 'i',
    chr(0x0387) => ';',

    # --- Combining Latin small letters U+0363 .. U+036F ---
    chr(0x0363) => 'a',
    chr(0x0364) => 'e',
    chr(0x0365) => 'i',
    chr(0x0366) => 'o',
    chr(0x0367) => 'u',
    chr(0x0368) => 'c',
    chr(0x0369) => 'd',
    chr(0x036A) => 'h',
    chr(0x036B) => 'm',
    chr(0x036C) => 'r',
    chr(0x036D) => 't',
    chr(0x036E) => 'v',
    chr(0x036F) => 'x',

    # --- General punctuation: hyphens, dashes, right single quote ---
    chr(0x2010) => '-',
    chr(0x2011) => '-',
    chr(0x2012) => '-',
    chr(0x2013) => '-',
    chr(0x2014) => '-',
    chr(0x2019) => "'",     # ’

    # --- Superscripts ---
    chr(0x2074) => '4',
    chr(0x2075) => '5',
    chr(0x2076) => '6',
    chr(0x2077) => '7',
    chr(0x2078) => '8',
    chr(0x2079) => '9',
    chr(0x207A) => '+',
    chr(0x207B) => '-',
    chr(0x207C) => '=',
    chr(0x207D) => '(',
    chr(0x207E) => ')',

    # --- Subscripts ---
    chr(0x2080) => '0',
    chr(0x2081) => '1',
    chr(0x2082) => '2',
    chr(0x2083) => '3',
    chr(0x2084) => '4',
    chr(0x2085) => '5',
    chr(0x2086) => '6',
    chr(0x2087) => '7',
    chr(0x2088) => '8',
    chr(0x2089) => '9',
    chr(0x208A) => '+',
    chr(0x208B) => '-',
    chr(0x208C) => '=',
    chr(0x208D) => '(',
    chr(0x208E) => ')',
    chr(0x2090) => 'a',
    chr(0x2091) => 'e',
    chr(0x2092) => 'o',
    chr(0x2093) => 'x',
    chr(0x2094) => 'e',

    # --- Vulgar fractions ---
    chr(0x2153) => '1/3',
    chr(0x2154) => '2/3',
    chr(0x2155) => '1/5',
    chr(0x2156) => '2/5',
    chr(0x2157) => '3/5',
    chr(0x2158) => '4/5',
    chr(0x2159) => '1/6',
    chr(0x215A) => '5/6',
    chr(0x215B) => '1/8',
    chr(0x215C) => '3/8',
    chr(0x215D) => '5/8',
    chr(0x215E) => '7/8',
    chr(0x215F) => '1/',

    # --- Roman numerals ---
    chr(0x2160) => 'I',
    chr(0x2161) => 'II',
    chr(0x2162) => 'III',
    chr(0x2163) => 'IV',
    chr(0x2164) => 'V',
    chr(0x2165) => 'VI',
    chr(0x2166) => 'VII',
    chr(0x2167) => 'VIII',
    chr(0x2168) => 'IX',
    chr(0x2169) => 'X',
    chr(0x216A) => 'XI',
    chr(0x216B) => 'XII',
    chr(0x216C) => 'L',
    chr(0x216D) => 'C',
    chr(0x216E) => 'D',
    chr(0x216F) => 'M',
    chr(0x2170) => 'i',
    chr(0x2171) => 'ii',
    chr(0x2172) => 'iii',
    chr(0x2173) => 'iv',
    chr(0x2174) => 'v',
    chr(0x2175) => 'vi',
    chr(0x2176) => 'vii',
    chr(0x2177) => 'viii',
    chr(0x2178) => 'ix',
    chr(0x2179) => 'x',
    chr(0x217A) => 'xi',
    chr(0x217B) => 'xii',
    chr(0x217C) => 'l',
    chr(0x217D) => 'c',
    chr(0x217E) => 'd',
    chr(0x217F) => 'm',

    # --- Miscellaneous ---
    chr(0x2122) => 'TM',
    chr(0x25CC) => '',      # dotted circle left when removing modifier for combining diacritical marks

    # --- Latin ligatures ---
    chr(0xFB00) => 'ff',
    chr(0xFB01) => 'fi',
    chr(0xFB02) => 'fl',
    chr(0xFB03) => 'ffi',
    chr(0xFB04) => 'ffl',
    chr(0xFB05) => 'st',    # long s + t
    chr(0xFB06) => 'st',
};
# Words shorter than this are not normally indexed.
$EPrints::Index::FREETEXT_MIN_WORD_SIZE = 3;

# The word lists below are hashes rather than arrays because they are only
# ever used to ask "is this word in the list?"; with a long list, scanning
# an array for every word could become slow.

# Words that are never indexed, whatever their length.
$EPrints::Index::FREETEXT_STOP_WORDS = {
    map { ( $_ => 1 ) } qw(
        this    are     which   with
        that    can     from    these
        those   the     you     for
        been    have    were    what
        where   is      and     fnord
    )
};

# Words that are always indexed, whatever their length.
$EPrints::Index::FREETEXT_ALWAYS_WORDS = { map { ( $_ => 1 ) } qw( ok ) };

# Characters that separate words: pretty much everything except
# A-Z a-z 0-9 and the single quote '.
# Any further separator characters added here should be encoded in utf8;
# the Unicode::String man page details some useful methods.
$EPrints::Index::FREETEXT_SEPERATOR_CHARS = {
    map { ( $_ => 1 ) } (
        '@', '[', '\\', ']',
        '^', '_', ' ', '`',
        '!', '"', '#', '$',
        '%', '&', '(', ')',
        '*', '+', ',', '-',
        '.', '/', ':', ';',
        '{', '<', '|', '=',
        '}', '>', '~', '?',
        chr(0xb4), # Acute Accent (closing quote)
    )
};

# Character class matching any separator listed above, plus every
# character from \x00 to \x20 (control characters and space).
{
    my $separators = quotemeta( join( "", keys %{$EPrints::Index::FREETEXT_SEPERATOR_CHARS} ) );
    $EPrints::Index::FREETEXT_SEPERATOR_REGEXP = qr/[$separators\x00-\x20]/;
}
# Compatibility with Unicode::String keys: for every character key in the
# two tables, also store the same value under that character's UTF-8
# encoded byte string. For plain ASCII keys the encoded form is the key
# itself, so those entries are simply rewritten with their own value.
for my $table (
    $EPrints::Index::FREETEXT_CHAR_MAPPING,
    $EPrints::Index::FREETEXT_SEPERATOR_CHARS,
)
{
    # take the key list up front; entries are added to the hash below
    my @chars = keys %{$table};

    for my $char ( @chars )
    {
        my $octets = $char;
        utf8::encode( $octets );
        $table->{$octets} = $table->{$char};
    }
}
| 1; | |
=back

=head1 COPYRIGHT
| =for COPYRIGHT BEGIN | |
| Copyright 2000-2011 University of Southampton. | |
| =for COPYRIGHT END | |
| =for LICENSE BEGIN | |
| This file is part of EPrints L<http://www.eprints.org/>. | |
| EPrints is free software: you can redistribute it and/or modify it | |
| under the terms of the GNU Lesser General Public License as published | |
| by the Free Software Foundation, either version 3 of the License, or | |
| (at your option) any later version. | |
| EPrints is distributed in the hope that it will be useful, but WITHOUT | |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| License for more details. | |
| You should have received a copy of the GNU Lesser General Public | |
| License along with EPrints. If not, see L<http://www.gnu.org/licenses/>. | |
| =for LICENSE END | |