Skip to content

Commit

Permalink
Fixed langid-based query stop words.
Browse files: browse the repository at this point in the history
  • Loading branch information
gigablast committed Mar 8, 2015
1 parent 2413a9b commit 5e65199
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
18 changes: 12 additions & 6 deletions Address.cpp
Expand Up @@ -273,7 +273,8 @@ static char *s_days[] = {
"wednesdays",
"thursdays",
"fridays",
"saturdays"
"saturdays",
NULL
};


Expand Down Expand Up @@ -336,7 +337,8 @@ static HashTableX s_doyTable;
static bool s_doyInit = false;
int32_t getDayOfWeek ( int64_t h ) {
if ( ! s_doyInit ) {
s_doyInit = initWordTable(&s_doyTable, s_days ,sizeof(s_days),
s_doyInit = initWordTable(&s_doyTable, s_days ,
//sizeof(s_days),
"doytbl");
if ( ! s_doyInit ) return -1;
}
Expand Down Expand Up @@ -3800,7 +3802,8 @@ static char *s_lcWords[] = {
"not",
"from",
"ll", // they'll this'll that'll you'll
"ve" // would've should've
"ve", // would've should've
NULL
};


Expand Down Expand Up @@ -6018,12 +6021,14 @@ bool Addresses::set2 ( ) {
"juror",
"chauffeur",
"butler",
"cheesemaker"
"cheesemaker",
NULL
};
static bool s_initJobs = false;
if ( ! s_initJobs ) {
// load it up
if ( ! initWordTable ( &s_jobTable,s_jobs,sizeof(s_jobs),
if ( ! initWordTable ( &s_jobTable,s_jobs,
//sizeof(s_jobs),
"jobstbl") )
return false;
// do not re-do
Expand Down Expand Up @@ -13745,7 +13750,8 @@ bool initPlaceDescTable ( ) {
// . has words that can be lower case in a place name
//s_lc.set ( 8 , 0 , 0 , s_lcbuf , 2000 , false , 0 ,"plnametbl");
// stock the table (StopWords.cpp function)
if ( ! initWordTable ( &s_lc , s_lcWords , sizeof(s_lcWords),
if ( ! initWordTable ( &s_lc , s_lcWords ,
//sizeof(s_lcWords),
"plnametbl")){
char *xx=NULL;*xx=0; }

Expand Down
30 changes: 20 additions & 10 deletions StopWords.cpp
Expand Up @@ -127,7 +127,7 @@ static char *s_stopWords[] = {
"under", // fix title for http://www.harwoodmuseum.org/press_detail.php?ID=44
"would",
"yours",
"theirs"
"theirs",
//"aren", // aren't
//"hadn", // hadn't
//"didn", // didn't
Expand All @@ -136,20 +136,25 @@ static char *s_stopWords[] = {
//"ve", // would've should've
//"should",
//"shouldn", // shouldn't
NULL
};
static HashTableX s_stopWordTable;
static bool s_stopWordsInitialized = false;

bool initWordTable( HashTableX *table, char *words[], int32_t size ,
bool initWordTable( HashTableX *table, char *words[],
//int32_t size ,
char *label ) {
// count them
int32_t count; for ( count = 0 ; words[count] ; count++ );
// set up the hash table
if ( ! table->set ( 8,4,size * 2,NULL,0,false,0,label ) )
if ( ! table->set ( 8,4,count * 2,NULL,0,false,0,label ) )
return log(LOG_INIT,"build: Could not init stop words "
"table." );
// now add in all the stop words
int32_t n = (int32_t)size/ sizeof(char *);
int32_t n = count;//(int32_t)size/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
char *sw = words[i];
if ( ! sw ) break;
int32_t swlen = gbstrlen ( sw );
int64_t swh = hash64Lower_utf8 ( sw , swlen );
//log("ii: #%"INT32" %s",i,sw);
Expand All @@ -162,7 +167,8 @@ bool isStopWord ( char *s , int32_t len , int64_t h ) {
if ( ! s_stopWordsInitialized ) {
s_stopWordsInitialized =
initWordTable(&s_stopWordTable, s_stopWords,
sizeof(s_stopWords),"stopwords");
//sizeof(s_stopWords),
"stopwords");
if (!s_stopWordsInitialized) return false;
}

Expand All @@ -178,7 +184,8 @@ bool isStopWord2 ( int64_t *h ) {
if ( ! s_stopWordsInitialized ) {
s_stopWordsInitialized =
initWordTable(&s_stopWordTable, s_stopWords,
sizeof(s_stopWords),"stopwrds2");
//sizeof(s_stopWords)
"stopwrds2");
if (!s_stopWordsInitialized) return false;
}

Expand Down Expand Up @@ -1696,7 +1703,7 @@ static char *s_queryStopWordsUnknown[] = {
//"er", // you,
//"sådan", // such
//"vår", // our
"blivit" // from
"blivit", // from
//"dess", // its
//"inom", // within
//"mellan", // between
Expand Down Expand Up @@ -1724,6 +1731,7 @@ static char *s_queryStopWordsUnknown[] = {

// additional stop words
//"san" // like san francisco
NULL
};


Expand Down Expand Up @@ -1767,7 +1775,8 @@ static char *s_queryStopWordsEnglish[] = {
"to",
"from",
"in",
"on"
"on",
NULL
};


Expand Down Expand Up @@ -2001,9 +2010,10 @@ static char *s_queryStopWordsGerman[] = {
//"würden", // would
"zu", // to
"zum", // zu
"zur" // zu
"zur", // zu
//"zwar", // indeed
//"zwischen", // between
NULL
};


Expand Down Expand Up @@ -2048,7 +2058,7 @@ bool isQueryStopWord ( char *s , int32_t len , int64_t h , int32_t langId ) {
if ( ! words ) continue;
if ( ! initWordTable ( ht,//&s_queryStopWordTable,
words,
sizeof(words),
//sizeof(words),
"qrystops") )
return false;
}
Expand Down
3 changes: 2 additions & 1 deletion StopWords.h
Expand Up @@ -30,7 +30,8 @@ int32_t isCommonWord ( int64_t h ) ;

int32_t isCommonQueryWordInEnglish ( int64_t h ) ;

bool initWordTable(class HashTableX *table, char* words[], int32_t size ,
bool initWordTable(class HashTableX *table, char* words[],
//int32_t size ,
char *label);

bool isVerb ( int64_t *hp ) ;
Expand Down

0 comments on commit 5e65199

Please sign in to comment.