Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base: 181d8a12b0
...
compare: 6e8b145896
Checking mergeability… Don't worry, you can still create the pull request.
  • 6 commits
  • 11 files changed
  • 0 commit comments
  • 1 contributor
View
86 include/DisambigComp.h
@@ -1,86 +0,0 @@
-/*
- * DisambigComp.h
- *
- * Created on: Dec 9, 2010
- * Author: ysun
- */
-
-#ifndef DISAMBIGCOMP_H_
-#define DISAMBIGCOMP_H_
-
-#include <iostream>
-#include <string>
-#include <map>
-#include <vector>
-using std::string;
-using std::map;
-using std::vector;
-
-const unsigned int Jaro_Wrinkler_Max = 5;
-
-template <typename Tp>
-inline const Tp& max_val(const Tp& arg1, const Tp &arg2) {
- return ( arg1 < arg2 )? arg2 : arg1;
-}
-
-template <typename Tp>
-inline const Tp& min_val(const Tp& arg1, const Tp &arg2) {
- return ( arg1 < arg2 )? arg1 : arg2;
-}
-
-template <typename Tp, typename Functor>
-vector <Tp> Longest_Common_Subsequence(const vector <Tp> & s1, const vector <Tp> &s2, const Functor & func);
-
-
-char * extract_initials(char * dest, const char * source) ;
-int nospacecmp(const char* str1, const char* str2);
-int jwcmp(const string & str1, const string& str2);
-int midnamecmp(const string & str1, const string & str2 );
-int countrycmp(const string & country1, const string & country2 );
-int streetcmp(const string& inputstreet1, const string& inputstreet2);
-int latloncmp(const string & inputlat1, const string & inputlon1, const string & inputlat2, const string & inputlon2 );
-int classcmp(const string &class1, const string& class2 );
-int coauthorcmp(const string &coauthor1, const string& coauthor2 );
-int asgcmp(const string & asg1, const string & asg2, const map<string, std::pair<string, unsigned int> > * const asg_table_pointer);
-int asgcmp ( const string & s1, const string &s2) ;
-int asgcmp_to_test(const vector <string> & asg1, const vector <string> & asg2,
- const map<string, std::pair<string, unsigned int> > * const asg_table_pointer);
-int name_compare( const string & s1, const string & s2, const unsigned int prev, const unsigned int cur);
-
-
-class cSentence_JWComparator {
-private:
- const double threshold;
-public:
- bool operator() (const string * s1, const string * s2) const;
- explicit cSentence_JWComparator(const double inputthreshold): threshold(inputthreshold){};
-};
-
-
-template < typename Iter1, typename Iter2 >
-unsigned int num_common_elements ( const Iter1 & p1begin, const Iter1 & p1e ,
- const Iter2 & p2begin, const Iter2 & p2e, const unsigned int max) {// containers must be sorted before use.
- // it has to be a sorted version container, like set, or sorted vector or list
- unsigned int cnt = 0;
- Iter1 p1b = p1begin;
- Iter2 p2b = p2begin;
- while ( p1b != p1e && p2b != p2e ) {
- if ( *p1b < *p2b ) {
- ++p1b;
- }
- else if ( *p2b < *p1b ) {
- ++p2b;
- }
- else {
- ++cnt;
- ++p1b;
- ++p2b;
- }
-
- if ( cnt == max && max != 0 )
- break;
- }
- return cnt;
-}
-
-#endif /* DISAMBIGCOMP_H_ */
View
2  include/attribute.h
@@ -33,7 +33,7 @@ void reconfigure_interactives ( const Record_Reconfigurator * pc, const Record *
#include "exceptions.h"
-#include "DisambigComp.h"
+#include "comparators.h"
/*
View
105 include/comparators.h
@@ -0,0 +1,105 @@
+#ifndef PATENT_COMPARATORS_H
+#define PATENT_COMPARATORS_H
+
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+using std::string;
+using std::map;
+using std::vector;
+/* NOTE(review): presumably the highest score jwcmp() can return (it counts
+ * five thresholds) -- confirm. "Wrinkler" looks like a misspelling of
+ * "Winkler"; check every user of the name before renaming it. */
+const unsigned int Jaro_Wrinkler_Max = 5;
+/**
+ * Returns a reference to the larger of the two arguments, compared with
+ * operator<. When neither argument is less than the other, arg1 is returned.
+ * The result refers to one of the arguments, so it must not outlive them.
+ */
+template <typename Tp>
+inline const Tp& max_val(const Tp& arg1, const Tp &arg2) {
+ return ( arg1 < arg2 )? arg2 : arg1;
+}
+/**
+ * Returns a reference to the smaller of the two arguments, compared with
+ * operator<. When neither argument is less than the other, arg2 is returned.
+ * The result refers to one of the arguments, so it must not outlive them.
+ */
+template <typename Tp>
+inline const Tp& min_val(const Tp& arg1, const Tp &arg2) {
+ return ( arg1 < arg2 )? arg1 : arg2;
+}
+/* Declaration only.
+ * NOTE(review): the parts of comparators.cpp visible in this change define
+ * Longest_Common_Subsequence_Incontinuous and
+ * Longest_Common_Subsequence_Continuous -- confirm that a definition with
+ * this exact name exists somewhere. */
+template <typename Tp, typename Functor>
+vector <Tp> Longest_Common_Subsequence(const vector <Tp> & s1, const vector <Tp> &s2, const Functor & func);
+
+/* Copies the first character of every space-separated word of source into
+ * dest, NUL-terminates dest and returns it. Returns NULL when either pointer
+ * is NULL. No size is passed, so the caller must supply a dest buffer that
+ * is large enough. */
+char * extract_initials(char * dest,
+ const char * source);
+
+int nospacecmp(const char* str1,
+ const char* str2);
+/* Returns 0 when either string is empty. Otherwise returns how many of the
+ * thresholds 0.7, 0.8, 0.9, 0.95 and 0.99 the strcmp95_modified() score of
+ * the two strings exceeds, i.e. a value from 0 to 5. */
+int jwcmp(const string & str1,
+ const string & str2);
+/* Returns 2 when both strings are empty, 1 when exactly one is empty, 0 when
+ * the strings differ at some position before the shorter one ends, and 3
+ * otherwise -- so a string that is a prefix of the other also scores 3. */
+int midnamecmp(const string & str1,
+ const string & str2 );
+/* Returns 0 when the strings differ, 1 when they are equal, and 2 when they
+ * are equal and the value is "US". */
+int countrycmp(const string & country1,
+ const string & country2 );
+/* Returns 1 when both strings are non-empty and equal, otherwise 0. */
+int streetcmp(const string & inputstreet1,
+ const string & inputstreet2);
+/* Scores how close two latitude/longitude pairs are; the strings are parsed
+ * with atof(). Returns 1 when either point lies within 0.0001 of (0, 0),
+ * which is treated as missing data. Otherwise returns 5, 4, 3 or 2 when the
+ * computed distance is below 1, 10, 25 or 50 respectively, and 1 beyond that.
+ * NOTE(review): the unit is presumably miles (the implementation uses an
+ * earth radius of 3963 miles) -- confirm. */
+int latloncmp(const string & inputlat1,
+ const string & inputlon1,
+ const string & inputlat2,
+ const string & inputlon2 );
+
+int classcmp(const string & class1,
+ const string & class2);
+/* Returns 1 when the two strings are equal, otherwise 0. */
+int coauthorcmp(const string & coauthor1,
+ const string & coauthor2);
+
+int asgcmp(const string & asg1,
+ const string & asg2,
+ const map<string, std::pair<string, unsigned int> > * const asg_table_pointer);
+
+int asgcmp (const string & s1,
+ const string &s2);
+
+int asgcmp_to_test(const vector <string> & asg1, const vector <string> & asg2,
+ const map<string, std::pair<string, unsigned int> > * const asg_table_pointer);
+
+int name_compare( const string & s1, const string & s2, const unsigned int prev, const unsigned int cur);
+
+/**
+ * Binary predicate over pointers to strings. operator() is defined in
+ * comparators.cpp, where it returns true when the strcmp95_modified() score
+ * of the two pointed-to strings is strictly greater than threshold.
+ * NOTE(review): that out-of-line definition is marked inline in
+ * comparators.cpp, so calls from other translation units may fail to link --
+ * confirm.
+ */
+class cSentence_JWComparator {
+private:
+ const double threshold; // score that must be exceeded for a match
+public:
+ bool operator() (const string * s1, const string * s2) const;
+ explicit cSentence_JWComparator(const double inputthreshold): threshold(inputthreshold){};
+};
+
+/**
+ * Counts the elements common to the ranges [p1begin, p1e) and
+ * [p2begin, p2e) in one merge-style pass.
+ *
+ * Both ranges must already be sorted in ascending operator< order (for
+ * example a std::set, or a sorted vector or list); on unsorted input the
+ * pass can miss matches.
+ * max caps the result: scanning stops as soon as the count reaches max.
+ * Passing max == 0 disables the cap.
+ * Returns the number of matching pairs found.
+ */
+template < typename Iter1, typename Iter2 >
+unsigned int num_common_elements ( const Iter1 & p1begin, const Iter1 & p1e ,
+ const Iter2 & p2begin, const Iter2 & p2e, const unsigned int max) {
+ // The begin iterators arrive by const reference, so advance local copies.
+ unsigned int cnt = 0;
+ Iter1 p1b = p1begin;
+ Iter2 p2b = p2begin;
+ while ( p1b != p1e && p2b != p2e ) {
+ if ( *p1b < *p2b ) {
+ ++p1b;
+ }
+ else if ( *p2b < *p1b ) {
+ ++p2b;
+ }
+ else { // neither element is less than the other: count it as common
+ ++cnt;
+ ++p1b;
+ ++p2b;
+ }
+ // Stop early once the cap is reached; max == 0 means "no cap".
+ if ( cnt == max && max != 0 )
+ break;
+ }
+ return cnt;
+}
+
+#endif /* PATENT_COMPARATORS_H */
View
2  src/Makefile.am
@@ -18,7 +18,7 @@ CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_sles10_4.1/static_pic
INCLUDES = -I/usr/local/include -I$(CPLEXINCLUDE) -I$(CONCERTINCLUDE) -I../include
noinst_LIBRARIES = libdisambiguation.a
-libdisambiguation_a_SOURCES = Disambigmain.cpp DisambigCluster.cpp DisambigComp.cpp attribute.cpp \
+libdisambiguation_a_SOURCES = Disambigmain.cpp DisambigCluster.cpp comparators.cpp attribute.cpp \
DisambigEngine.cpp DisambigFileOper.cpp DisambigNewCluster.cpp \
DisambigPostProcess.cpp DisambigRatios.cpp DisambigRatioSmoothing.cpp \
DisambigTraining.cpp DisambigUtilities.cpp Threading.cpp strcmp95.c record.cpp \
View
502 src/DisambigComp.cpp → src/comparators.cpp
@@ -1,11 +1,4 @@
-/**
- * DisambigComp.cpp
- *
- * Created on: Dec 9, 2010
- * Author: ysun
- */
-#include "DisambigComp.h"
#include <cmath>
#include <cstring>
#include <cstdlib>
@@ -15,8 +8,10 @@
#include <functional>
#include <stdexcept>
+#include "comparators.h"
+
extern "C" {
- #include "strcmp95.h"
+ #include "strcmp95.h"
}
using std::list;
@@ -26,59 +21,59 @@ using std::list;
template <typename Tp, typename Functor>
vector <Tp> Longest_Common_Subsequence_Incontinuous(const vector <Tp> & s1, const vector <Tp> &s2, const Functor & func) {
- static const vector < Tp > emptyresult;
+ static const vector < Tp > emptyresult;
if(s1.empty()||s2.empty())
return emptyresult;
const int m=s1.size()+1;
const int n=s2.size()+1;
- vector <int> row(n, 0);
- vector < vector <int> > lcs(m, row);
+ vector <int> row(n, 0);
+ vector < vector <int> > lcs(m, row);
//int lcs[100][100];
int i,j;
for(i=0;i<m;i++)
for(j=0;j<n;j++)
lcs[i][j]=0;
-
-
+
+
for(i=1;i<m;i++) {
for(j=1;j<n;j++)
{
//if(s1[i-1]==s2[j-1])
- if ( func( s1[i-1], s2[j-1] ) )
+ if ( func( s1[i-1], s2[j-1] ) )
lcs[i][j]=lcs[i-1][j-1]+1;
else
lcs[i][j]=lcs[i-1][j]>=lcs[i][j-1]?lcs[i-1][j]:lcs[i][j-1];//get the upper or lefter max value
}
}
- i=m-2;
- j=n-2;
- list < Tp > ss;
- while(i!=-1 && j!=-1)
- {
- //if(s1[i]==s2[j])
- if ( func( s1[i], s2[j] ) )
- {
- ss.push_front(s1[i]);
- i--;
- j--;
- }
- else
- {
- if(lcs[i+1][j+1]==lcs[i][j])
- {
- i--;
- j--;
- }
- else
- {
- if(lcs[i][j+1]>=lcs[i+1][j])
- i--;
- else
- j--;
- }
- }
- }
-
+ i=m-2;
+ j=n-2;
+ list < Tp > ss;
+ while(i!=-1 && j!=-1)
+ {
+ //if(s1[i]==s2[j])
+ if ( func( s1[i], s2[j] ) )
+ {
+ ss.push_front(s1[i]);
+ i--;
+ j--;
+ }
+ else
+ {
+ if(lcs[i+1][j+1]==lcs[i][j])
+ {
+ i--;
+ j--;
+ }
+ else
+ {
+ if(lcs[i][j+1]>=lcs[i+1][j])
+ i--;
+ else
+ j--;
+ }
+ }
+ }
+
vector < Tp > ans (ss.begin(), ss.end());
return ans;
}
@@ -86,7 +81,7 @@ vector <Tp> Longest_Common_Subsequence_Incontinuous(const vector <Tp> & s1, cons
template <typename Tp, typename Functor>
vector <Tp> Longest_Common_Subsequence_Continuous(const vector <Tp> & s1, const vector <Tp> &s2, const Functor & func) {
- static const vector < Tp > emptyresult;
+ static const vector < Tp > emptyresult;
if (s1.empty() || s2.empty() )
return emptyresult;
@@ -98,60 +93,61 @@ vector <Tp> Longest_Common_Subsequence_Continuous(const vector <Tp> & s1, const
maxj = 0 ;
max = 0;
for( i = 0; i < n ; ++i ) {
- for( j = m - 1 ; j >= 0 ; --j ) {
- if( func (s2[i], s1[j] ) ) {
- if ( i == 0 || j == 0 )
- c[j] = 1;
- else
- c[j] = c[j-1] + 1;
- }
- else
- c[j]=0;
- if( c[j] > max ) {
- max = c[j];
- maxj = j;
- }
- }
+ for( j = m - 1 ; j >= 0 ; --j ) {
+ if( func (s2[i], s1[j] ) ) {
+ if ( i == 0 || j == 0 )
+ c[j] = 1;
+ else
+ c[j] = c[j-1] + 1;
+ }
+ else
+ c[j]=0;
+ if( c[j] > max ) {
+ max = c[j];
+ maxj = j;
+ }
+ }
}
if( max == 0 )
- return emptyresult;
+ return emptyresult;
vector <Tp> ss ( emptyresult);
- for( j = maxj - max + 1; j <= maxj ; ++j )
- ss.push_back( s1[j] );
- return ss;
+ for( j = maxj - max + 1; j <= maxj ; ++j )
+ ss.push_back( s1[j] );
+ return ss;
}
inline bool
-cSentence_JWComparator:: operator()(const string * ps1, const string * ps2) const {
+cSentence_JWComparator::operator()(const string * ps1, const string * ps2) const {
const double compres = strcmp95_modified(ps1->c_str(), ps2->c_str());
return compres > threshold;
};
-char *
+char *
extract_initials(char * dest, const char * source) {
- if ( source == NULL || dest == NULL )
- return NULL;
- char * ret = dest;
- static const char delim = ' ';
- while ( *source != '\0') {
- while ( *source == delim )
- ++source;
- *dest++ = *source;
- while ( *source != delim && *source != '\0' )
- ++source;
- }
- *dest = '\0';
- return ret;
+ if ( source == NULL || dest == NULL )
+ return NULL;
+ char * ret = dest;
+ static const char delim = ' ';
+ while ( *source != '\0') {
+ while ( *source == delim )
+ ++source;
+ *dest++ = *source;
+ while ( *source != delim && *source != '\0' )
+ ++source;
+ }
+ *dest = '\0';
+ return ret;
};
int
-nospacecmp(const char* str1, const char* str2){
+nospacecmp(const char* str1, const char* str2) {
+
const char *c1, *c2;
const char delim = ' ';
for(c1 = str1, c2=str2; (*c1 != '\0') && (*c2 != '\0'); ++c1, ++c2 ){
@@ -166,7 +162,8 @@ nospacecmp(const char* str1, const char* str2){
int
-jwcmp_old(const string & str1, const string& str2){
+jwcmp_old(const string & str1, const string& str2) {
+
const char *delim= " ";
const unsigned int delim_size = strlen(delim);
const double threshold = 0.95;
@@ -183,144 +180,156 @@ jwcmp_old(const string & str1, const string& str2){
size_t pos1, prev_pos1, pos2, prev_pos2;
pos1 = prev_pos1 = 0;
num_tok1 = 0;
- do {
- tok_score = 0;
- pos1 = str1.find(delim, prev_pos1);
- string temp1 = str1.substr(prev_pos1, pos1 - prev_pos1);
- tok_len1 = temp1.size();
- num_tok1 += (tok_len1 > 1);
-
- pos2 = prev_pos2 = 0;
- num_tok2 = 0;
- do {
- pos2 = str2.find(delim, prev_pos2);
- string temp2 = str2.substr(prev_pos2, pos2 - prev_pos2);
- tok_len2 = temp2.size();
- num_tok2 += (tok_len2 > 1);
- tok_score = max_val<int>(tok_score,
- ((min_val<int>(tok_len1, tok_len2) <= 1) ? 0 : strcmp95_modified(temp1.c_str(), temp2.c_str())));
-
- prev_pos2 = pos2 + delim_size;
- } while ( pos2!= string::npos);
- score += (tok_score > threshold);
-
- prev_pos1 = pos1 + delim_size;
- } while ( pos1!= string::npos);
-
- int min_num_tok = min_val<int>(num_tok1, num_tok2);
- double myres = ( min_num_tok == 0) ? 0 : score/min_num_tok;
- int is_same_len = (num_tok1 == num_tok2) ? 1 : 0;
- return( 2*(myres >= 0.33) + (myres >= 0.66) + (myres > 0.99) + (myres > 0.99 && min_num_tok >= 2) + (myres > 0.99 && is_same_len));
+ do {
+ tok_score = 0;
+ pos1 = str1.find(delim, prev_pos1);
+ string temp1 = str1.substr(prev_pos1, pos1 - prev_pos1);
+ tok_len1 = temp1.size();
+ num_tok1 += (tok_len1 > 1);
+
+ pos2 = prev_pos2 = 0;
+ num_tok2 = 0;
+ do {
+ pos2 = str2.find(delim, prev_pos2);
+ string temp2 = str2.substr(prev_pos2, pos2 - prev_pos2);
+ tok_len2 = temp2.size();
+ num_tok2 += (tok_len2 > 1);
+ tok_score = max_val<int>(tok_score,
+ ((min_val<int>(tok_len1, tok_len2) <= 1) ? 0 : strcmp95_modified(temp1.c_str(), temp2.c_str())));
+
+ prev_pos2 = pos2 + delim_size;
+ } while ( pos2!= string::npos);
+ score += (tok_score > threshold);
+
+ prev_pos1 = pos1 + delim_size;
+ } while ( pos1!= string::npos);
+
+ int min_num_tok = min_val<int>(num_tok1, num_tok2);
+ double myres = ( min_num_tok == 0) ? 0 : score/min_num_tok;
+ int is_same_len = (num_tok1 == num_tok2) ? 1 : 0;
+ return( 2*(myres >= 0.33) + (myres >= 0.66) + (myres > 0.99) + (myres > 0.99 && min_num_tok >= 2) + (myres > 0.99 && is_same_len));
}
-int jwcmp(const string & str1, const string& str2) {
- if ( str1.empty() || str2.empty() )
- return 0;
- double cmpres = strcmp95_modified(str1.c_str(), str2.c_str());
- register int score = 0;
- if ( cmpres > 0.7 )
- ++score;
- if ( cmpres > 0.8 )
- ++score;
- if ( cmpres > 0.9 )
- ++score;
- if ( cmpres > 0.95 )
- ++score;
- if ( cmpres > 0.99 )
- ++score;
-
- return score;
+/**
+ * Converts the strcmp95_modified() similarity of two strings into a 0-5
+ * score. Returns 0 when either string is empty; otherwise adds one point for
+ * each of the thresholds 0.7, 0.8, 0.9, 0.95 and 0.99 that the similarity
+ * strictly exceeds.
+ */
+int
+jwcmp(const string & str1, const string& str2) {
+
+ if ( str1.empty() || str2.empty() )
+ return 0;
+
+ double cmpres = strcmp95_modified(str1.c_str(), str2.c_str());
+ register int score = 0; // NOTE(review): `register` was removed in C++17
+ if ( cmpres > 0.7 )
+ ++score;
+ if ( cmpres > 0.8 )
+ ++score;
+ if ( cmpres > 0.9 )
+ ++score;
+ if ( cmpres > 0.95 )
+ ++score;
+ if ( cmpres > 0.99 )
+ ++score;
+
+ return score;
}
-int midnamecmp_old(const string & str1, const string & str2 ){
-
- const char * delim = " ";
- const unsigned int delim_size = strlen(delim);
- int num_names_1 = 0, num_names_2 = 0;
- double matches = 0;
- size_t pos1, prev_pos1, pos2, prev_pos2;
- pos1 = prev_pos1 = 0;
- while ( ( pos1 = str1.find(delim, prev_pos1)) != string::npos ) {
- ++ num_names_1;
- pos2 = prev_pos2 = 0;
- while ( ( pos2 = str2.find(delim, prev_pos2)) != string::npos ) {
- ++num_names_2;
- if ( str1.at(pos1 + delim_size) == str2.at(pos2 + delim_size) )
- matches += 1;
- prev_pos2 = pos2 + delim_size;
- }
- prev_pos1 = pos1 + delim_size;
- }
-
- int min_num = min_val<int>(num_names_1, num_names_2);
- int missing = ( min_num == 0 )? 1:0 ;
- double raw = missing? 0: matches/min_num ;
- return (missing + 2*(raw > 0.33) + (raw > 0.67) + (raw > 0.99));
+/**
+ * Older scoring routine (per its name). For every space in str1 it scans
+ * every space in str2 and counts a match when the characters immediately
+ * after the two spaces are equal.
+ *
+ * Returns 1 when either string contains no space ("missing"); otherwise
+ * 2*(raw > 0.33) + (raw > 0.67) + (raw > 0.99), where raw is
+ * matches / min(num_names_1, num_names_2).
+ * The .at() calls throw std::out_of_range when a string ends with a space.
+ * NOTE(review): num_names_2 is never reset, so it counts the spaces of str2
+ * once per space of str1 -- confirm this is intended.
+ */
+int
+midnamecmp_old(const string & str1, const string & str2 ) {
+
+ const char * delim = " ";
+ const unsigned int delim_size = strlen(delim);
+ int num_names_1 = 0, num_names_2 = 0;
+ double matches = 0;
+ size_t pos1, prev_pos1, pos2, prev_pos2;
+ pos1 = prev_pos1 = 0;
+ while ( ( pos1 = str1.find(delim, prev_pos1)) != string::npos ) {
+ ++ num_names_1;
+ pos2 = prev_pos2 = 0; // restart the scan of str2 for each space in str1
+ while ( ( pos2 = str2.find(delim, prev_pos2)) != string::npos ) {
+ ++num_names_2;
+ if ( str1.at(pos1 + delim_size) == str2.at(pos2 + delim_size) )
+ matches += 1;
+ prev_pos2 = pos2 + delim_size;
+ }
+ prev_pos1 = pos1 + delim_size;
+ }
+
+ int min_num = min_val<int>(num_names_1, num_names_2);
+ int missing = ( min_num == 0 )? 1:0 ;
+ double raw = missing? 0: matches/min_num ; // guarded against division by zero
+ return (missing + 2*(raw > 0.33) + (raw > 0.67) + (raw > 0.99));
}
-int midnamecmp_old2(const string & str1, const string & str2 ){
-
- static std::equal_to<char> char_compare;
- /*
- const char * delim = " ";
- const unsigned int delim_size = strlen(delim);
-
- size_t pos, prev_pos;
- pos = prev_pos = 0;
- vector < char > vec1, vec2;
- while ( ( pos = str1.find(delim, prev_pos)) != string::npos ) {
- prev_pos = pos + delim_size;
- vec1.push_back(str1.at(prev_pos));
- }
- pos = prev_pos = 0;
- while ( ( pos = str2.find(delim, prev_pos)) != string::npos ) {
- prev_pos = pos + delim_size;
- vec2.push_back(str2.at(prev_pos));
- }
- */
- const vector < char > vec1(str1.begin(), str1.end() );
- const vector < char > vec2(str2.begin(), str2.end() );
-
- if ( vec1.empty() && vec2.empty() )
- return 2;
-
- if ( vec1.empty() || vec2.empty() )
- return 1;
-
- int score;
- const int matches = Longest_Common_Subsequence_Continuous<char, std::equal_to<char> >(vec1, vec2, char_compare).size();
-
- if ( matches == min_val<int>(str1.size(), str2.size() ) )
- score = 3;
- else
- score = 0;
-
- return score;
+/**
+ * Older scoring routine (per its name). Returns 2 when both strings are
+ * empty and 1 when exactly one is empty. Otherwise it treats both strings as
+ * character vectors and returns 3 when the result of
+ * Longest_Common_Subsequence_Continuous() is as long as the shorter string,
+ * else 0.
+ */
+int
+midnamecmp_old2(const string & str1, const string & str2 ) {
+
+ static std::equal_to<char> char_compare;
+ /*
+ const char * delim = " ";
+ const unsigned int delim_size = strlen(delim);
+
+ size_t pos, prev_pos;
+ pos = prev_pos = 0;
+ vector < char > vec1, vec2;
+ while ( ( pos = str1.find(delim, prev_pos)) != string::npos ) {
+ prev_pos = pos + delim_size;
+ vec1.push_back(str1.at(prev_pos));
+ }
+ pos = prev_pos = 0;
+ while ( ( pos = str2.find(delim, prev_pos)) != string::npos ) {
+ prev_pos = pos + delim_size;
+ vec2.push_back(str2.at(prev_pos));
+ }
+ */
+ const vector < char > vec1(str1.begin(), str1.end() );
+ const vector < char > vec2(str2.begin(), str2.end() );
+
+ if ( vec1.empty() && vec2.empty() )
+ return 2;
+
+ if ( vec1.empty() || vec2.empty() )
+ return 1;
+
+ int score;
+ const int matches = Longest_Common_Subsequence_Continuous<char, std::equal_to<char> >(vec1, vec2, char_compare).size();
+
+ if ( matches == min_val<int>(str1.size(), str2.size() ) )
+ score = 3;
+ else
+ score = 0;
+
+ return score;
}
-int midnamecmp ( const string & s1, const string & s2) {
- if ( s1.empty() && s2.empty() )
- return 2;
- if ( s1.empty() || s2.empty() )
- return 1;
+int
+midnamecmp (const string & s1, const string & s2) {
+ /* Scores two strings: 2 when both are empty, 1 when exactly one is empty,
+ * 0 when they differ at some position before the shorter string ends, and
+ * 3 otherwise -- so a string that is a prefix of the other also scores 3. */
+ if ( s1.empty() && s2.empty() )
+ return 2;
- const char * p1 = s1.c_str();
- const char * p2 = s2.c_str();
+ if ( s1.empty() || s2.empty() )
+ return 1;
- while ( *p1 != '\0' && *p2 != '\0') {
- if ( *p1++ != * p2++)
- return 0;
- }
- return 3;
+ const char * p1 = s1.c_str();
+ const char * p2 = s2.c_str();
+
+ while ( *p1 != '\0' && *p2 != '\0') {
+ if ( *p1++ != * p2++)
+ return 0;
+ }
+ return 3;
}
+int
+distcmp(const string & inputlat1, const string & inputlon1, const string & inputctry1, const char * inputstreet1,
+ const string & inputlat2, const string & inputlon2, const string & inputctry2, const char * inputstreet2) {
-int distcmp(const string & inputlat1, const string & inputlon1, const string & inputctry1, const char * inputstreet1,
- const string & inputlat2, const string & inputlon2, const string & inputctry2, const char * inputstreet2 ){
- /*
+ /*
// printf("DISTCOMP:\n");
// Extreme points of contiguous 48
double northernmost=4938;
@@ -357,8 +366,8 @@ int distcmp(const string & inputlat1, const string & inputlon1, const string & i
2*(dist < 100) + (dist < 75) + (dist < 50) + (dist < 10);
*/
- static const double R = 3963.0; //radius of the earth is 6378.1km = 3963 miles
- static const double DEG2RAD = 5729.58;
+ static const double R = 3963.0; //radius of the earth is 6378.1km = 3963 miles
+ static const double DEG2RAD = 5729.58;
static const double northernmost = 4938;
static const double southernmost = 2454;
static const double easternmost = -6695;
@@ -371,11 +380,11 @@ int distcmp(const string & inputlat1, const string & inputlon1, const string & i
const double missing_val = 0.0001;
int missing = ( ( fabs(lat1) < missing_val && fabs(lon1) < missing_val ) ||
- ( fabs(lat2) < missing_val && fabs(lon2) < missing_val) ) ? 1 : 0;
- int in_us = ( lat1 < northernmost && lat1 > southernmost &&
- lon1 < easternmost && lon1 > westernmost &&
- lat2 < northernmost && lat2 > southernmost &&
- lon2 < easternmost && lon2 > westernmost ) ? 1 : 0;
+ ( fabs(lat2) < missing_val && fabs(lon2) < missing_val) ) ? 1 : 0;
+ int in_us = ( lat1 < northernmost && lat1 > southernmost &&
+ lon1 < easternmost && lon1 > westernmost &&
+ lat2 < northernmost && lat2 > southernmost &&
+ lon2 < easternmost && lon2 > westernmost ) ? 1 : 0;
const double radlat1 = lat1/DEG2RAD;
const double radlon1 = lon1/DEG2RAD;
@@ -419,15 +428,24 @@ int distcmp(const string & inputlat1, const string & inputlon1, const string & i
}
+/**
+ * Scores how close two latitude/longitude pairs are.
+ *
+ * Known issues:
+ * - This function should take float arguments, because floats are supplied
+ *   in the schema. Somewhere upstream they are turned into strings, which
+ *   then have to be turned back into floats here.
+ * - 0 lat, 0 lon is a viable location (in the Gulf of Guinea, in fact), so
+ *   this function should not imply that such a point is "missing."
+ * - Given that this computes a distance, returning anything other than zero
+ *   for zero distance is really disturbing.
+ */
+int
+latloncmp(const string & inputlat1, const string & inputlon1,
+ const string & inputlat2, const string & inputlon2 ) {
-
-int latloncmp(const string & inputlat1, const string & inputlon1,
- const string & inputlat2, const string & inputlon2 ){
-
- static const double R = 3963.0; //radius of the earth is 6378.1km = 3963 miles
- static const double pi = 3.1415926;
- //rad = degree * pi / 180
- static const double DEG2RAD = pi / 180 ;
+ static const double R = 3963.0; //radius of the earth is 6378.1km = 3963 miles
+ static const double pi = 3.1415926;
+ //rad = degree * pi / 180
+ static const double DEG2RAD = pi / 180 ;
const double lat1 = atof(inputlat1.c_str());
const double lon1 = atof(inputlon1.c_str());
@@ -436,10 +454,10 @@ int latloncmp(const string & inputlat1, const string & inputlon1,
const double missing_val = 0.0001;
int missing = ( ( fabs(lat1) < missing_val && fabs(lon1) < missing_val ) ||
- ( fabs(lat2) < missing_val && fabs(lon2) < missing_val) ) ? 1 : 0;
+ ( fabs(lat2) < missing_val && fabs(lon2) < missing_val) ) ? 1 : 0;
if ( missing )
- return 1;
+ return 1;
const double radlat1 = lat1 * DEG2RAD;
const double radlon1 = lon1 * DEG2RAD;
@@ -476,32 +494,38 @@ int latloncmp(const string & inputlat1, const string & inputlon1,
if ( dist < 1.0 )
- return 5;
+ return 5;
else if ( dist < 10 )
- return 4;
+ return 4;
else if ( dist < 25)
- return 3;
+ return 3;
else if ( dist < 50 )
- return 2;
+ return 2;
else
- return 1;
-
+ return 1;
}
-int streetcmp(const string& inputstreet1, const string& inputstreet2) {
- int streetmatch = ( inputstreet1.size() != 0 && inputstreet2.size() != 0 && (inputstreet1 == inputstreet2 )) ? 1 : 0;
+/**
+ * Returns 1 when both street strings are non-empty and exactly equal,
+ * otherwise 0. Two empty strings therefore do not count as a match.
+ */
+int
+streetcmp(const string & inputstreet1, const string & inputstreet2) {
+
+ int streetmatch = ( inputstreet1.size() != 0 && inputstreet2.size() != 0
+ && (inputstreet1 == inputstreet2 )) ? 1 : 0;
return streetmatch;
}
-int countrycmp(const string & country1, const string & country2 ) {
- static const string US_label ("US");
- int score = 0;
- if ( country1 == country2 ) {
- ++score;
- if ( country1 == US_label)
- ++score;
- }
- return score;
+/**
+ * Returns 0 when the two country strings differ, 1 when they are equal, and
+ * 2 when they are equal and the value is "US". Emptiness is not checked, so
+ * two empty strings score 1.
+ */
+int
+countrycmp(const string & country1, const string & country2 ) {
+
+ static const string US_label ("US");
+ int score = 0;
+ if ( country1 == country2 ) {
+ ++score;
+ if ( country1 == US_label) // extra point when the shared country is the US
+ ++score;
+ }
+ return score;
}
@@ -512,7 +536,7 @@ classcmp(const string & class1, const string & class2) {
}
-int
+int
coauthorcmp(const string & coauthor1, const string & coauthor2) {
return (coauthor1 == coauthor2) ? 1 : 0;
@@ -573,7 +597,7 @@ asgcmp_to_test(const vector <string> & asg1,
map<string, std::pair<string, unsigned int> >::const_iterator p1, p2;
p1 = asg_table_pointer->find(asg1.at(0));
p2 = asg_table_pointer->find(asg2.at(0));
-
+
if ( p1 == asg_table_pointer->end() || p2 == asg_table_pointer->end() ) {
std::cout << "Error: either assignee is not found in the assignee tree. "
<< asg1.at(0) << " or " << asg2.at(0) << std::endl;
@@ -597,13 +621,13 @@ asgcmp_to_test(const vector <string> & asg1,
for ( ++q1; q1 != asg1.end(); ++q1 )
vec_asg1.push_back(&(*q1));
-
+
vector < const string * > vec_asg2;
vector < string >::const_iterator q2 = asg2.begin();
for ( ++q2; q2 != asg2.end(); ++q2 )
vec_asg2.push_back(&(*q2));
-
+
score = Longest_Common_Subsequence_Incontinuous <const string *, cSentence_JWComparator>(vec_asg1, vec_asg2, sjw).size();
}
return score;
View
1  test/.gitignore
@@ -1,3 +1,4 @@
+comparators
stringmanipulator
attribute
blocking
View
4 test/Makefile.am
@@ -5,7 +5,8 @@ LDADD = ../src/libdisambiguation.a
AM_LDFLAGS = $(shell cppunit-config --libs)
-TESTS = distest idtest typedefs record blocking attribute stringmanipulator
+TESTS = distest idtest typedefs record blocking attribute stringmanipulator \
+ comparators
bin_PROGRAMS = $(TESTS)
distest_SOURCES = test_initial.cpp
@@ -13,5 +14,6 @@ idtest_SOURCES = test_uniqueid.cpp
typedefs_SOURCES = test_typedefs.cpp
record_SOURCES = test_record.cpp
attribute_SOURCES = test_attribute.cpp
+comparators_SOURCES = test_comparators.cpp
blocking_SOURCES = test_blocking.cpp
stringmanipulator_SOURCES = test_string_manipulator.cpp
View
2  test/test_blocking.cpp
@@ -8,7 +8,7 @@
#include <DisambigEngine.h>
#include <DisambigCluster.h>
-#include <DisambigComp.h>
+//#include <DisambigComp.h>
#include <attribute.h>
#include <DisambigEngine.h>
#include <DisambigFileOper.h>
View
56 test/test_comparators.cpp
@@ -0,0 +1,56 @@
+
+#include <cppunit/Portability.h>
+#include <cppunit/portability/CppUnitSet.h>
+#include <cppunit/extensions/TestFactory.h>
+#include <cppunit/TestCase.h>
+#include <string>
+using std::string;
+
+#include <comparators.h>
+
+/**
+ * Smoke tests for latloncmp() from comparators.h. The checks are plain
+ * member functions that main() calls one after another.
+ */
+class ComparatorsTest : public CppUnit::TestCase {
+
+public:
+ ComparatorsTest(std::string name) : CppUnit::TestCase(name) {}
+
+ /**
+ * Pins the current result for two identical points at (0, 0): 1, not 0.
+ * It is surprising for a distance measure to return anything other
+ * than zero for zero distance. See the comments on latloncmp() in
+ * comparators.cpp.
+ */
+ void test_zero() {
+ string lat1("0.0");
+ string lon1("0.0");
+ string lat2("0.0");
+ string lon2("0.0");
+ int distance = latloncmp(lat1, lon1, lat2, lon2);
+ CPPUNIT_ASSERT(distance == 1);
+ }
+ /* Two identical non-zero points must get the top score of 5. Despite its
+ * name, `distance` holds a score, not a distance. */
+ void test_latloncmp() {
+ string lat1("38.38");
+ string lon1("102.102");
+ string lat2("38.38");
+ string lon2("102.102");
+ int distance = latloncmp(lat1, lon1, lat2, lon2);
+ CPPUNIT_ASSERT(distance == 5);
+ }
+ /* Placeholder check that always passes. */
+ void runTest() {
+ // Just to get started...
+ CPPUNIT_ASSERT( 1 == 1 );
+ //delete_attribute();
+ }
+};
+/**
+ * Test driver: builds one ComparatorsTest and calls its three checks
+ * directly, without a CppUnit test runner. Returns 0 when all calls return.
+ * NOTE(review): a failing CPPUNIT_ASSERT presumably throws, so a failure
+ * would show up as an uncaught exception rather than a non-zero return
+ * value -- confirm.
+ */
+int
+main(int argc, char ** argv) {
+
+ ComparatorsTest * ct = new ComparatorsTest(std::string("initial test"));
+ ct->runTest();
+ ct->test_latloncmp();
+ ct->test_zero();
+ delete ct;
+ return 0;
+}
View
2  test/test_typedefs.cpp
@@ -8,7 +8,7 @@
#include <DisambigEngine.h>
#include <DisambigCluster.h>
-#include <DisambigComp.h>
+//#include <DisambigComp.h>
#include <attribute.h>
#include <DisambigEngine.h>
#include <DisambigFileOper.h>
View
2  test/test_uniqueid.cpp
@@ -8,7 +8,7 @@
#include <DisambigEngine.h>
#include <DisambigCluster.h>
-#include <DisambigComp.h>
+//#include <comparators.h>
#include <attribute.h>
#include <DisambigEngine.h>
#include <DisambigFileOper.h>

No commit comments for this range

Something went wrong with that request. Please try again.