Skip to content

Commit

Permalink
disallow data in proto-languages and malformed languages
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Speer committed Aug 22, 2014
1 parent 3afb172 commit 72556da
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
3 changes: 2 additions & 1 deletion DATA-CREDITS.txt
Expand Up @@ -126,7 +126,8 @@ It also uses data from Wikipedia, the free encyclopedia [wikipedia] via DBPedia
[dbpedia].

Wiktionary and Wikipedia are collaborative projects, authored by their
respective online communities. They are currently released under the Creative Commons Attribution-ShareAlike license [CC-By-SA-3].
respective online communities. They are currently released under the Creative
Commons Attribution-ShareAlike license [CC-By-SA-3].

Wikimedia encourages giving attribution by providing links to the hosted pages
that the data came from, and DBPedia asks for the same thing in turn. In the
Expand Down
16 changes: 15 additions & 1 deletion conceptnet5/wiktparse/rules.py
Expand Up @@ -115,6 +115,12 @@ class EdgeInfo(object):
are global to the Wiktionary entry we're parsing. We also don't represent
the word sense of the target word, because we never know what it is.
"""
# Until the time in the near future when we can actually check language
# codes for validity, here's a good estimate. It should start with a
# lowercase letter (so it's not an etymological abbreviation like LL),
# and it should contain only letters and dashes.
#
# The pattern ends with \Z so that the *whole* string has to qualify:
# `.match()` only anchors at the start, and without an end anchor a
# malformed code such as 'en_US' would be accepted on its 'en' prefix.
LANGUAGE_CODE_RE = re.compile(r'[a-z][-A-Za-z]+\Z')

def __init__(self, language, target, sense=None, rel=None):
self.language = language
self.target = target
Expand All @@ -141,6 +147,14 @@ def set_sense(self, sense):
def set_rel(self, rel):
    """
    Build a new EdgeInfo that carries over this one's language, target
    and sense, but uses `rel` as its relation. This object is not modified.
    """
    updated = EdgeInfo(
        self.language,
        self.target,
        self.sense,
        rel,
    )
    return updated

def check_validity(self):
    """
    Decide whether this edge is clean enough to be turned into output.

    The edge is rejected when any of the following holds:

    - its language is None (nothing to validate),
    - its language code ends in '-pro' (a proto-language),
    - its language code is not matched in full by LANGUAGE_CODE_RE,
    - its target is listed in BAD_NAMES_FOR_THINGS,
    - its target starts with '*'.

    Returns True if the edge passes every check, and False otherwise.
    """
    language = self.language
    # A missing language must be rejected up front: calling `.endswith`
    # or the regex on None would raise instead of filtering the edge out.
    if language is None or language.endswith('-pro'):
        return False

    # `.match()` only anchors at the start of the string, so additionally
    # require the match to run to the very end. Otherwise a malformed
    # code like 'en_US' would be accepted because of its 'en' prefix.
    found = self.LANGUAGE_CODE_RE.match(language)
    if found is None or found.end() != len(language):
        return False

    target = self.target
    return target not in BAD_NAMES_FOR_THINGS and not target.startswith('*')

def complete_edge(self, source_lang, rule_name, headlang, headword,
headpos=None):
if headpos is None:
Expand Down Expand Up @@ -372,7 +386,7 @@ def parse_structured_section(self, structure, headlang, headword,
[ei.complete_edge(self.default_language, rule, headlang,
headword, headpos)
for ei in edge_info
if ei.target not in BAD_NAMES_FOR_THINGS
if ei.check_validity()
and ei.language is not None]
)

Expand Down

0 comments on commit 72556da

Please sign in to comment.