Skip to content

Commit

Permalink
PEPed 8
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista committed Jul 23, 2016
1 parent 767e908 commit 18c1ba2
Show file tree
Hide file tree
Showing 11 changed files with 513 additions and 772 deletions.
334 changes: 217 additions & 117 deletions BREDS-parallel.py

Large diffs are not rendered by default.

186 changes: 119 additions & 67 deletions BREDS-single.py

Large diffs are not rendered by default.

23 changes: 14 additions & 9 deletions BREDS/Config.py
@@ -1,9 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "David S. Batista"
__email__ = "dsbatista@inesc-id.pt"

import fileinput
import re

Expand All @@ -13,10 +10,14 @@
from BREDS.Seed import Seed
from BREDS.ReVerb import Reverb

__author__ = "David S. Batista"
__email__ = "dsbatista@inesc-id.pt"


class Config(object):

def __init__(self, config_file, positive_seeds, negative_seeds, similarity, confidance):
def __init__(self, config_file, positive_seeds, negative_seeds,
similarity, confidence):

# http://www.ling.upenn.edu/courses/Fall_2007/ling001/penn_treebank_pos.html
# select everything except stopwords, ADJ and ADV
Expand All @@ -33,15 +34,17 @@ def __init__(self, config_file, positive_seeds, negative_seeds, similarity, conf
self.stopwords = stopwords.words('english')
self.lmtzr = WordNetLemmatizer()
self.threshold_similarity = similarity
self.instance_confidance = confidance
self.instance_confidence = confidence
self.reverb = Reverb()
self.word2vec = None
self.vec_dim = None

# simple tags, e.g.: <PER>Bill Gates</PER>
# simple tags, e.g.:
# <PER>Bill Gates</PER>
self.regex_simple = re.compile('<[A-Z]+>[^<]+</[A-Z]+>', re.U)

#linked tags e.g.: <PER url=http://en.wikipedia.org/wiki/Mark_Zuckerberg>Zuckerberg</PER>
# linked tags e.g.:
# <PER url=http://en.wikipedia.org/wiki/Mark_Zuckerberg>Zuckerberg</PER>
self.regex_linked = re.compile('<[A-Z]+ url=[^>]+>[^<]+</[A-Z]+>', re.U)

for line in fileinput.input(config_file):
Expand Down Expand Up @@ -121,15 +124,17 @@ def __init__(self, config_file, positive_seeds, negative_seeds, similarity, conf

print "\nParameters and Thresholds"
print "threshold_similarity :", self.threshold_similarity
print "instance confidence :", self.instance_confidance
print "instance confidence :", self.instance_confidence
print "min_pattern_support :", self.min_pattern_support
print "iterations :", self.number_iterations
print "iteration wUpdt :", self.wUpdt
print "\n"

def read_word2vec(self):
print "Loading word2vec model ...\n"
self.word2vec = Word2Vec.load_word2vec_format(self.word2vecmodelpath, binary=True)
self.word2vec = Word2Vec.load_word2vec_format(
self.word2vecmodelpath, binary=True
)
self.vec_dim = self.word2vec.layer1_size
print self.vec_dim, "dimensions"

Expand Down
22 changes: 13 additions & 9 deletions BREDS/Pattern.py
@@ -1,11 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import uuid

__author__ = "David S. Batista"
__email__ = "dsbatista@inesc-id.pt"

import uuid


class Pattern(object):

Expand Down Expand Up @@ -34,21 +34,26 @@ def __cmp__(self, other):

def update_confidence(self, config):
if self.positive > 0:
self.confidence = (float(self.positive) / float(self.positive +
self.unknown * config.wUnk +
self.negative * config.wNeg))
self.confidence = (
float(self.positive) / float(self.positive +
self.unknown * config.wUnk +
self.negative * config.wNeg
)
)
elif self.positive == 0:
self.confidence = 0

def add_tuple(self, t):
self.tuples.add(t)

# put all tuples with BET vectors into a set so that comparision with repeated vectors is eliminated
# put all tuples with BET vectors into a set so that comparison
# with repeated vectors is eliminated
def merge_all_tuples_bet(self):
self.bet_uniques_vectors = set()
self.bet_uniques_words = set()
for t in self.tuples:
# transform numpy array into a tuple so it can be hashed and added into a set
# transform numpy array into a tuple
# so it can be hashed and added into a set
self.bet_uniques_vectors.add(tuple(t.bet_vector))
self.bet_uniques_words.add(t.bet_words)

Expand All @@ -64,7 +69,6 @@ def update_selectivity(self, t, config):
break

if matched_e1 is True and matched_both is False:
#print t.e1, '\t', t.e2, "->", t.bet_words
self.negative += 1

if matched_both is False:
Expand All @@ -76,4 +80,4 @@ def update_selectivity(self, t, config):
break

if matched_both is False:
self.unknown += 1
self.unknown += 1
74 changes: 45 additions & 29 deletions BREDS/ReVerb.py
@@ -1,17 +1,15 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


__author__ = "David S. Batista"
__email__ = "dsbatista@inesc-id.pt"

import fileinput
import StringIO

from nltk import pos_tag, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag.mapping import map_tag
from nltk.tokenize.punkt import PunktWordTokenizer

__author__ = "David S. Batista"
__email__ = "dsbatista@inesc-id.pt"


class Reverb(object):
Expand Down Expand Up @@ -47,7 +45,7 @@ def extract_reverb_patterns(text):
"""

# split text into tokens
text_tokens = PunktWordTokenizer().tokenize(text)
text_tokens = word_tokenize(text)

# tag the sentence, using the default NLTK English tagger
# POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
Expand Down Expand Up @@ -84,7 +82,8 @@ def extract_reverb_patterns(text):
i += 1

# W = (noun | adj | adv | pron | det)
while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV', 'PRON', 'DET']:
while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV',
'PRON', 'DET']:
tmp.write(tags[i][0]+' ')
t = (tags[i][0], tags[i][1])
tmp_tags.append(t)
Expand All @@ -110,9 +109,10 @@ def extract_reverb_patterns_tagged_ptb(tagged_text):
http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf
"""

# The pattern limits the relation to be a verb (e.g., invented), a verb followed immediately by
# a preposition (e.g., located in), or a verb followed by nouns, adjectives, or adverbs ending in a preposition
# (e.g., has an atomic weight of).
# The pattern limits the relation to be a verb (e.g., invented),
# a verb followed immediately by a preposition (e.g., located in),
# or a verb followed by nouns, adjectives, or adverbs ending in a
# preposition (e.g., has an atomic weight of).

# V | V P | V W*P
# V = verb particle? adv?
Expand All @@ -125,7 +125,8 @@ def extract_reverb_patterns_tagged_ptb(tagged_text):
limit = len(tagged_text)-1
tags = tagged_text

verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP', 'VBP|TO', 'VBZ', 'VP']
verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP',
'VBP|TO', 'VBZ', 'VP']
adverb = ['RB', 'RBR', 'RBS', 'RB|RP', 'RB|VBG', 'WRB']
particule = ['POS', 'PRT', 'TO', 'RP']
noun = ['NN', 'NNP', 'NNPS', 'NNS', 'NN|NNS', 'NN|SYM', 'NN|VBG', 'NP']
Expand Down Expand Up @@ -176,11 +177,15 @@ def extract_reverb_patterns_tagged_ptb(tagged_text):
patterns_tags.append(tmp_tags)
i += 1

# Finally, if the pattern matches multiple adjacent sequences, we merge them into a single relation phrase
# (e.g.,wants to extend). This refinement enables the model to readily handle relation phrases containing
# multiple verbs.
# Finally, if the pattern matches multiple adjacent sequences, we merge
# them into a single relation phrase (e.g., wants to extend).
#
# This refinement enables the model to readily handle relation phrases
# containing multiple verbs.

merged_patterns_tags = [item for sublist in patterns_tags for item in sublist]
merged_patterns_tags = [
item for sublist in patterns_tags for item in sublist
]
return merged_patterns_tags

@staticmethod
Expand All @@ -190,9 +195,10 @@ def extract_reverb_patterns_ptb(text):
http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf
"""

# The pattern limits the relation to be a verb (e.g., invented), a verb followed immediately by
# a preposition (e.g., located in), or a verb followed by nouns, adjectives, or adverbs ending in a preposition
# (e.g., has an atomic weight of).
# The pattern limits the relation to be a verb (e.g., invented),
# a verb followed immediately by a preposition (e.g., located in),
# or a verb followed by nouns, adjectives, or adverbs ending in a
# preposition (e.g., has an atomic weight of).

# V | V P | V W*P
# V = verb particle? adv?
Expand All @@ -211,7 +217,8 @@ def extract_reverb_patterns_ptb(text):
limit = len(tags_ptb)-1
tags = tags_ptb

verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP', 'VBP|TO', 'VBZ', 'VP']
verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP',
'VBP|TO', 'VBZ', 'VP']
adverb = ['RB', 'RBR', 'RBS', 'RB|RP', 'RB|VBG', 'WRB']
particule = ['POS', 'PRT', 'TO', 'RP']
noun = ['NN', 'NNP', 'NNPS', 'NNS', 'NN|NNS', 'NN|SYM', 'NN|VBG', 'NP']
Expand Down Expand Up @@ -260,36 +267,45 @@ def extract_reverb_patterns_ptb(text):
patterns_tags.append(tmp_tags)
i += 1

# Finally, if the pattern matches multiple adjacent sequences, we merge them into a single relation phrase
# (e.g.,wants to extend). This refinement enables the model to readily handle relation phrases containing
# multiple verbs.
# Finally, if the pattern matches multiple adjacent sequences, we merge
# them into a single relation phrase (e.g., wants to extend).
# This refinement enables the model to readily handle relation
# phrases containing multiple verbs.

merged_patterns_tags = [item for sublist in patterns_tags for item in sublist]
merged_patterns_tags = [
item for sublist in patterns_tags for item in sublist
]
return merged_patterns_tags

def detect_passive_voice(self, pattern):
passive_voice = False
#TODO: há casos mais complexos, adjectivos ou adverbios pelo meio, por exemplo:

# TODO: there are more complex cases, with adjectives or adverbs in between, e.g.:
# (to be) + (adj|adv) + past_verb + by
# to be + past verb + by

if len(pattern) >= 3:
if pattern[0][1].startswith('V'):
verb = self.lmtzr.lemmatize(pattern[0][0], 'v')
if verb in self.aux_verbs:
if (pattern[1][1] == 'VBN' or pattern[1][1] == 'VBD') and pattern[-1][0] == 'by':
if (pattern[1][1] == 'VBN' or pattern[1][1] == 'VBD') \
and pattern[-1][0] == 'by':
passive_voice = True

# past verb + by
elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') and pattern[-1][0] == 'by':
elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') \
and pattern[-1][0] == 'by':
passive_voice = True

# past verb + by
elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') and pattern[-1][0] == 'by':
elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') \
and pattern[-1][0] == 'by':
passive_voice = True

# past verb + by
elif len(pattern) >= 2:
if (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') and pattern[-1][0] == 'by':
if (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') \
and pattern[-1][0] == 'by':
passive_voice = True

return passive_voice
Expand All @@ -311,4 +327,4 @@ def main():
fileinput.close()

if __name__ == "__main__":
main()
main()

0 comments on commit 18c1ba2

Please sign in to comment.