PEPed 8

davidsbatista · Jul 23, 2016 · 18c1ba2 · 18c1ba2
1 parent 767e908
commit 18c1ba2
Show file tree

Hide file tree

Showing 11 changed files with 513 additions and 772 deletions.
diff --git a/BREDS-parallel.py b/BREDS-parallel.py
diff --git a/BREDS-single.py b/BREDS-single.py
diff --git a/BREDS/Config.py b/BREDS/Config.py
@@ -1,9 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-__author__ = "David S. Batista"
-__email__ = "dsbatista@inesc-id.pt"
-
 import fileinput
 import re
 
@@ -13,10 +10,14 @@
 from BREDS.Seed import Seed
 from BREDS.ReVerb import Reverb
 
+__author__ = "David S. Batista"
+__email__ = "dsbatista@inesc-id.pt"
+
 
 class Config(object):
 
-    def __init__(self, config_file, positive_seeds, negative_seeds, similarity, confidance):
+    def __init__(self, config_file, positive_seeds, negative_seeds,
+                 similarity, confidence):
 
         # http://www.ling.upenn.edu/courses/Fall_2007/ling001/penn_treebank_pos.html
         # select everything except stopwords, ADJ and ADV
@@ -33,15 +34,17 @@ def __init__(self, config_file, positive_seeds, negative_seeds, similarity, conf
         self.stopwords = stopwords.words('english')
         self.lmtzr = WordNetLemmatizer()
         self.threshold_similarity = similarity
-        self.instance_confidance = confidance
+        self.instance_confidence = confidence
         self.reverb = Reverb()
         self.word2vec = None
         self.vec_dim = None
 
-        # simple tags, e.g.: <PER>Bill Gates</PER>
+        # simple tags, e.g.:
+        # <PER>Bill Gates</PER>
         self.regex_simple = re.compile('<[A-Z]+>[^<]+</[A-Z]+>', re.U)
 
-        #linked tags e.g.: <PER url=http://en.wikipedia.org/wiki/Mark_Zuckerberg>Zuckerberg</PER>
+        # linked tags e.g.:
+        # <PER url=http://en.wikipedia.org/wiki/Mark_Zuckerberg>Zuckerberg</PER>
         self.regex_linked = re.compile('<[A-Z]+ url=[^>]+>[^<]+</[A-Z]+>', re.U)
 
         for line in fileinput.input(config_file):
@@ -121,15 +124,17 @@ def __init__(self, config_file, positive_seeds, negative_seeds, similarity, conf
 
         print "\nParameters and Thresholds"
         print "threshold_similarity :", self.threshold_similarity
-        print "instance confidence  :", self.instance_confidance
+        print "instance confidence  :", self.instance_confidence
         print "min_pattern_support  :", self.min_pattern_support
         print "iterations           :", self.number_iterations
         print "iteration wUpdt      :", self.wUpdt
         print "\n"
 
     def read_word2vec(self):
         print "Loading word2vec model ...\n"
-        self.word2vec = Word2Vec.load_word2vec_format(self.word2vecmodelpath, binary=True)
+        self.word2vec = Word2Vec.load_word2vec_format(
+            self.word2vecmodelpath, binary=True
+        )
         self.vec_dim = self.word2vec.layer1_size
         print self.vec_dim, "dimensions"
 

diff --git a/BREDS/Pattern.py b/BREDS/Pattern.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import uuid
+
 __author__ = "David S. Batista"
 __email__ = "dsbatista@inesc-id.pt"
 
-import uuid
-
 
 class Pattern(object):
 
@@ -34,21 +34,26 @@ def __cmp__(self, other):
 
     def update_confidence(self, config):
         if self.positive > 0:
-            self.confidence = (float(self.positive) / float(self.positive +
-                                                            self.unknown * config.wUnk +
-                                                            self.negative * config.wNeg))
+            self.confidence = (
+                float(self.positive) / float(self.positive +
+                                             self.unknown * config.wUnk +
+                                             self.negative * config.wNeg
+                                             )
+            )
         elif self.positive == 0:
             self.confidence = 0
 
     def add_tuple(self, t):
         self.tuples.add(t)
 
-    # put all tuples with BET vectors into a set so that comparision with repeated vectors is eliminated
+    # put all tuples with BET vectors into a set so that comparison
+    # with repeated vectors is eliminated
     def merge_all_tuples_bet(self):
         self.bet_uniques_vectors = set()
         self.bet_uniques_words = set()
         for t in self.tuples:
-            # transform numpy array into a tuple so it can be hashed and added into a set
+            # transform numpy array into a tuple
+            # so it can be hashed and added into a set
             self.bet_uniques_vectors.add(tuple(t.bet_vector))
             self.bet_uniques_words.add(t.bet_words)
 
@@ -64,7 +69,6 @@ def update_selectivity(self, t, config):
                     break
 
         if matched_e1 is True and matched_both is False:
-            #print t.e1, '\t', t.e2, "->", t.bet_words
             self.negative += 1
 
         if matched_both is False:
@@ -76,4 +80,4 @@ def update_selectivity(self, t, config):
                         break
 
         if matched_both is False:
-            self.unknown += 1
+            self.unknown += 1
diff --git a/BREDS/ReVerb.py b/BREDS/ReVerb.py
@@ -1,17 +1,15 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-
-__author__ = "David S. Batista"
-__email__ = "dsbatista@inesc-id.pt"
-
 import fileinput
 import StringIO
 
 from nltk import pos_tag, word_tokenize
 from nltk.stem.wordnet import WordNetLemmatizer
 from nltk.tag.mapping import map_tag
-from nltk.tokenize.punkt import PunktWordTokenizer
+
+__author__ = "David S. Batista"
+__email__ = "dsbatista@inesc-id.pt"
 
 
 class Reverb(object):
@@ -47,7 +45,7 @@ def extract_reverb_patterns(text):
         """
 
         # split text into tokens
-        text_tokens = PunktWordTokenizer().tokenize(text)
+        text_tokens = word_tokenize(text)
 
         # tag the sentence, using the default NLTK English tagger
         # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
@@ -84,7 +82,8 @@ def extract_reverb_patterns(text):
                     i += 1
 
                 # W = (noun | adj | adv | pron | det)
-                while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV', 'PRON', 'DET']:
+                while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV',
+                                                    'PRON', 'DET']:
                     tmp.write(tags[i][0]+' ')
                     t = (tags[i][0], tags[i][1])
                     tmp_tags.append(t)
@@ -110,9 +109,10 @@ def extract_reverb_patterns_tagged_ptb(tagged_text):
         http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf
         """
 
-        # The pattern limits the relation to be a verb (e.g., invented), a verb followed immediately by
-        # a preposition (e.g., located in), or a verb followed by nouns, adjectives, or adverbs ending in a preposition
-        # (e.g., has an atomic weight of).
+        # The pattern limits the relation to be a verb (e.g., invented),
+        # a verb followed immediately by a preposition (e.g., located in),
+        # or a verb followed by nouns, adjectives, or adverbs ending in a
+        # preposition (e.g., has an atomic weight of).
 
         # V | V P | V W*P
         # V = verb particle? adv?
@@ -125,7 +125,8 @@ def extract_reverb_patterns_tagged_ptb(tagged_text):
         limit = len(tagged_text)-1
         tags = tagged_text
 
-        verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP', 'VBP|TO', 'VBZ', 'VP']
+        verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP',
+                'VBP|TO', 'VBZ', 'VP']
         adverb = ['RB', 'RBR', 'RBS', 'RB|RP', 'RB|VBG', 'WRB']
         particule = ['POS', 'PRT', 'TO', 'RP']
         noun = ['NN', 'NNP', 'NNPS', 'NNS', 'NN|NNS', 'NN|SYM', 'NN|VBG', 'NP']
@@ -176,11 +177,15 @@ def extract_reverb_patterns_tagged_ptb(tagged_text):
                 patterns_tags.append(tmp_tags)
             i += 1
 
-        # Finally, if the pattern matches multiple adjacent sequences, we merge them into a single relation phrase
-        # (e.g.,wants to extend). This refinement enables the model to readily handle relation phrases containing
-        # multiple verbs.
+        # Finally, if the pattern matches multiple adjacent sequences, we merge
+        # them into a single relation phrase (e.g., wants to extend).
+        #
+        # This refinement enables the model to readily handle relation phrases
+        # containing multiple verbs.
 
-        merged_patterns_tags = [item for sublist in patterns_tags for item in sublist]
+        merged_patterns_tags = [
+            item for sublist in patterns_tags for item in sublist
+            ]
         return merged_patterns_tags
 
     @staticmethod
@@ -190,9 +195,10 @@ def extract_reverb_patterns_ptb(text):
         http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf
         """
 
-        # The pattern limits the relation to be a verb (e.g., invented), a verb followed immediately by
-        # a preposition (e.g., located in), or a verb followed by nouns, adjectives, or adverbs ending in a preposition
-        # (e.g., has an atomic weight of).
+        # The pattern limits the relation to be a verb (e.g., invented),
+        # a verb followed immediately by a preposition (e.g., located in),
+        # or a verb followed by nouns, adjectives, or adverbs ending in a
+        # preposition (e.g., has an atomic weight of).
 
         # V | V P | V W*P
         # V = verb particle? adv?
@@ -211,7 +217,8 @@ def extract_reverb_patterns_ptb(text):
         limit = len(tags_ptb)-1
         tags = tags_ptb
 
-        verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP', 'VBP|TO', 'VBZ', 'VP']
+        verb = ['VB', 'VBD', 'VBD|VBN', 'VBG', 'VBG|NN', 'VBN', 'VBP',
+                'VBP|TO', 'VBZ', 'VP']
         adverb = ['RB', 'RBR', 'RBS', 'RB|RP', 'RB|VBG', 'WRB']
         particule = ['POS', 'PRT', 'TO', 'RP']
         noun = ['NN', 'NNP', 'NNPS', 'NNS', 'NN|NNS', 'NN|SYM', 'NN|VBG', 'NP']
@@ -260,36 +267,45 @@ def extract_reverb_patterns_ptb(text):
                 patterns_tags.append(tmp_tags)
             i += 1
 
-        # Finally, if the pattern matches multiple adjacent sequences, we merge them into a single relation phrase
-        # (e.g.,wants to extend). This refinement enables the model to readily handle relation phrases containing
-        # multiple verbs.
+        # Finally, if the pattern matches multiple adjacent sequences, we merge
+        # them into a single relation phrase (e.g., wants to extend).
+        # This refinement enables the model to readily handle relation
+        # phrases containing multiple verbs.
 
-        merged_patterns_tags = [item for sublist in patterns_tags for item in sublist]
+        merged_patterns_tags = [
+            item for sublist in patterns_tags for item in sublist
+            ]
         return merged_patterns_tags
 
     def detect_passive_voice(self, pattern):
         passive_voice = False
-        #TODO: há casos mais complexos, adjectivos ou adverbios pelo meio, por exemplo:
+
+        # TODO: there are more complex cases, with adjectives or adverbs
+        # in between, for example:
         # (to be) + (adj|adv) + past_verb + by
         # to be + past verb + by
+
         if len(pattern) >= 3:
             if pattern[0][1].startswith('V'):
                 verb = self.lmtzr.lemmatize(pattern[0][0], 'v')
                 if verb in self.aux_verbs:
-                    if (pattern[1][1] == 'VBN' or pattern[1][1] == 'VBD') and pattern[-1][0] == 'by':
+                    if (pattern[1][1] == 'VBN' or pattern[1][1] == 'VBD') \
+                            and pattern[-1][0] == 'by':
                         passive_voice = True
 
                     # past verb + by
-                    elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') and pattern[-1][0] == 'by':
+                    elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') \
+                            and pattern[-1][0] == 'by':
                         passive_voice = True
 
                 # past verb + by
-                elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') and pattern[-1][0] == 'by':
+                elif (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') \
+                        and pattern[-1][0] == 'by':
                         passive_voice = True
 
         # past verb + by
         elif len(pattern) >= 2:
-            if (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') and pattern[-1][0] == 'by':
+            if (pattern[-2][1] == 'VBN' or pattern[-2][1] == 'VBD') \
+                    and pattern[-1][0] == 'by':
                 passive_voice = True
 
         return passive_voice
@@ -311,4 +327,4 @@ def main():
     fileinput.close()
 
 if __name__ == "__main__":
-    main()
+    main()