datquocnguyen · bact · Nov 16, 2019 · Nov 16, 2019 · Nov 17, 2019 · Nov 17, 2019
diff --git a/InitialTagger/InitialTagger.py b/InitialTagger/InitialTagger.py
@@ -2,32 +2,35 @@
 
 import re
 
+
 def initializeSentence(FREQDICT, sentence):
     words = sentence.strip().split()
     taggedSen = []
     for word in words:
-        if word in ["“", "”", "\""]:
-            #taggedSen.append("''/" + FREQDICT["''"])
+        if word in ["“", "”", '"']:
+            # taggedSen.append("''/" + FREQDICT["''"])
             if "''" in FREQDICT:
                 taggedSen.append("''/" + FREQDICT["''"])
             elif "." in FREQDICT:
                 taggedSen.append("''/" + FREQDICT["."])
             elif "," in FREQDICT:
                 taggedSen.append("''/" + FREQDICT[","])
             else:
-                print("\n'' is not in the dictionary \nManually add '' with a possible POS tag into the .DICT file!")
-                taggedSen.append("''/" + FREQDICT["''"])   
+                print(
+                    "\n'' is not in the dictionary \nManually add '' with a possible POS tag into the .DICT file!"
+                )
+                taggedSen.append("''/" + FREQDICT["''"])
             continue
-        
-        tag = ''
+
+        tag = ""
         decodedW = word
         lowerW = decodedW.lower()
         if word in FREQDICT:
             tag = FREQDICT[word]
         elif lowerW in FREQDICT:
             tag = FREQDICT[lowerW]
         else:
-            if re.search(r"[0-9]+", word) != None:
+            if re.search(r"[0-9]+", word) is not None:
                 tag = FREQDICT["TAG4UNKN-NUM"]
             else:
                 suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
@@ -39,11 +42,11 @@ def initializeSentence(FREQDICT, sentence):
                     suffixL4 = ".*" + decodedW[-4:]
                 if wLength >= 6:
                     suffixL5 = ".*" + decodedW[-5:]
-                
+
                 if suffixL5 in FREQDICT:
                     tag = FREQDICT[suffixL5]
                 elif suffixL4 in FREQDICT:
-                    tag = FREQDICT[suffixL4] 
+                    tag = FREQDICT[suffixL4]
                 elif suffixL3 in FREQDICT:
                     tag = FREQDICT[suffixL3]
                 elif suffixL2 in FREQDICT:
@@ -52,14 +55,15 @@ def initializeSentence(FREQDICT, sentence):
                     tag = FREQDICT["TAG4UNKN-CAPITAL"]
                 else:
                     tag = FREQDICT["TAG4UNKN-WORD"]
-           
-        taggedSen.append(word + "/" + tag)                                
-    
+
+        taggedSen.append(word + "/" + tag)
+
     return " ".join(taggedSen)
 
+
 def initializeCorpus(FREQDICT, inputFile, outputFile):
-    lines = open(inputFile, "r").readlines()
-    fileOut = open(outputFile, "w")
-    for line in lines:
-        fileOut.write(initializeSentence(FREQDICT, line) + "\n")
-    fileOut.close()
+    with open(inputFile, "r") as fileIn:  # close the input handle instead of leaking it
+        lines = fileIn.readlines()  # read fully before outputFile is opened, as before
+    with open(outputFile, "w") as fileOut:
+        for line in lines:
+            fileOut.write(initializeSentence(FREQDICT, line) + "\n")
diff --git a/InitialTagger/InitialTagger4En.py b/InitialTagger/InitialTagger4En.py
@@ -2,52 +2,71 @@
 
 import re
 
+
 def initializeEnSentence(FREQDICT, sentence):
     words = sentence.strip().split()
     taggedSen = []
     for word in words:
-        if word in ["“", "”", "\""]:
+        if word in ["“", "”", '"']:
             taggedSen.append("''/" + FREQDICT["''"])
             continue
-        
-        tag = ''
+
+        tag = ""
         lowerW = word.lower()
         if word in FREQDICT:
-            tag = FREQDICT[word] 
+            tag = FREQDICT[word]
         elif lowerW in FREQDICT:
-            tag = FREQDICT[lowerW] 
+            tag = FREQDICT[lowerW]
         else:
-            if (re.search(r"([0-9]+-)|(-[0-9]+)", word) != None):
+            if re.search(r"([0-9]+-)|(-[0-9]+)", word) is not None:
                 tag = "JJ"
-            elif (re.search(r"[0-9]+", word) != None):
+            elif re.search(r"[0-9]+", word) is not None:
                 tag = "CD"
-            elif (re.search(r'(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)', word) != None):
+            elif (
+                re.search(
+                    r"(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)",
+                    word,
+                )
+                is not None
+            ):
                 tag = "NN"
-            elif (re.search(r'.*s$', word) != None and word[0].islower()):
+            elif re.search(r".*s$", word) is not None and word[0].islower():
                 tag = "NNS"
-            elif (word[0].isupper()):
+            elif word[0].isupper():
                 tag = "NNP"
-            elif(re.search(r'(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)', word) != None):
+            elif (
+                re.search(
+                    r"(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)", word
+                )
+                is not None
+            ):
                 tag = "JJ"
-            elif (re.search(r'.*ing$', word) != None and word.find("-") < 0):
+            elif re.search(r".*ing$", word) is not None and word.find("-") < 0:
                 tag = "VBG"
-            elif (re.search(r'.*ed$', word) != None and word.find("-") < 0):
+            elif re.search(r".*ed$", word) is not None and word.find("-") < 0:
                 tag = "VBN"
-            elif (re.search(r'(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)', word) != None
-                  or word.find("-") > -1):
+            elif (
+                re.search(
+                    r"(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)",
+                    word,
+                )
+                is not None
+                or word.find("-") > -1
+            ):
                 tag = "JJ"
-            elif(re.search(r'.*ly$', word) != None):
+            elif re.search(r".*ly$", word) is not None:
                 tag = "RB"
             else:
-                tag = "NN" 
-                    
+                tag = "NN"
+
         taggedSen.append(word + "/" + tag)
-                                           
+
     return " ".join(taggedSen)
 
+
 def initializeEnCorpus(FREQDICT, inputFile, outputFile):
-    lines = open(inputFile, "r").readlines()
-    fileOut = open(outputFile, "w")
-    for line in lines:
-        fileOut.write(initializeEnSentence(FREQDICT, line) + "\n")
-    fileOut.close()
+    with open(inputFile, "r") as fileIn:  # close the input handle instead of leaking it
+        lines = fileIn.readlines()  # read fully before outputFile is opened, as before
+    with open(outputFile, "w") as fileOut:
+        for line in lines:
+            fileOut.write(initializeEnSentence(FREQDICT, line) + "\n")
diff --git a/InitialTagger/InitialTagger4Vn.py b/InitialTagger/InitialTagger4Vn.py
@@ -2,95 +2,98 @@
 
 import re
 
-def isAbbre(word):
 
-    #word = unicode(word, "utf-8")
-    for i in range(len(word)):
-        if isVnLowerChar(word[i]) or word[i] == "_":
-            return False
-    return True
+VNUPPERCHARS = [u"Ă", u"Â", u"Đ", u"Ê", u"Ô", u"Ơ", u"Ư"]
+VNLOWERCHARS = [u"ă", u"â", u"đ", u"ê", u"ô", u"ơ", u"ư"]
 
-VNUPPERCHARS = [u'Ă', u'Â', u'Đ', u'Ê', u'Ô', u'Ơ', u'Ư']
-VNLOWERCHARS = [u'ă', u'â', u'đ', u'ê', u'ô', u'ơ', u'ư']
 
 def isVnLowerChar(char):
     if char.islower() or char in VNLOWERCHARS:
-        return True;
-    return False;
+        return True
+    return False
+
 
 def isVnUpperChar(char):
     if char.isupper() or char in VNUPPERCHARS:
-        return True;
-    return False;
+        return True
+    return False
+
+
+def isAbbre(word):
+    for char in word:  # True only if no character is lowercase or "_"
+        if isVnLowerChar(char) or char == "_":
+            return False
+    return True
+
 
 def isVnProperNoun(word):
-    #word = unicode(word, "utf-8")
-    if (isVnUpperChar(word[0])):
+    if isVnUpperChar(word[0]):
         if word.count("_") >= 4:
             return True
         index = word.find("_")
         while index > 0 and index < len(word) - 1:
             if isVnLowerChar(word[index + 1]):
-                return False;
+                return False
             index = word.find("_", index + 1)
-        return True;
+        return True
     else:
-        return False;
+        return False
+
 
 def initializeVnSentence(FREQDICT, sentence):
     words = sentence.strip().split()
     taggedSen = []
     for word in words:
-        if word in ["“", "”", "\""]:
+        if word in ["“", "”", '"']:
             taggedSen.append("''/" + FREQDICT["''"])
             continue
-        
-        tag = ''
+
+        tag = ""
         decodedW = word
         lowerW = decodedW.lower()
         if word in FREQDICT:
             tag = FREQDICT[word]
         elif lowerW in FREQDICT:
             tag = FREQDICT[lowerW]
-        else:         
-                if (re.search(r"[0-9]+", word) != None):
-                    tag = FREQDICT["TAG4UNKN-NUM"]
-                elif(len(word) == 1 and isVnUpperChar(word[0])):
-                    tag = "Y"
-                elif (isAbbre(word)):
-                    tag = "Ny"
-                elif (isVnProperNoun(word)):
-                    tag = "Np"
+        else:
+            if re.search(r"[0-9]+", word) is not None:
+                tag = FREQDICT["TAG4UNKN-NUM"]
+            elif len(word) == 1 and isVnUpperChar(word[0]):
+                tag = "Y"
+            elif isAbbre(word):
+                tag = "Ny"
+            elif isVnProperNoun(word):
+                tag = "Np"
+            else:
+                suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
+                wLength = len(decodedW)
+                if wLength >= 4:
+                    suffixL3 = ".*" + decodedW[-3:]
+                    suffixL2 = ".*" + decodedW[-2:]
+                if wLength >= 5:
+                    suffixL4 = ".*" + decodedW[-4:]
+                if wLength >= 6:
+                    suffixL5 = ".*" + decodedW[-5:]
+
+                if suffixL5 in FREQDICT:
+                    tag = FREQDICT[suffixL5]
+                elif suffixL4 in FREQDICT:
+                    tag = FREQDICT[suffixL4]
+                elif suffixL3 in FREQDICT:
+                    tag = FREQDICT[suffixL3]
+                elif suffixL2 in FREQDICT:
+                    tag = FREQDICT[suffixL2]
                 else:
-                    suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
-                    wLength = len(decodedW)
-                    if wLength >= 4:
-                        suffixL3 = ".*" + decodedW[-3:]
-                        suffixL2 = ".*" + decodedW[-2:]
-                    if wLength >= 5:
-                        suffixL4 = ".*" + decodedW[-4:]
-                    if wLength >= 6:
-                        suffixL5 = ".*" + decodedW[-5:]
-
-                    if suffixL5 in FREQDICT:
-                        tag = FREQDICT[suffixL5]
-                    elif suffixL4 in FREQDICT:
-                        tag = FREQDICT[suffixL4]
-                    elif suffixL3 in FREQDICT:
-                        tag = FREQDICT[suffixL3]
-                    elif suffixL2 in FREQDICT:
-                        tag = FREQDICT[suffixL2]
-                    else:
-                        tag = FREQDICT["TAG4UNKN-WORD"]
-
-        taggedSen.append(word + "/" + tag) 
-
+                    tag = FREQDICT["TAG4UNKN-WORD"]
+
+        taggedSen.append(word + "/" + tag)
+
     return " ".join(taggedSen)
 
+
 def initializeVnCorpus(FREQDICT, inputFile, outputFile):
-    lines = open(inputFile, "r").readlines()
-    fileOut = open(outputFile, "w")
-    for line in lines:
-        fileOut.write(initializeVnSentence(FREQDICT, line) + "\n")
-    fileOut.close()
-
+    with open(inputFile, "r") as fileIn:  # close the input handle instead of leaking it
+        lines = fileIn.readlines()  # read fully before outputFile is opened, as before
+    with open(outputFile, "w") as fileOut:
+        for line in lines:
+            fileOut.write(initializeVnSentence(FREQDICT, line) + "\n")