Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Follows PEP 8 Python code convention and format #20

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
36 changes: 20 additions & 16 deletions InitialTagger/InitialTagger.py
Expand Up @@ -2,32 +2,35 @@

import re


def initializeSentence(FREQDICT, sentence):
words = sentence.strip().split()
taggedSen = []
for word in words:
if word in ["“", "”", "\""]:
#taggedSen.append("''/" + FREQDICT["''"])
if word in ["“", "”", '"']:
# taggedSen.append("''/" + FREQDICT["''"])
if "''" in FREQDICT:
taggedSen.append("''/" + FREQDICT["''"])
elif "." in FREQDICT:
taggedSen.append("''/" + FREQDICT["."])
elif "," in FREQDICT:
taggedSen.append("''/" + FREQDICT[","])
else:
print("\n'' is not in the dictionary \nManually add '' with a possible POS tag into the .DICT file!")
taggedSen.append("''/" + FREQDICT["''"])
print(
"\n'' is not in the dictionary \nManually add '' with a possible POS tag into the .DICT file!"
)
taggedSen.append("''/" + FREQDICT["''"])
continue
tag = ''

tag = ""
decodedW = word
lowerW = decodedW.lower()
if word in FREQDICT:
tag = FREQDICT[word]
elif lowerW in FREQDICT:
tag = FREQDICT[lowerW]
else:
if re.search(r"[0-9]+", word) != None:
if re.search(r"[0-9]+", word) is not None:
tag = FREQDICT["TAG4UNKN-NUM"]
else:
suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
Expand All @@ -39,11 +42,11 @@ def initializeSentence(FREQDICT, sentence):
suffixL4 = ".*" + decodedW[-4:]
if wLength >= 6:
suffixL5 = ".*" + decodedW[-5:]

if suffixL5 in FREQDICT:
tag = FREQDICT[suffixL5]
elif suffixL4 in FREQDICT:
tag = FREQDICT[suffixL4]
tag = FREQDICT[suffixL4]
elif suffixL3 in FREQDICT:
tag = FREQDICT[suffixL3]
elif suffixL2 in FREQDICT:
Expand All @@ -52,14 +55,15 @@ def initializeSentence(FREQDICT, sentence):
tag = FREQDICT["TAG4UNKN-CAPITAL"]
else:
tag = FREQDICT["TAG4UNKN-WORD"]
taggedSen.append(word + "/" + tag)

taggedSen.append(word + "/" + tag)

return " ".join(taggedSen)


def initializeCorpus(FREQDICT, inputFile, outputFile):
    """Tag every line of ``inputFile`` and write the result to ``outputFile``.

    Each input line is passed to ``initializeSentence`` and the returned
    tagged sentence is written followed by a newline.

    Args:
        FREQDICT: Lexicon mapping, forwarded unchanged to
            ``initializeSentence``.
        inputFile: Path of the text file to read, one sentence per line.
        outputFile: Path of the file to (over)write with the tagged lines.
    """
    # Read everything before opening the output so the input handle is
    # closed deterministically and inputFile == outputFile keeps working.
    with open(inputFile, "r") as fileIn:
        lines = fileIn.readlines()

    with open(outputFile, "w") as fileOut:
        for line in lines:
            fileOut.write(initializeSentence(FREQDICT, line) + "\n")
65 changes: 42 additions & 23 deletions InitialTagger/InitialTagger4En.py
Expand Up @@ -2,52 +2,71 @@

import re


def _guessEnTag(word):
    """Return a heuristic tag for a non-empty word missing from the lexicon.

    The rules are tried in a fixed order and the first match wins, so the
    order below is significant (e.g. any digit-bearing word is decided
    before the suffix rules are consulted).
    """
    # Digits adjacent to a hyphen ("3-year", "mid-1990") -> adjective.
    if re.search(r"([0-9]+-)|(-[0-9]+)", word) is not None:
        return "JJ"
    # Any other word containing a digit -> cardinal number.
    if re.search(r"[0-9]+", word) is not None:
        return "CD"
    if (
        re.search(
            r"(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)",
            word,
        )
        is not None
    ):
        return "NN"
    # Lower-case initial and trailing "s" -> plural noun.
    if re.search(r".*s$", word) is not None and word[0].islower():
        return "NNS"
    # Upper-case initial -> proper noun.
    if word[0].isupper():
        return "NNP"
    if (
        re.search(r"(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)", word)
        is not None
    ):
        return "JJ"
    # The -ing / -ed rules only apply to words without a hyphen.
    if re.search(r".*ing$", word) is not None and word.find("-") < 0:
        return "VBG"
    if re.search(r".*ed$", word) is not None and word.find("-") < 0:
        return "VBN"
    # Adjectival suffixes, or any remaining hyphenated word.
    if (
        re.search(
            r"(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)",
            word,
        )
        is not None
        or word.find("-") > -1
    ):
        return "JJ"
    if re.search(r".*ly$", word) is not None:
        return "RB"
    return "NN"


def initializeEnSentence(FREQDICT, sentence):
    """Assign an initial tag to every whitespace-separated word of a sentence.

    Lookup order per word: exact form in ``FREQDICT``, then its lower-cased
    form, then the ordered heuristics in ``_guessEnTag``. The tokens
    ``“``, ``”`` and ``"`` are all emitted as ``''`` with the tag stored
    under the ``"''"`` key.

    Args:
        FREQDICT: Mapping from word to tag.
        sentence: Raw sentence; it is stripped and split on whitespace.

    Returns:
        The tagged sentence as ``word/tag`` items joined by single spaces
        (an empty string when the sentence has no words).

    Raises:
        KeyError: If a quote token occurs and ``"''"`` is not in ``FREQDICT``.
    """
    taggedSen = []
    for word in sentence.strip().split():
        if word in ["“", "”", '"']:
            taggedSen.append("''/" + FREQDICT["''"])
            continue

        lowerW = word.lower()
        if word in FREQDICT:
            tag = FREQDICT[word]
        elif lowerW in FREQDICT:
            tag = FREQDICT[lowerW]
        else:
            tag = _guessEnTag(word)

        taggedSen.append(word + "/" + tag)

    return " ".join(taggedSen)


def initializeEnCorpus(FREQDICT, inputFile, outputFile):
    """Tag every line of ``inputFile`` and write the result to ``outputFile``.

    Each input line is passed to ``initializeEnSentence`` and the returned
    tagged sentence is written followed by a newline.

    Args:
        FREQDICT: Lexicon mapping, forwarded unchanged to
            ``initializeEnSentence``.
        inputFile: Path of the text file to read, one sentence per line.
        outputFile: Path of the file to (over)write with the tagged lines.
    """
    # Read everything before opening the output so the input handle is
    # closed deterministically and inputFile == outputFile keeps working.
    with open(inputFile, "r") as fileIn:
        lines = fileIn.readlines()

    with open(outputFile, "w") as fileOut:
        for line in lines:
            fileOut.write(initializeEnSentence(FREQDICT, line) + "\n")
115 changes: 59 additions & 56 deletions InitialTagger/InitialTagger4Vn.py
Expand Up @@ -2,95 +2,98 @@

import re

# Letters checked explicitly in addition to str.isupper()/str.islower().
VNUPPERCHARS = [u"Ă", u"Â", u"Đ", u"Ê", u"Ô", u"Ơ", u"Ư"]
VNLOWERCHARS = [u"ă", u"â", u"đ", u"ê", u"ô", u"ơ", u"ư"]


def isVnLowerChar(char):
    """Return True if ``char`` is lower case or listed in VNLOWERCHARS."""
    return char.islower() or char in VNLOWERCHARS


def isVnUpperChar(char):
    """Return True if ``char`` is upper case or listed in VNUPPERCHARS."""
    return char.isupper() or char in VNUPPERCHARS


def isAbbre(word):
    """Return True if ``word`` has no lower-case character and no underscore.

    Digits and punctuation do not disqualify a word, and an empty word
    yields True because there is nothing to reject.
    """
    return not any(isVnLowerChar(char) or char == "_" for char in word)


def isVnProperNoun(word):
    """Return True if ``word`` has the shape this tagger treats as a proper noun.

    The word must start with an upper-case character. It is then accepted
    outright when it contains four or more underscores; otherwise every
    underscore found by the scan below must be followed by a character that
    is not lower case.

    Args:
        word: A single token whose parts are joined by ``_``.

    Returns:
        True or False; an empty word gives False.
    """
    # Guard the word[0] access; an empty token cannot be a proper noun.
    if not word or not isVnUpperChar(word[0]):
        return False
    if word.count("_") >= 4:
        return True

    index = word.find("_")
    # Stop at a missing underscore (-1), a leading one (0) or a trailing one.
    while 0 < index < len(word) - 1:
        if isVnLowerChar(word[index + 1]):
            return False
        index = word.find("_", index + 1)
    return True


def initializeVnSentence(FREQDICT, sentence):
    """Assign an initial tag to every whitespace-separated word of a sentence.

    Lookup order per word:
      1. the exact word, then its lower-cased form, in ``FREQDICT``;
      2. any digit in the word -> ``FREQDICT["TAG4UNKN-NUM"]``;
      3. a single upper-case character -> ``"Y"``;
      4. ``isAbbre(word)`` -> ``"Ny"``;
      5. ``isVnProperNoun(word)`` -> ``"Np"``;
      6. the longest ``".*" + suffix`` key present in ``FREQDICT``
         (suffix lengths 5, 4, 3, 2, subject to minimum word lengths);
      7. ``FREQDICT["TAG4UNKN-WORD"]``.

    The tokens ``“``, ``”`` and ``"`` are all emitted as ``''`` with the tag
    stored under the ``"''"`` key.

    Args:
        FREQDICT: Mapping from word, ``".*suffix"`` pattern or
            ``TAG4UNKN-*`` key to tag.
        sentence: Raw sentence; it is stripped and split on whitespace.

    Returns:
        The tagged sentence as ``word/tag`` items joined by single spaces.

    Raises:
        KeyError: If a required ``"''"`` or ``TAG4UNKN-*`` key is missing
            from ``FREQDICT``.
    """
    taggedSen = []
    for word in sentence.strip().split():
        if word in ["“", "”", '"']:
            taggedSen.append("''/" + FREQDICT["''"])
            continue

        lowerW = word.lower()
        if word in FREQDICT:
            tag = FREQDICT[word]
        elif lowerW in FREQDICT:
            tag = FREQDICT[lowerW]
        elif re.search(r"[0-9]+", word) is not None:
            tag = FREQDICT["TAG4UNKN-NUM"]
        elif len(word) == 1 and isVnUpperChar(word[0]):
            tag = "Y"
        elif isAbbre(word):
            tag = "Ny"
        elif isVnProperNoun(word):
            tag = "Np"
        else:
            # A suffix pattern is only tried when the word is long enough:
            # lengths 2 and 3 need >= 4 characters, 4 needs >= 5, 5 needs >= 6.
            suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
            wLength = len(word)
            if wLength >= 4:
                suffixL3 = ".*" + word[-3:]
                suffixL2 = ".*" + word[-2:]
            if wLength >= 5:
                suffixL4 = ".*" + word[-4:]
            if wLength >= 6:
                suffixL5 = ".*" + word[-5:]

            # Longest matching suffix wins.
            if suffixL5 in FREQDICT:
                tag = FREQDICT[suffixL5]
            elif suffixL4 in FREQDICT:
                tag = FREQDICT[suffixL4]
            elif suffixL3 in FREQDICT:
                tag = FREQDICT[suffixL3]
            elif suffixL2 in FREQDICT:
                tag = FREQDICT[suffixL2]
            else:
                tag = FREQDICT["TAG4UNKN-WORD"]

        taggedSen.append(word + "/" + tag)

    return " ".join(taggedSen)


def initializeVnCorpus(FREQDICT, inputFile, outputFile):
    """Tag every line of ``inputFile`` and write the result to ``outputFile``.

    Each input line is passed to ``initializeVnSentence`` and the returned
    tagged sentence is written followed by a newline.

    Args:
        FREQDICT: Lexicon mapping, forwarded unchanged to
            ``initializeVnSentence``.
        inputFile: Path of the text file to read, one sentence per line.
        outputFile: Path of the file to (over)write with the tagged lines.
    """
    # Read everything before opening the output so the input handle is
    # closed deterministically and inputFile == outputFile keeps working.
    with open(inputFile, "r") as fileIn:
        lines = fileIn.readlines()

    with open(outputFile, "w") as fileOut:
        for line in lines:
            fileOut.write(initializeVnSentence(FREQDICT, line) + "\n")