forked from lxa2015/lxa-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordbreaker.py
1244 lines (1065 loc) · 58.1 KB
/
wordbreaker.py
1
#!usr/bin/env python3# Word segmentation# John Goldsmith 2014-# code refactoring/optimization in progress, Jackson Lee, 7/6/2015# audrey 08/2015# TODO: In ReadBrokenCorpus(), separate additional punctuation marks from preceding word. e.g. commaimport osimport mathimport argparseimport sysimport jsonimport jsonpicklefrom pathlib import Pathimport copyimport timeimport datetimefrom latexTable_py3 import MakeLatexTablefrom lxa5lib import (get_language_corpus_datafolder, json_pdump, changeFilenameSuffix, stdout_list, load_config_for_command_line_help)def makeArgParser(configfilename="config.json"): language, \ corpus, \ datafolder, \ configtext = load_config_for_command_line_help(configfilename) parser = argparse.ArgumentParser( description="This program segments unbroken text into words.\n\n{}" .format(configtext), formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--config", help="configuration filename", type=str, default=configfilename) parser.add_argument("--ibase", help="iterative processing for this run will start at iteration # (ibase+1); use '0' for fresh start.", type=int, default=None) parser.add_argument("--itarget", help="last iteration to perform in this run;\n thus # cycles = itarget - ibase", type=int, default=None) parser.add_argument("--statefile", help="jsonpickle file needed to restore previous state in order to resume processing; not needed if ibase==0", type=str, default=None) parser.add_argument("--candidates", help="number of candidates per iteration", type=int, default=25) parser.add_argument("--corpuslinestoread", help="number of lines to read in from corpus (defaults to 'sys.maxsize' to read entire corpus)", type=int, default=sys.maxsize) parser.add_argument("--verbose", help="verbose output", type=bool, default=False) parser.add_argument("--language", help="Language name", type=str, default=None) parser.add_argument("--corpus", help="Corpus file to use", type=str, default=None) parser.add_argument("--datafolder", help="path of the data folder", type=str, default=None) return parserclass LexiconEntry: def __init__(self, key = "", count = 0): self.m_Key = key self.m_Count = count self.m_Frequency= 0.0 self.m_ParentWords = list() self.m_ChildWords = list() self.m_Suffix = False self.m_Stem = False self.m_CountRegister = list() def UpdateRegister(self, current_iteration): if len(self.m_CountRegister) > 0: last_count = self.m_CountRegister[-1][1] if self.m_Count != last_count or self.m_ChildWords != []: self.m_CountRegister.append((current_iteration, self.m_Count, self.m_ChildWords)) else: self.m_CountRegister.append((current_iteration, self.m_Count, self.m_ChildWords)) #self.m_Count = 0 def Display(self, outfile): print(self.m_Key, " ", "{:.7f}".format(self.m_Frequency), " ", "{:.7f}".format(-1* math.log(self.m_Frequency, 2)), file = outfile) if len(self.m_ParentWords) > 0: expression = "/".join( self.m_ParentWords ) #print("expression is ", expression) print("%s" % ("{:<}".format(expression)), file=outfile) for (iteration_number, count, childwords) in self.m_CountRegister: if childwords == []: #print("{0:6}".format(iteration_number), " ", "{0:10}".format(count), file=outfile) print("%6i %10s" % (iteration_number, "{:,}".format(count)), file=outfile) else: #print("{0:6}".format(iteration_number), " ", "{0:10}".format(count), ":<".format(childwords), file=outfile) print("%6i %10s %-100s" % (iteration_number, "{:,}".format(count), childwords), file=outfile)# ---------------------------------------------------------#class Lexicon: def __init__(self): self.m_LetterPlog = dict() self.m_EntryDict = dict() self.m_TrueDictionary = dict() self.m_DictionaryCost = 0 #in bits! self.m_Corpus = list() self.m_SizeOfLongestEntry = 0 self.m_CorpusCost = 0.0 self.m_ParsedCorpus = list() self.m_NumberOfHypothesizedRunningWords = 0 self.m_NumberOfTrueRunningWords = 0 self.m_BreakPointList = list() self.m_DeletionList = list() # these are the words that were nominated and then not used in any line-parses *at all*. self.m_DeletionDict = dict() # They never stop getting nominated. self.m_Break_based_RecallPrecisionHistory = list() self.m_Token_based_RecallPrecisionHistory = list() self.m_Type_based_RecallPrecisionHistory = list() self.m_DictionaryCostHistory = list() self.m_CorpusCostHistory = list() # ---------------------------------------------------------# #def AddEntry(self,key,count): # this_entry = LexiconEntry(key,count) # self.m_EntryDict[key] = this_entry # if len(key) > self.m_SizeOfLongestEntry: # self.m_SizeOfLongestEntry = len(key) # ---------------------------------------------------------# def AddEntry(self,key,new_entry): #was AddEntry(self,key,count) #this_entry = LexiconEntry(key,count) self.m_EntryDict[key] = copy.deepcopy(new_entry) if len(key) > self.m_SizeOfLongestEntry: self.m_SizeOfLongestEntry = len(key) return self.m_EntryDict[key] # ---------------------------------------------------------# # Found bug here July 5 2015: important, don't let it remove a singleton letter! John # don't del key-value pairs within still iterating through the pairs among the dict; fix by John Goldsmith July 6 2015 def FilterZeroCountEntries(self, outfile): TempDeletionList = list() for key, entry in self.m_EntryDict.items(): if len(key) == 1 and entry.m_Count==0: entry.m_Count = 1 continue if len(key)>1 and entry.m_Count == 0: # DO WE REALLY WANT TO DISALLOW ITS USE FOR THE FUTURE? # Transfer this entry from the EntryDict to the DeletionDict (Keep a list but don't delete yet!) TempDeletionList.append(key) self.m_DeletionDict[key] = entry for key in TempDeletionList: del self.m_EntryDict[key] print("---> Deleted ", key) print("---> Deleted ", key, file=outfile) # ---------------------------------------------------------# def UpdateLexEntryRegisters(self, iteration_number): for key, entry in self.m_EntryDict.items(): entry.UpdateRegister(iteration_number) # ---------------------------------------------------------# #-------------------------------------------------------# # Populate data structures for corpus and dictionary. # # Record truth for measuring performance. # # Compute initial values for metrics. # #-------------------------------------------------------# def ReadBrokenCorpus(self, infilepathname, outfile, howmuchcorpus): print("Name of data file: ", infilepathname) if not os.path.isfile(infilepathname): # Note: already checked at startup by get_language_corpus_datafolder() print("Warning:", infilepathname, "does not exist.") print("Check file paths and names.") sys.exit() infile = open(infilepathname) rawcorpus_list = infile.readlines() # bad code if the corpus is very large -- but then we won't use python. if howmuchcorpus != "all": rawcorpus_list = rawcorpus_list[0:int(howmuchcorpus)] # 0 <= index < corpuslinestoread print("Number of lines read =", len(rawcorpus_list)) for line in rawcorpus_list: this_line = "" breakpoint_list_forline = list() # Clean up data as desired #line = line.lower() line = line.replace('.', ' .').replace('?', ' ?') line_list = line.split() # split line into words if len(line_list) <= 1: continue for word in line_list: # by word in line self.m_NumberOfTrueRunningWords += 1 # build up TrueDictionary if word not in self.m_TrueDictionary: # build up unbroken line self.m_TrueDictionary[word] = 1 # build up breakpoint list else: self.m_TrueDictionary[word] += 1 this_line += word breakpoint_list_forline.append(len(this_line)) #for letter in line: for letter in this_line: # by character in unbroken line if letter not in self.m_EntryDict: # build up dictionary this_entry = LexiconEntry() this_entry.m_Key = letter this_entry.m_Count = 1 #counts occurrences of letter in the corpus self.AddEntry(letter, this_entry) else: self.m_EntryDict[letter].m_Count += 1 self.m_Corpus.append(this_line) # by unbroken line self.m_BreakPointList.append(breakpoint_list_forline) # build up unbroken corpus print("# " + str(len(self.m_TrueDictionary)) + " distinct words in the original corpus.", file=outfile) # Goes into outfile header #----------------------------# # AT THIS POINT WE HAVE # the data: m_Corpus unbroken text # the dictionary: m_EntryDict initial version, entries for single characters # HERE IS THE INITIAL PARSE for line in self.m_Corpus: self.m_ParsedCorpus.append(list(line)) #Parsing is easy! self.m_NumberOfHypothesizedRunningWords = 0 for (key, entry) in self.m_EntryDict.items(): self.m_NumberOfHypothesizedRunningWords += entry.m_Count # PLOGS self.m_CorpusCost = 0.0 self.m_DictionaryCost = 0.0 self.m_SizeOfLongestEntry = 1 self.ComputeDictFrequencies() for key, entry in self.m_EntryDict.items(): this_plog = -1 * math.log(entry.m_Frequency, 2) # applies to both (a)letter uses in data, AND (b)word uses in current parse self.m_LetterPlog[key] = this_plog # record it self.m_DictionaryCost += this_plog # as in (a) self.m_CorpusCost += entry.m_Count * this_plog # as in (b) entry.m_CountRegister.append((0, entry.m_Count, [])) # iteration_number, count, childwords as in UpdateRegister() print("\n Startup") self.Report(0, outfile) infile.close()# ---------------------------------------------------------# def ComputeDictFrequencies(self): TotalCount = 0 for (key, entry) in self.m_EntryDict.items(): TotalCount += entry.m_Count for (key, entry) in self.m_EntryDict.items(): entry.m_Frequency = entry.m_Count/float(TotalCount)# ---------------------------------------------------------## # added july 2015 john# def ComputeDictionaryLength(self):# DictionaryCost = 0# for word in self.m_EntryDict:# wordlength = 0# letters = list(word)# for letter in letters:# wordlength += self.m_LetterPlog[letter]# DictionaryLength += wordlength# self.m_DictionaryCost = DictionaryCost# self.m_DictionaryCostHistory.append(DictionaryCost)## ---------------------------------------------------------# def ParseCorpus(self, current_iteration, outfile): self.m_ParsedCorpus = list() self.m_CorpusCost = 0.0 self.m_NumberOfHypothesizedRunningWords = 0 for word, entry in self.m_EntryDict.items(): entry.m_Count = 0 for line in self.m_Corpus: parsed_line,bit_cost = self.ParseWord(line, outfile) self.m_ParsedCorpus.append(parsed_line) self.m_CorpusCost += bit_cost for word in parsed_line: self.m_EntryDict[word].m_Count +=1 self.m_NumberOfHypothesizedRunningWords += 1# ---------------------------------------------------------# def PrintParsedCorpus(self,outfile): for line in self.m_ParsedCorpus: PrintList(line, outfile)# ---------------------------------------------------------# def ParseWord(self, word, outfile): wordlength = len(word) Parse=dict() Piece = "" LastChunk = "" BestCompressedLength = dict() BestCompressedLength[0] = 0 CompressedSizeFromInnerScanToOuterScan = 0.0 LastChunkStartingPoint = 0 # <------------------ outerscan -----------><------------------> # # ^---starting point # <----prefix?----><----innerscan----------> # <----Piece--------------> if verboseflag: print("\nOuter\tInner", file=outfile) if verboseflag: print("scan:\tscan:\tPiece\tFound?", file=outfile) for outerscan in range(1,wordlength+1): Parse[outerscan] = list() MinimumCompressedSize= 0.0 startingpoint = 0 if outerscan > self.m_SizeOfLongestEntry: startingpoint = outerscan - self.m_SizeOfLongestEntry for innerscan in range(startingpoint, outerscan): if verboseflag: print("\n %3s\t%3s " % (outerscan, innerscan), end=" ", file=outfile) Piece = word[innerscan: outerscan] if verboseflag: print(" %5s"% Piece, end=" ", file=outfile) if Piece in self.m_EntryDict: if verboseflag: print(" %5s" % "Yes.", end=" ", file=outfile) CompressedSizeFromInnerScanToOuterScan = -1 * math.log( self.m_EntryDict[Piece].m_Frequency, 2) newvalue = BestCompressedLength[innerscan] + CompressedSizeFromInnerScanToOuterScan if verboseflag: print(" %7.3f bits" % (newvalue), end=" ", file=outfile) if MinimumCompressedSize == 0.0 or MinimumCompressedSize > newvalue: MinimumCompressedSize = newvalue LastChunk = Piece LastChunkStartingPoint = innerscan if verboseflag: print(" %7.3f bits" % (MinimumCompressedSize), end=" ", file=outfile) else: if verboseflag: print(" %5s" % "No. ", end=" ", file=outfile) BestCompressedLength[outerscan] = MinimumCompressedSize if LastChunkStartingPoint > 0: Parse[outerscan] = list(Parse[LastChunkStartingPoint]) else: Parse[outerscan] = list() if verboseflag: print("\n\t\t\t\t\t\t\t\tchosen:", LastChunk, end=" ", file=outfile) Parse[outerscan].append(LastChunk) if verboseflag: PrintList(Parse[wordlength], outfile) bitcost = BestCompressedLength[outerscan] return (Parse[wordlength],bitcost)# ---------------------------------------------------------# def GenerateCandidates_standardmethod(self, current_iteration, howmany, outfile): # m_ChildWords (for display from the CountRegister at this iteration only) are set in this function for word, lexicon_entry in self.m_EntryDict.items(): lexicon_entry.m_ChildWords = [] CandidateDict = dict() # key is the new word, value is a LexiconEntry object NomineeList = list() for parsed_line in self.m_ParsedCorpus: for wordno in range(len(parsed_line)-1): word0 = parsed_line[wordno] word1 = parsed_line[wordno + 1] candidate = word0 + word1 if candidate in self.m_EntryDict: continue if candidate in CandidateDict: CandidateDict[candidate].m_Count += 1 else: this_entry = LexiconEntry(candidate) this_entry.m_Count = 1 this_entry.m_ParentWords = [word0, word1] CandidateDict[candidate] = this_entry SortableCandidateDict = dict() for candidate, lex_entry in CandidateDict.items(): SortableCandidateDict[candidate] = lex_entry.m_Count EntireCandidateList = sorted(SortableCandidateDict.items(),key=lambda x:x[1], reverse=True) #print("\nIn GenerateCandidates - issue is ties") ThisCount = 0 for candidate, count in EntireCandidateList: #print("candidate = ", candidate) if candidate in self.m_DeletionDict: continue PreviousCount = ThisCount ThisCount = CandidateDict[candidate].m_Count #print("PreviousCount = ", PreviousCount, " ThisCount = ", ThisCount) NomineeList.append((candidate, CandidateDict[candidate])) #print("howmany = ", howmany, " length of NomineeList: ", len(NomineeList)) if len(NomineeList) > howmany: if ThisCount == PreviousCount: howmany += 1 #print("Keep the new nominee; increase howmany to: ", howmany) else: NomineeList = NomineeList[0:-1] #print("Drop the new nominee; shortened NomineeList to: ", len(NomineeList)) #print() break print("Nominees:") latex_data= list() latex_data.append("piece count parent0 parent1 status") for nominee, entry in NomineeList: # for the new word entry.m_CountRegister.append((current_iteration, entry.m_Count, [])) # as in UpdateRegister() self.AddEntry(nominee,entry) # This records the initial count--before parsing. # If the actual count used in the parse differs, # both records will appear for this iteration in the CountRegister. # for the trace for word in entry.m_ParentWords: self.m_EntryDict[word].m_ChildWords.append(nominee) #will go into m_CountRegister to display along with changed counts # for display expr = "" for x in entry.m_ParentWords: expr = expr + "%12s" % (x) print("%15s %10s %-50s" % (nominee, '{:,}'.format(entry.m_Count), expr)) latex_data.append(nominee + "\t" + '{:,} {}'.format(entry.m_Count, expr) ) MakeLatexTable(latex_data,outfile) return NomineeList # NOTE THAT THE RETURN VALUE IS NOT USED # ---------------------------------------------------------## def GenerateCandidates_suffixmethod(self): # Similar to GenerateCandidates() [renamed as GenerateCandidates_standardmethod]## # Take in the suffixes from Linguistica# suffix_infile_name = "Corpus" + str(current_iteration - 1) + "_1_Mini1_Suffixes.txt"# suffix_outfile_name = "Corpus" + str(current_iteration - 1) + ".Suffixes_extracted.txt"# suffixList = this_lexicon.ExtractListFromLing413File(suffix_infile_name, suffix_outfile_name, 0)# suffixList.sort()# #this_lexicon.AddSuffixMark(suffixList) # PUT THIS IN WHEN VERSION HAS m_Suffix# print "Suffix List from", suffix_infile_name# for suffix in suffixList:# print suffix### # Create a dictionary of potential words# SuffixedCandidateDict = dict() # similar to CandidateDict# for parsed_line in self.m_ParsedCorpus:# for wordno in range(len(parsed_line)-1):# word0 = parsed_line[wordno]# word1 = parsed_line[wordno + 1]# # if self.m_EntryDict[word1].m_Suffix == True: # PUT THIS IN WHEN VERSION HAS m_Suffix# if suffixList.count(word1) > 0: # Use this line when running LinkPrevious# #if suffixList.count(word0) > 0: # Use this line when running LinkFollowing# suffixed_candidate = word0 + word1# if suffixed_candidate in self.m_EntryDict:# continue# if suffixed_candidate in SuffixedCandidateDict:# SuffixedCandidateDict[suffixed_candidate].m_ParseCount += 1# else: # add it# suffixed_entry = LexiconEntry(suffixed_candidate)# suffixed_entry.m_ParseCount = 1# suffixed_entry.m_Subwords = [word0, word1]# SuffixedCandidateDict[suffixed_candidate] = suffixed_entry### # Behind the scenes, try a parse with these additional words# # to distinguish false suffixes and exclude unlikely formations# temp_lexicon = copy.deepcopy(self)# for candidate, entry in SuffixedCandidateDict.iteritems():# temp_lexicon.AddEntry(candidate, entry)## temp_lexicon.ComputeDictFrequencies()# temp_lexicon.ParseCorpus (current_iteration, outfile) # (no actual print to outfile occurs)### # Compare counts from before and after the parse (increase is good!)# DeltaByWord = dict()# for key, entry in SuffixedCandidateDict.iteritems():# BeforeParseCount = entry.m_ParseCount# AfterParseCount = temp_lexicon.m_EntryDict[key].m_ParseCount# DeltaByWord[key] = AfterParseCount - BeforeParseCount## WordListOrderedByDelta = sorted(DeltaByWord.iteritems(),key=operator.itemgetter(1), reverse=True)## if True: # OUTPUT COUNTS TO A FILE - CAN BE STUDIED IN EXCEL# i = datetime.datetime.now()# timelabel = "%s_%s_%s.%s%s" % (i.year, i.month, i.day, i.hour, i.minute)# counts_outfilename = "WordDeltaCounts_" + str(current_iteration)+"." + timelabel + ".txt"# counts_outfile = open(counts_outfilename, "w")# print >> counts_outfile, " SuffixedCandidate Before After Subword0 Subword1 Delta"# for key, delta in WordListOrderedByDelta:# expr = ""# for x in SuffixedCandidateDict[key].m_Subwords:# expr = expr + "%12s" % (x)# print >> counts_outfile, "%22s %10s %10s %-30s %5s" % \# (key, \# '{:,}'.format(SuffixedCandidateDict[key].m_ParseCount), # BeforeParseCount \# '{:,}'.format(temp_lexicon.m_EntryDict[key].m_ParseCount), # AfterParseCount \# expr, '{:,}'.format(delta))# counts_outfile.close()### # COLLECT AND OUTPUT DELTA INFO BY SUFFIX# MiddleBandLimit = 1 # SET THIS# if MiddleBandLimit == 0:# NegMidBandLimit = -1# else:# NegMidBandLimit = -1 * MiddleBandLimit## # COLLECT# # The quantities below are accumulated per suffix based on delta values of the words formed on that suffix.# CountPosDeltas = dict() # how many words formed on given suffix have a positive delta# CountNegDeltas = dict()# CountZeroDeltas = dict()## SumPosDeltas = dict() # sum of their delta values# SumNegDeltas = dict()## for suffix in suffixList:# CountPosDeltas[suffix] = 0# CountNegDeltas[suffix] = 0# CountZeroDeltas[suffix] = 0## SumPosDeltas[suffix] = 0# SumNegDeltas[suffix] = 0## for key, delta in WordListOrderedByDelta:# suffix = SuffixedCandidateDict[key].m_Subwords[1]## if delta >= MiddleBandLimit:# CountPosDeltas[suffix] += 1# SumPosDeltas[suffix] += delta# elif delta <= NegMidBandLimit:# CountNegDeltas[suffix] += 1# SumNegDeltas[suffix] += delta# else:# CountZeroDeltas[suffix] +=1## # OUTPUT# # LOOKING FOR USEFUL PATTERNS; MOST LIKELY THIS IS NOT THE FINAL FORM# suffixstats_outfilename = "SuffixDeltaStats_" + str(MiddleBandLimit) + "_" + str(current_iteration) + "." + timelabel + ".txt"# suffixstats_outfile = open(suffixstats_outfilename, "w")# print >> suffixstats_outfile, " Suffix #Pos #Neg #Zero #All SumPos SumNeg DeltaTotal PNRatio ROI"## CountAllDeltas = dict()# DeltaTotal = dict()# PosNegRatio = dict()# ROI = dict()# for suffix in suffixList:# CountAllDeltas[suffix] = CountPosDeltas[suffix] + CountNegDeltas[suffix] + CountZeroDeltas[suffix]# DeltaTotal[suffix] = SumPosDeltas[suffix] + SumNegDeltas[suffix]## if CountAllDeltas[suffix] != 0:# ROI[suffix] = float(DeltaTotal[suffix]) / CountAllDeltas[suffix]# else:# ROI[suffix] = 0## if SumNegDeltas[suffix] != 0:# PosNegRatio[suffix] = float(SumPosDeltas[suffix]) / abs(SumNegDeltas[suffix])# else:# PosNegRatio[suffix] = SumPosDeltas[suffix]# # may also decide to output an average over something or things tbd## print >> suffixstats_outfile, "%8s %6s %6s %6s %6s %6s %6s %9s %9s %8s" % \# (suffix, \# '{:,}'.format(CountPosDeltas[suffix]), \# '{:,}'.format(CountNegDeltas[suffix]), \# '{:,}'.format(CountZeroDeltas[suffix]), \# '{:,}'.format(CountAllDeltas[suffix]), \# '{:,}'.format(SumPosDeltas[suffix]), \# '{:,}'.format(SumNegDeltas[suffix]), \# '{:,}'.format(DeltaTotal[suffix]), \# '{:,.2f}'.format(PosNegRatio[suffix]), \# '{:,.2f}'.format(ROI[suffix]))## suffixstats_outfile.close()#### # SET UP A USEFUL DATA STRUCTURE# FromSuffixToWords = dict() # key - suffix# for suffix in suffixList: # value - list of (word, delta) pairs for that suffix, listed by delta in descending order# FromSuffixToWords[suffix] = []## for key, delta in WordListOrderedByDelta:# this_suffix = SuffixedCandidateDict[key].m_Subwords[1]# FromSuffixToWords[this_suffix].append((key, delta))### # FOR EACH SUFFIX, SET A THRESHOLD BASED ON DELTA PROPERTIES FOR WHICH OF ITS# # ASSOCIATED WORDS SHOULD GO INTO THE LEXICON# Cutoff = dict() # Key: proposed suffix# # Value: minimum delta for a word in its list to qualify for the lexicon## # These parameters must be set; so far they look OK for English.# DeltaTotalLimit = 1# PosNegRatioLimit = 1.9# ROILimit = .1# DefaultWordDeltaLimit = 1 # This is the standard expectation for words formed on a real suffix.## for suffix in FromSuffixToWords:# if DeltaTotal[suffix] < DeltaTotalLimit: # Clearly a false suffix! Don't admit any word from its list.# Cutoff[suffix] = sys.maxsize## elif PosNegRatio[suffix] < PosNegRatioLimit: # May or may not be a real suffix, but likely that many items# Cutoff[suffix] = sys.maxsize # in its list are not real stem+suffix formations.## elif ROI[suffix] < ROILimit: # similar to preceding case# Cutoff[suffix] = sys.maxsize## elif (CountPosDeltas[suffix] < CountNegDeltas[suffix] and \# CountNegDeltas[suffix] < CountZeroDeltas[suffix]): # Probably a real suffix, but this property indicates# Cutoff[suffix] = DefaultWordDeltaLimit + 1 # that more bad formations occur.## else:# Cutoff[suffix] = DefaultWordDeltaLimit # A real, well-behaved suffix!## if True:# print >> outfile, "Cutoff values for suffixed words:"# for suffix in suffixList:# print >> outfile, "%8s %8s" % \# (suffix, '{:,}'.format(Cutoff[suffix]))### # THIS IS THE GOAL OF ALL THE PRECEDING WORK# # Add qualified suffixed candidates to the lexicon# NomineeList = list()# latex_data= list()# latex_data.append("piece count subword subword")## for suffix in FromSuffixToWords:# for (word, delta) in FromSuffixToWords[suffix]:# if delta >= Cutoff[suffix]:# admitted_entry = self.AddEntry(word, temp_lexicon.m_EntryDict[word]) # Use expected count for this initial, pre-parse entry# NomineeList.append((word, admitted_entry))## # for the trace# admitted_entry.m_CountRegister.append((current_iteration, admitted_entry.m_ParseCount, 0, [])) # as in UpdateRegister()# for subword in admitted_entry.m_Subwords:# self.m_EntryDict[subword].m_ReprCount += 1# self.m_EntryDict[subword].m_Extwords.append(word)# self.m_EntryDict[subword].m_NewExtwords.append(word) #will go into this word's m_CountRegister to display along with changed counts## # for display# expr = ""# for subword in admitted_entry.m_Subwords:# expr = expr + "%12s" % (subword)# print "%22s %8s %8s %-50s" % (word, '{:,}'.format(delta), '{:,}'.format(admitted_entry.m_ParseCount), expr)# latex_data.append(word + "\t" + '{:,} {}'.format(admitted_entry.m_ParseCount, expr) )# # DESIRABLE TO DISPLAY DELTA -- NEED HELP MAKING IT WORK!## else:# break # deltas are descending, so go on to the next suffix## MakeLatexTable(latex_data,outfile)## return NomineeList # NOTE THAT THE RETURN VALUE IS NOT USED # July 18, 2015 - Now used to update DictionaryCost### ---------------------------------------------------------# def Expectation(self): self.m_NumberOfHypothesizedRunningWords = 0 for this_line in self.m_Corpus: wordlength = len(this_line) ForwardProb = dict() BackwardProb = dict() Forward(this_line,ForwardProb) Backward(this_line,BackwardProb) this_word_prob = BackwardProb[0] if WordProb > 0: for nPos in range(wordlength): for End in range(nPos, wordlength-1): if End- nPos + 1 > self.m_SizeOfLongestEntry: continue if nPos == 0 and End == wordlength - 1: continue Piece = this_line[nPos, End+1] if Piece in self.m_EntryDict: this_entry = self.m_EntryDict[Piece] CurrentIncrement = ((ForwardProb[nPos] * BackwardProb[End+1])* this_entry.m_Frequency ) / WordProb this_entry.m_Count += CurrentIncrement self.m_NumberOfHypothesizedRunningWords += CurrentIncrement# ---------------------------------------------------------# def Maximization(self): for entry in self.m_EntryDict: entry.m_Frequency = entry.m_Count / self.m_NumberOfHypothesizedRunningWords# ---------------------------------------------------------# def Forward (self, this_line,ForwardProb): ForwardProb[0]=1.0 for Pos in range(1,Length+1): ForwardProb[Pos] = 0.0 if (Pos - i > self.m_SizeOfLongestEntry): break Piece = this_line[i,Pos+1] if Piece in self.m_EntryDict: this_Entry = self.m_EntryDict[Piece] vlProduct = ForwardProb[i] * this_Entry.m_Frequency ForwardProb[Pos] = ForwardProb[Pos] + vlProduct return ForwardProb# ---------------------------------------------------------# def Backward(self, this_line,BackwardProb): Last = len(this_line) -1 BackwardProb[Last+1] = 1.0 for Pos in range( Last, Pos >= 0,-1): BackwardProb[Pos] = 0 for i in range(Pos, i <= Last,-1): if i-Pos +1 > m_SizeOfLongestEntry: Piece = this_line[Pos, i+1] if Piece in self.m_EntryDict[Piece]: this_Entry = self.m_EntryDict[Piece] if this_Entry.m_Frequency == 0.0: continue vlProduct = BackwardProb[i+1] * this_Entry.m_Frequency BackwardProb[Pos] += vlProduct return BackwardProb# ---------------------------------------------------------# def PrintLexicon(self, outfile): print("\n\nLEXICON with trace information", file=outfile) for key in sorted(self.m_EntryDict.keys()): self.m_EntryDict[key].Display(outfile) print("\n\nDELETIONS", file=outfile) for key in self.m_DeletionDict: self.m_DeletionDict[key].Display(outfile)# ---------------------------------------------------------# def RecallPrecision(self, iteration_number, outfile): # the following calculations are precision and recall *for breaks* (not for morphemes) total_true_positive_for_break = 0 for linenumber in range(len(self.m_BreakPointList)): truth = self.m_BreakPointList[linenumber] if len(truth) < 2: #NOTE There are 10 such lines in brown-english.txt print("Skipping this line:", self.m_Corpus[linenumber], file=outfile) continue hypothesis = list() hypothesis_line_length = 0 for piece in self.m_ParsedCorpus[linenumber]: # NOTE audrey 2015_08_23 hypothesis_line_length += len(piece) # This could be constructed in ParseCorpus() and stored in Lexicon. hypothesis.append(hypothesis_line_length) # Analogous to m_BreakPointList (and m_NumberOfTrueRunningWords). #number_of_hypothesized_words = len(hypothesis) # (We already store m_NumberOfHypothesizedRunningWords.) true_positive_for_break = len(set(hypothesis).intersection(set(truth))) #if linenumber == 86: #print("For linenumber ", linenumber, " len(truth) =", len(truth), "BUT true_positive_for_break =", true_positive_for_break) #print("truth is ", truth) #print("hypothesis is ", hypothesis) total_true_positive_for_break += true_positive_for_break total_break_precision = float(total_true_positive_for_break) / self.m_NumberOfHypothesizedRunningWords total_break_recall = float(total_true_positive_for_break) / self.m_NumberOfTrueRunningWords self.m_Break_based_RecallPrecisionHistory.append((iteration_number, total_break_precision,total_break_recall)) formatstring = "%16s %12s %6.4f %9s %6.4f" print() print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall)) print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall), file=outfile) # Token_based precision for word discovery: if (True): true_positives = 0 for (word, this_words_entry) in self.m_EntryDict.items(): if word in self.m_TrueDictionary: true_count = self.m_TrueDictionary[word] these_true_positives = min(true_count, this_words_entry.m_Count) else: these_true_positives = 0 true_positives += these_true_positives word_recall = float(true_positives) / self.m_NumberOfTrueRunningWords word_precision = float(true_positives) / self.m_NumberOfHypothesizedRunningWords self.m_Token_based_RecallPrecisionHistory.append((iteration_number, word_precision,word_recall)) print(formatstring %( "Token_based word", "precision", word_precision, "recall", word_recall), file=outfile) print(formatstring %( "Token_based word", "precision", word_precision, "recall", word_recall)) # Type_based precision for word discovery: if (True): true_positives = 0 for (word, this_words_entry) in self.m_EntryDict.items(): if word in self.m_TrueDictionary: true_positives +=1 word_recall = float(true_positives) / len(self.m_TrueDictionary) word_precision = float(true_positives) / len(self.m_EntryDict) self.m_Type_based_RecallPrecisionHistory.append((iteration_number, word_precision,word_recall)) #print >>outfile, "\n\n***\n" #print "Type_based Word Precision %6.4f; Word Recall %6.4f" %(word_precision ,word_recall) print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall), file=outfile) print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall))# ---------------------------------------------------------# def RecallPrecision_do_not_use(self, iteration_number, outfile): # break-based calculation is simplified in version above print("Now at top of RecallPrecision function") print("linenumber = ", iteration_number) #just to get a few specific lines - has nothing to do with iteration itself print("Here is self.m_BreakPointList[linenumber] ", self.m_BreakPointList[iteration_number]) print("Its length is ", len(self.m_BreakPointList[iteration_number])) truth = list(self.m_BreakPointList[iteration_number]) print("Here is truth ", truth) print("Length of truth is ", len(truth)) print("\n\n") total_true_positive_for_break = 0 total_number_of_hypothesized_words = 0 total_number_of_true_words = 0 for linenumber in range(len(self.m_BreakPointList)): truth = list(self.m_BreakPointList[linenumber]) if len(truth) < 2: print("Skipping this line:", self.m_Corpus[linenumber], file=outfile) continue #number_of_true_words = len(truth) -1 #No--For the beginning line, for example, there are 23 words, counting '.' as a separate word. And len(truth) = 23. number_of_true_words = len(truth) hypothesis = list() hypothesis_line_length = 0 accurate_word_discovery = 0 true_positive_for_break = 0 word_too_big = 0 word_too_small = 0 real_word_lag = 0 hypothesis_word_lag = 0 for piece in self.m_ParsedCorpus[linenumber]: # NOTE audrey 2015_08_23 hypothesis_line_length += len(piece) # This could be constructed in ParseCorpus() and stored in Lexicon. hypothesis.append(hypothesis_line_length) # We already store m_NumberOfHypothesizedRunningWords. number_of_hypothesized_words = len(hypothesis) # Analogous to m_BreakPointList and m_NumberOfTrueRunningWords. if linenumber == iteration_number: print("hypothesis = ", hypothesis) # state 0: at the last test, the two parses were in agreement # state 1: at the last test, truth was # and hypothesis was not # state 2: at the last test, hypothesis was # and truth was not pointer = 0 state = 0 while (len(truth) > 0 and len(hypothesis) > 0): next_truth = truth[0] next_hypothesis = hypothesis[0] if state == 0: real_word_lag = 0 hypothesis_word_lag = 0 if next_truth == next_hypothesis: pointer = truth.pop(0) hypothesis.pop(0) true_positive_for_break += 1 accurate_word_discovery += 1 state = 0 elif next_truth < next_hypothesis: pointer = truth.pop(0) real_word_lag += 1 state = 1 else: #next_hypothesis < next_truth: pointer = hypothesis.pop(0) hypothesis_word_lag = 1 state = 2 elif state == 1: if next_truth == next_hypothesis: pointer = truth.pop(0) hypothesis.pop(0) true_positive_for_break += 1 word_too_big += 1 state = 0 elif next_truth < next_hypothesis: pointer = truth.pop(0) real_word_lag += 1 state = 1 #redundantly else: pointer = hypothesis.pop(0) hypothesis_word_lag += 1 state = 2 else: #state = 2 if next_truth == next_hypothesis: pointer = truth.pop(0) hypothesis.pop(0) true_positive_for_break += 1 word_too_small +=1 state = 0 elif next_truth < next_hypothesis: pointer = truth.pop(0) real_word_lag += 1 state = 1 else: pointer = hypothesis.pop(0) hypothesis_word_lag += 1 state =2 #precision = float(true_positive_for_break) / number_of_hypothesized_words #recall = float(true_positive_for_break) / number_of_true_words total_true_positive_for_break += true_positive_for_break if linenumber == iteration_number: print("true_positive_for_break = ", true_positive_for_break) total_number_of_hypothesized_words += number_of_hypothesized_words total_number_of_true_words += number_of_true_words print("Info_RecallPrecision") print("iteration_number = ", iteration_number) print("total_number_of_true_words = ", total_number_of_true_words) print("self.m_NumberOfTrueRunningWords = ", self.m_NumberOfTrueRunningWords) print("total_number_of_hypothesized_words = ", total_number_of_hypothesized_words) print("self.m_NumberOfHypothesizedRunningWords = ", self.m_NumberOfHypothesizedRunningWords) # the following calculations are precision and recall *for breaks* (not for morphemes) formatstring = "%16s %12s %6.4f %9s %6.4f" total_break_precision = float(total_true_positive_for_break) / total_number_of_hypothesized_words total_break_recall = float(total_true_positive_for_break) / total_number_of_true_words ### WHY HERE ???? #self.m_CorpusCostHistory.append( self.m_CorpusCost) MOVED TO Report() August 21, 2015 self.m_Break_based_RecallPrecisionHistory.append((iteration_number, total_break_precision,total_break_recall)) print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall)) print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall), file=outfile) # Token_based precision for word discovery: if (True): true_positives = 0 for (word, this_words_entry) in self.m_EntryDict.items(): if word in self.m_TrueDictionary: true_count = self.m_TrueDictionary[word] these_true_positives = min(true_count, this_words_entry.m_Count) else: these_true_positives = 0 true_positives += these_true_positives word_recall = float(true_positives) / self.m_NumberOfTrueRunningWords word_precision = float(true_positives) / self.m_NumberOfHypothesizedRunningWords self.m_Token_based_RecallPrecisionHistory.append((iteration_number, word_precision,word_recall)) print(formatstring %( "Token_based Word Precision", word_precision, "recall", word_recall), file=outfile) print(formatstring %( "Token_based Word Precision", word_precision, "recall", word_recall)) # Type_based precision for word discovery: if (True): true_positives = 0 for (word, this_words_entry) in self.m_EntryDict.items(): if word in self.m_TrueDictionary: true_positives +=1 word_recall = float(true_positives) / len(self.m_TrueDictionary) word_precision = float(true_positives) / len(self.m_EntryDict) self.m_Type_based_RecallPrecisionHistory.append((iteration_number, word_precision,word_recall)) #print >>outfile, "\n\n***\n"# print "Type_based Word Precision %6.4f; Word Recall %6.4f" %(word_precision ,word_recall) print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall), file=outfile) print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall))# ---------------------------------------------------------# def PrintRecallPrecision(self, itarget, outfile): print("\t\t\tBreak\t\tToken-based\t\tType-based", file=outfile) print("\t\t\tprecision\trecall\tprecision\trecall\tprecision\trecall", file=outfile) for iterno in range(itarget+1): #was (range(numberofcycles-1)): #for iterno in range(1, numberofcycles+1): #for the 3 histories, may not be consistent whether start at 0 or 1 print("printing iterno", iterno) (iteration, p1,r1) = self.m_Break_based_RecallPrecisionHistory[iterno] (iteration, p2,r2) = self.m_Token_based_RecallPrecisionHistory[iterno] (iteration, p3,r3) = self.m_Type_based_RecallPrecisionHistory[iterno] cost1 = self.m_DictionaryCostHistory[iterno] # was int(self.m_DictionaryCostHistory[iterno]) cost2 = self.m_CorpusCostHistory[iterno] # was int(self.m_CorpusCostHistory[iterno] ) #print >>outfile,"%3i\t%8.3f\t%8.3f\t%8.3f\t%8.3f\t%8.3f\t%8.3f" %(iteration, r1,p1,r2,p2,r3,p3) print(iteration,"\t",cost1, "\t", cost2, "\t", p1,"\t",r1,"\t",p2,"\t",r2,"\t",p3,"\t",r3, file=outfile)# ---------------------------------------------------------# def ExtendLexicon(self, method, iteration_number, howmany, outfile): #TempDeletionList = self.FilterZeroCountEntries(iteration_number) # DON"T NEED TempDeletionList if method == "standard": NomineeList = self.GenerateCandidates_standardmethod(iteration_number, howmany, outfile) #elif method == "suffix": # if no method specified, no new candidates will be generated #NomineeList = self.GenerateCandidates_suffixmethod() self.FilterZeroCountEntries(outfile) # MOVED DOWNWARD. self.ComputeDictFrequencies() # uses Count info from previous parse and GenerateCandidates() # COMPUTE DICTIONARY COST - composition-based #self.m_LexiconCost = self.m_InitialLexCost #for key, entry in self.m_EntryDict.items(): #self.m_LexiconCost += entry.m_ReprCount * -1 * math.log(entry.m_Frequency, 2) # COMPUTE DICTIONARY COST - letter-based #print "Now make sure m_LetterPlog is OK" #for key, entry in self.m_LetterPlog.items(): # print "key = ", key, " plog = ", self.m_LetterPlog[key] #print "Now in ExtendLexicon: Compute the dictionary cost letter-based" # USE THE FUNCTION INSTEAD? self.m_DictionaryCost = 0.0 for word in self.m_EntryDict: #print "word = ", word letters = list(word) for letter in letters: self.m_DictionaryCost += self.m_LetterPlog[letter] # UPDATE DICTIONARY COST July 19, 2015: For iters 20, 21, 22, the whole-dictionary calculation is faster #for word in TempDeletionList: # letters = list(word) # for letter in letters: # self.m_DictionaryCost -= self.m_LetterPlog[letter] #for (word, entry) in NomineeList: # letters = list(word) # for letter in letters: # self.m_DictionaryCost += self.m_LetterPlog[letter] #self.m_DictionaryCostHistory.append(self.m_DictionaryCost) #moved to Report()# ---------------------------------------------------------# def Report(self, iteration_number, outfile): # DictionaryCost is calculated in ExtendLexicon(), CorpusCost in ParseCorpus(). print("Cost: ") print("-%16s" % 'Corpus: ', "{0:18,.4f}".format(self.m_CorpusCost)) print("-%16s" % 'Dictionary: ', "{0:18,.4f}".format(self.m_DictionaryCost)) print("-%16s" % 'Combined: ', "{0:18,.4f}".format(self.m_CorpusCost + self.m_DictionaryCost)) print("Cost: ", file=outfile) print("-%16s" % 'Corpus: ', "{0:18,.4f}".format(self.m_CorpusCost), file=outfile) print("-%16s" % 'Dictionary: ', "{0:18,.4f}".format(self.m_DictionaryCost), file=outfile) print("-%16s" % 'Combined: ', "{0:18,.4f}".format(self.m_CorpusCost + self.m_DictionaryCost), file=outfile) self.m_CorpusCostHistory.append(self.m_CorpusCost) self.m_DictionaryCostHistory.append(self.m_DictionaryCost) self.RecallPrecision(iteration_number, outfile) self.UpdateLexEntryRegisters(iteration_number)# ---------------------------------------------------------#def PrintList(my_list, outfile): print(file=outfile) for item in my_list: print(item, end=" ", file=outfile)# ---------------------------------------------------------#def LoadSavedStateFromFile(statefile, ibase, itarget): if not (itarget > ibase): print("\nERROR") print("itarget must be an integer greater than ibase.") print("Exiting...\n") sys.exit() if (not os.path.isfile(statefile)): print("\nERROR") print("The designated statefile, ", statefile, ", does not exist.") print("Exiting...\n") sys.exit() fd = open(statefile, "r") lines = fd.readlines() fd.close() for line in lines: if "Last iteration" in line: #this is 'in' operator items = line.split('=') iend = int(items[1]) #print("Found Last iteration = ", iend) if ibase != iend: print("\nERROR") print("Mismatch between ibase iteration =", ibase, "and last recorded iteration =", iend, "on statefile") #print(statefile, "has state information for iteration", str(iend)+",") #print("not for ibase iteration", ibase, "as required.") print("Exiting...\n") sys.exit() break print("LOADING SAVED STATE...") serialstr = lines[-1] this_lexicon = jsonpickle.decode(serialstr) return this_lexicondef main(language, corpus, datafolder, ibase, itarget, statefile, candidatesperiteration, howmuchcorpus, verboseflag): # REQUIRED ARGUMENTS if (ibase == None or itarget == None): print("\nERROR") print("ibase and itarget must be specified, with itarget > ibase >= 0") print("Exiting...\n") sys.exit() if (ibase > 0 and statefile == None): print("\nERROR") print("To resume processing beyond the ibase iteration, a file specifying the previous state must be provided.") print("Use the '--statefile = ' command line option.") print("Exiting...\n") sys.exit() numberofcycles = itarget - ibase # for convenience # SET UP OUTPUT FILES i = datetime.datetime.now() #timelabel = "%s_%s_%s.%s_%s" % (i.year, i.month, i.day, i.hour, i.minute) timelabel = i.strftime("%Y_%m_%d.%H_%M") iterlabel = "[%s,%s]" % (ibase+1, itarget) longlabel = "wb-%s-%s" % (timelabel, iterlabel) shortlabel = "wb-%s" % (iterlabel) outfolder = Path(datafolder, language, "wordbreaking", longlabel) if not outfolder.exists(): outfolder.mkdir(parents=True) outfile_pathname = str(Path(outfolder, shortlabel + ".txt")) outfile_corpus_pathname = str(Path(outfolder, shortlabel + "_brokencorpus.txt")) outfile_lexicon_pathname = str(Path(outfolder, shortlabel + "_lexicon_.txt")) outfile_recallprecision_pathname = str(Path(outfolder, shortlabel + "_recallprecision.tsv")) outfile_jsonpickle_pathname = str(Path(outfolder, longlabel + "_jsonpickle.txt")) outfile = open(outfile_pathname, "w") outfile_corpus = open(outfile_corpus_pathname, "w") outfile_lexicon = open(outfile_lexicon_pathname, "w") outfile_recallprecision = open(outfile_recallprecision_pathname, "w") outfile_jsonpickle = open(outfile_jsonpickle_pathname, "w") # Header for outfile print("# corpus = " + str(corpus), file=outfile) print("# state loaded from " + str(statefile), file=outfile) print("# state saved to " + longlabel + "_jsonpickle.txt", file=outfile) print("# " + str(numberofcycles) + " cycles, ", "(first = " + str(ibase + 1) + ", last = " + str(itarget) + ")", file=outfile) print("# " + str(candidatesperiteration) + " candidates on each cycle.", file=outfile) print("# lines read from original corpus = ", howmuchcorpus, file=outfile) # Header for jsonpickle outfile print("# Corpus = " + str(corpus), file=outfile_jsonpickle) print("# Date = " + i.strftime("%Y_%m_%d"), file=outfile_jsonpickle) print("# Time = " + i.strftime("%H_%M"), file=outfile_jsonpickle) print("# First iteration = " + str(ibase+1), file=outfile_jsonpickle) print("# Last iteration = " + str(itarget), file=outfile_jsonpickle) print("# Notes = working on reacting to load state errors",file=outfile_jsonpickle) print(file=outfile_jsonpickle) infile_corpus_pathname = str(Path(datafolder, language, corpus)) # INITIAL PROCESSING if False: if prev_iteration_number == 0: this_lexicon.ReadBrokenCorpus (corpusfilename, corpuslinestoread) else: print("LOADING SAVED STATE...") this_lexicon.LoadState(prev_iteration_number) # Oops if (ibase == 0 and statefile == None): this_lexicon = Lexicon() this_lexicon.ReadBrokenCorpus (infile_corpus_pathname, outfile, howmuchcorpus) #this_lexicon.ParseCorpus (outfile, current_iteration) # CHANGE NEEDED WHEN LOAD SAVED STATE #this_lexicon.ParseCorpus (outfile, 0) # CHANGE NEEDED WHEN LOAD SAVED STATE else: this_lexicon = LoadSavedStateFromFile(statefile, ibase, itarget) # ITERATIVE PROCESSING t_start = time.time() # 2014_07_20 #2015_07_18 TEMPORARY for current_iteration in range(1+ibase, 1+itarget): print("\n\n Iteration number " + str(current_iteration) + " (#" + str(current_iteration-ibase) + " of " + str(numberofcycles) + " for this run)") print("\n\n Iteration number", current_iteration, file=outfile) method = "standard" #FOR NOW this_lexicon.ExtendLexicon(method, current_iteration, candidatesperiteration, outfile) this_lexicon.ParseCorpus (current_iteration, outfile) this_lexicon.Report(current_iteration, outfile) # COMPLETION t_end = time.time() elapsed = t_end - t_start print ("\n\nElapsed wall time in seconds = ", elapsed) print ("\n\nElapsed wall time in seconds = ", elapsed, file=outfile) this_lexicon.PrintParsedCorpus(outfile_corpus) this_lexicon.PrintLexicon(outfile_lexicon) this_lexicon.PrintRecallPrecision(itarget, outfile_recallprecision) print("\nSAVING CURRENT STATE to '" + longlabel + "_jsonpickle.txt'") print("[Processing can be continued from this point in a later run ") print("by loading this file through the '--statefile = ' command line option.]") print() serialstr = jsonpickle.encode(this_lexicon) print(serialstr, file=outfile_jsonpickle) outfile.close() outfile_corpus.close() outfile_lexicon.close() outfile_recallprecision.close() outfile_jsonpickle.close()if __name__ == "__main__": args = makeArgParser().parse_args() ibase = args.ibase itarget = args.itarget statefile = args.statefile candidatesperiteration = args.candidates corpuslinestoread = args.corpuslinestoread verboseflag = args.verbose if corpuslinestoread == sys.maxsize: howmuchcorpus = "all" else: howmuchcorpus = str(corpuslinestoread) description="You are running {}.\n".format(__file__) + \ "This program segments unbroken text into words.\n" + \ "\tibase = {}\n".format(ibase) + \ "\titarget = {}\n".format(itarget) + \ "\tstatefile = {}\n".format(statefile) + \ "\tcandidatesperiteration = {}\n".format(candidatesperiteration) + \ "\tcorpuslinestoread = {}\n".format(howmuchcorpus) + \ "\t_verbose = {}".format(verboseflag) language, corpus, datafolder = get_language_corpus_datafolder(args.language, args.corpus, args.datafolder, args.config, description=description, scriptname=__file__) main(language, corpus, datafolder, ibase, itarget, statefile, candidatesperiteration, howmuchcorpus, verboseflag)