Skip to content

Commit

Permalink
Add example scripts and documentation.
Browse files Browse the repository at this point in the history
  • Loading branch information
eginhard committed Jun 22, 2017
1 parent e3844f8 commit 2b56312
Show file tree
Hide file tree
Showing 10 changed files with 100,144 additions and 50 deletions.
35 changes: 33 additions & 2 deletions README.md
@@ -1,2 +1,33 @@
# word-level-language-id
Simple word-level language ID using Viterbi based on unigram frequencies and character n-grams.
# Word-level language ID
Simple word-level language identification using the Viterbi algorithm based on unigram frequencies and character n-grams.

## Usage

I recommend using Python 3 for better Unicode support.

To quickly try out the system, corpora and language models are already included for British English and Irish. See below for how to add new ones. You might want to do some post-processing on the lexicons, because the Irish one, for example, contains some English words as well and vice versa.

Run word-level language ID on some example sentences:

```bash
python word-level-language-id/identify.py
```

## Train new language models

Create or download a unigram frequency lexicon, e.g. from the [Crúbadán Project](http://crubadan.org/) which has those readily available for over 2000 languages.

For example, download and unzip British English and Irish:

```bash
wget http://crubadan.org/files/en-GB.zip
wget http://crubadan.org/files/ga.zip

unzip '*.zip' -d word-level-language-id/corpora
```

Train the language models:

```bash
python word-level-language-id/train.py
```
@@ -1,36 +1,36 @@
# -*- coding: utf-8 -*-

import json
import math
import sys

from LanguageModel import LanguageModel

class LanguageIdentifier:
"""Word level language identification.
"""

GA = "ga"
FR = "ga"
EN = "en"

# This string should be used for tokens that don't need a language
# assignment, e.g. punctuation.
IGNORE = "##IGNORE##"

def __init__(self,
model_file_ga, model_file_en,
lex_file_ga, lex_file_en,
fr, en,
model_file_fr, model_file_en,
lex_file_fr, lex_file_en,
lex_weight=1):
"""Initialises the language model.
Args:
model_file_ga/en (str): Irish/English LanguageModel file name
lex_file_ga/en (str): Irish/English Lexicon (1 word + frequency per line)
fr/en: Foreign/English language code.
model_file_fr/en (str): Foreign/English LanguageModel file name.
lex_file_fr/en (str): Foreign/English Lexicon (1 word + frequency per line).
lex_weight (float): Weight of the lexicon vs. the character model.
"""
self.lex_weight = lex_weight
self.model = {}
self.model[self.GA] = LanguageModel.load(model_file_ga, lex_file_ga, lex_weight)
self.model[self.FR] = LanguageModel.load(model_file_fr, lex_file_fr, lex_weight)
self.model[self.EN] = LanguageModel.load(model_file_en, lex_file_en, lex_weight)

def identify(self, tokens,
Expand All @@ -47,16 +47,16 @@ def identify(self, tokens,
transition_probability (float):
Probability that the following token will be in the same language.
start_probability (float):
Probability that the first token will be Irish
Probability that the first token will be Foreign
Returns:
languages (str[]): List of language assignments matching the token list.
E.g. ["ga", "ga", "en", ...]
E.g. ["fr", "fr", "en", ...]
"""

# Special treatment for the Irish affirmative "sea" in 1-word sentences.
if len(tokens) == 1 and tokens[0].lower() == "sea":
return [self.GA]
if self.FR == 'ga' and len(tokens) == 1 and tokens[0].lower() == "sea":
return [self.FR]

if method == "independent":
return self.identify_independent(tokens)
Expand All @@ -75,8 +75,8 @@ def identify_independent(self, tokens):
languages = []
for token in tokens:
scores = self.score(token)
if scores[self.GA] >= scores[self.EN]:
languages.append(self.GA)
if scores[self.FR] >= scores[self.EN]:
languages.append(self.FR)
else:
languages.append(self.EN)

Expand All @@ -98,19 +98,19 @@ def identify_viterbi(self, tokens,

# Probability of keeping vs. switching the language
trans_p = {}
trans_p[self.GA] = {}
trans_p[self.FR] = {}
trans_p[self.EN] = {}
trans_p[self.GA][self.GA] = transition_probability
trans_p[self.FR][self.FR] = transition_probability
trans_p[self.EN][self.EN] = transition_probability
trans_p[self.GA][self.EN] = 1- transition_probability
trans_p[self.EN][self.GA] = 1- transition_probability
trans_p[self.FR][self.EN] = 1 - transition_probability
trans_p[self.EN][self.FR] = 1 - transition_probability

# Initial probabilities for both languages
scores = self.score(tokens[0])
V[0][self.GA] = math.log(start_probability) + scores[self.GA]
V[0][self.FR] = math.log(start_probability) + scores[self.FR]
V[0][self.EN] = math.log(1 - start_probability) + scores[self.EN]

langs = [self.GA, self.EN]
langs = [self.FR, self.EN]

# Iterate over tokens (starting at second token)
for t in range(1, len(tokens)):
Expand Down Expand Up @@ -146,36 +146,36 @@ def score(self, word):
"""
# Punctuation etc. have no influence on the language assignment
if word == self.IGNORE:
return {self.GA: 1, self.EN: 1}
return {self.FR: 1, self.EN: 1}

lex_score, char_score = {}, {}
for lang in [self.GA, self.EN]:
for lang in [self.FR, self.EN]:
lex_score[lang] = math.exp(self.model[lang].lex_score(word))
char_score[lang] = math.exp(self.model[lang].char_score(word))

# Relative scores, only these can be weighted
lex_score_rel, char_score_rel = {}, {}
for lang in [self.GA, self.EN]:
lex_score_rel[lang] = lex_score[lang] / (lex_score[self.GA] +
for lang in [self.FR, self.EN]:
lex_score_rel[lang] = lex_score[lang] / (lex_score[self.FR] +
lex_score[self.EN])
char_score_rel[lang] = char_score[lang] / (char_score[self.GA] +
char_score_rel[lang] = char_score[lang] / (char_score[self.FR] +
char_score[self.EN])

weighted_score = {}
# If neither word is in the lexicon, use only the character model
if (lex_score[self.GA] == math.exp(self.model[self.GA].lex_score(LanguageModel.OOV)) and
if (lex_score[self.FR] == math.exp(self.model[self.FR].lex_score(LanguageModel.OOV)) and
lex_score[self.EN] == math.exp(self.model[self.EN].lex_score(LanguageModel.OOV))):
for lang in [self.GA, self.EN]:
for lang in [self.FR, self.EN]:
weighted_score[lang] = math.log(char_score_rel[lang])
# Else combine both models
else:
for lang in [self.GA, self.EN]:
for lang in [self.FR, self.EN]:
weighted_score[lang] = math.log(self.lex_weight * lex_score_rel[lang] +
(1 - self.lex_weight) * char_score_rel[lang])
#print word
#print("%.15f %.15f" % (lex_score[self.GA], lex_score[self.EN]))
#print("%.15f %.15f" % (char_score[self.GA], char_score[self.EN]))
#print("%.15f %.15f" % (lex_score_rel[self.GA], lex_score_rel[self.EN]))
#print("%.15f %.15f" % (char_score_rel[self.GA], char_score_rel[self.EN]))
#print("%.15f %.15f" % (weighted_score[self.GA], weighted_score[self.EN]))
#print("%.15f %.15f" % (lex_score[self.FR], lex_score[self.EN]))
#print("%.15f %.15f" % (char_score[self.FR], char_score[self.EN]))
#print("%.15f %.15f" % (lex_score_rel[self.FR], lex_score_rel[self.EN]))
#print("%.15f %.15f" % (char_score_rel[self.FR], char_score_rel[self.EN]))
#print("%.15f %.15f" % (weighted_score[self.FR], weighted_score[self.EN]))
return weighted_score
26 changes: 11 additions & 15 deletions LanguageModel.py → word-level-language-id/LanguageModel.py
@@ -1,13 +1,11 @@
# -*- coding: utf-8 -*-

import codecs
import json
import math
import sys

class LanguageModel:
"""Language model based on a lexicon and a character n-gram Markov model.
Description...
"""

# Symbols to mark start and end of a sequence.
Expand All @@ -26,7 +24,7 @@ class LanguageModel:
# but "Is" will rather start an Irish and "A" an English one.
CASE_SENSITIVITY_THRESHOLD = 4

def __init__(self, language, n, lex_file, lex_weight):
def __init__(self, language, n, lex_file, lex_weight=1):
"""Initialises the language model.
Args:
Expand Down Expand Up @@ -54,20 +52,21 @@ def dump(self, file_name=None):
"""Saves the language model to the specified file in JSON format"""
if file_name is None:
file_name = self.language + ".model"
with open(file_name, "wb") as f:
with open(file_name, "w") as f:
json.dump([self.language, self.n,
self.start_prob, self.trans_prob], f)
print("Saved model at: %s" % file_name)

@classmethod
def load(cls, model_file, lex_file, lex_weight):
def load(cls, model_file, lex_file, lex_weight=1):
"""Loads the language model from the specified file
Args:
model_file (str): LanguageModel file name
lex_file (str): Frequency lexicon file name
lex_weight (float): Weight of the lexicon vs. the character model.
"""
with open(model_file, "rb") as f:
with open(model_file) as f:
language, n, start_prob, trans_prob = json.load(f)
model = cls(language, n, lex_file, lex_weight)
model.start_prob = start_prob
Expand All @@ -89,11 +88,10 @@ def load_lexicon(self, lex_file):
"""
lamb = 0.1 # smoothing value

with open(lex_file) as f:
with codecs.open(lex_file, encoding='utf-8') as f:
lex = {}
total = 0
for line in f:
line = line.decode("utf-8")
fields = line.strip().split()
word = fields[0]
if len(fields[0]) >= self.CASE_SENSITIVITY_THRESHOLD:
Expand All @@ -120,7 +118,6 @@ def char_score(self, word, debug=False):
Enabling <debug> allows inspecting individual transition probabilities.
"""
#word = word.decode("utf-8")
ngrams = self.word2ngrams(word, self.n)
logp = 0
# Add starting probability
Expand All @@ -141,7 +138,7 @@ def char_score(self, word, debug=False):
logp += self.trans_prob[ngrams[i]][ngrams[i+1]]
debugstr += " " + ngrams[i] + " " + str(self.trans_prob[ngrams[i]][ngrams[i+1]])
if debug:
print debugstr
print(debugstr)
return logp

def train(self, smooth_lambda=0.001):
Expand All @@ -166,12 +163,11 @@ def train(self, smooth_lambda=0.001):
charset = set()

n = self.n
print "Training " + str(n) + "-gram model for language: " + self.language
print("Training %d-gram model for language: %s" % (n, self.language))

# Calculate counts
with open(self.lex_file) as f:
with codecs.open(self.lex_file, encoding='utf-8') as f:
for line in f:
line = line.decode("utf-8")
fields = line.strip().split()
token = self.START + fields[0] + self.END
token_count = int(fields[1])
Expand Down Expand Up @@ -205,4 +201,4 @@ def train(self, smooth_lambda=0.001):
trans_count[ngram][next_ngram] / denominator)
self.trans_prob[ngram][self.UNKNOWN] = math.log(lamb / denominator)

print "Model trained on " + str(token_total) + " tokens"
print("Model trained on %d tokens" % token_total)
7 changes: 7 additions & 0 deletions word-level-language-id/corpora/LICENSE
@@ -0,0 +1,7 @@
Crúbadán language dataset (c) by Kevin Scannell

This Crúbadán language dataset is licensed under a
Creative Commons Attribution 4.0 International License.

You should have received a copy of the license along with this
work. If not, see <http://creativecommons.org/licenses/by/4.0/>.

0 comments on commit 2b56312

Please sign in to comment.