Skip to content

Commit

Permalink
Add example scripts and documentation.
Browse files Browse the repository at this point in the history
  • Loading branch information
eginhard committed Jun 22, 2017
1 parent e3844f8 commit 2b56312
Show file tree
Hide file tree
Showing 10 changed files with 100,144 additions and 50 deletions.
35 changes: 33 additions & 2 deletions README.md
@@ -1,2 +1,33 @@
# word-level-language-id
Simple word-level language ID using Viterbi based on unigram frequencies and character n-grams.
# Word-level language ID
Simple word-level language identification using the Viterbi algorithm based on unigram frequencies and character n-grams.

## Usage

I recommend using Python 3 for better Unicode support.

To quickly try out the system, corpora and language models are already included for British English and Irish. See below for how to add new ones. You might want to do some post-processing on the lexicons, because the Irish one, for example, contains some English words as well and vice versa.

Run word-level language ID on some example sentences:

```bash
python word-level-language-id/identify.py
```

## Train new language models

Create or download a unigram frequency lexicon, e.g. from the [Crúbadán Project](http://crubadan.org/) which has those readily available for over 2000 languages.

For example, download and unzip British English and Irish:

```bash
wget http://crubadan.org/files/en-GB.zip
wget http://crubadan.org/files/ga.zip

unzip '*.zip' -d word-level-language-id/corpora
```

Train the language models:

```bash
python word-level-language-id/train.py
```
@@ -1,36 +1,36 @@
# -*- coding: utf-8 -*-

import json
import math
import sys

from LanguageModel import LanguageModel

class LanguageIdentifier:
"""Word level language identification.
"""

GA = "ga"
FR = "ga"
EN = "en"

# This string should be used for tokens that don't need a language
# assignment, e.g. punctuation.
IGNORE = "##IGNORE##"

def __init__(self,
model_file_ga, model_file_en,
lex_file_ga, lex_file_en,
fr, en,
model_file_fr, model_file_en,
lex_file_fr, lex_file_en,
lex_weight=1):
"""Initialises the language model.
Args:
model_file_ga/en (str): Irish/English LanguageModel file name
lex_file_ga/en (str): Irish/English Lexicon (1 word + frequency per line)
fr/en: Foreign/English language code.
model_file_fr/en (str): Foreign/English LanguageModel file name.
lex_file_fr/en (str): Foreign/English Lexicon (1 word + frequency per line).
lex_weight (float): Weight of the lexicon vs. the character model.
"""
self.lex_weight = lex_weight
self.model = {}
self.model[self.GA] = LanguageModel.load(model_file_ga, lex_file_ga, lex_weight)
self.model[self.FR] = LanguageModel.load(model_file_fr, lex_file_fr, lex_weight)
self.model[self.EN] = LanguageModel.load(model_file_en, lex_file_en, lex_weight)

def identify(self, tokens,
Expand All @@ -47,16 +47,16 @@ def identify(self, tokens,
transition_probability (float):
Probability that the following token will be in the same language.
start_probability (float):
Probability that the first token will be Irish
Probability that the first token will be Foreign
Returns:
languages (str[]): List of language assignments matching the token list.
E.g. ["ga", "ga", "en", ...]
E.g. ["fr", "fr", "en", ...]
"""

# Special treatment for the Irish affirmative "sea" in 1-word sentences.
if len(tokens) == 1 and tokens[0].lower() == "sea":
return [self.GA]
if self.FR == 'ga' and len(tokens) == 1 and tokens[0].lower() == "sea":
return [self.FR]

if method == "independent":
return self.identify_independent(tokens)
Expand All @@ -75,8 +75,8 @@ def identify_independent(self, tokens):
languages = []
for token in tokens:
scores = self.score(token)
if scores[self.GA] >= scores[self.EN]:
languages.append(self.GA)
if scores[self.FR] >= scores[self.EN]:
languages.append(self.FR)
else:
languages.append(self.EN)

Expand All @@ -98,19 +98,19 @@ def identify_viterbi(self, tokens,

# Probability of keeping vs. switching the language
trans_p = {}
trans_p[self.GA] = {}
trans_p[self.FR] = {}
trans_p[self.EN] = {}
trans_p[self.GA][self.GA] = transition_probability
trans_p[self.FR][self.FR] = transition_probability
trans_p[self.EN][self.EN] = transition_probability
trans_p[self.GA][self.EN] = 1- transition_probability
trans_p[self.EN][self.GA] = 1- transition_probability
trans_p[self.FR][self.EN] = 1 - transition_probability
trans_p[self.EN][self.FR] = 1 - transition_probability

# Initial probabilities for both languages
scores = self.score(tokens[0])
V[0][self.GA] = math.log(start_probability) + scores[self.GA]
V[0][self.FR] = math.log(start_probability) + scores[self.FR]
V[0][self.EN] = math.log(1 - start_probability) + scores[self.EN]

langs = [self.GA, self.EN]
langs = [self.FR, self.EN]

# Iterate over tokens (starting at second token)
for t in range(1, len(tokens)):
Expand Down Expand Up @@ -146,36 +146,36 @@ def score(self, word):
"""
# Punctuation etc. have no influence on the language assignment
if word == self.IGNORE:
return {self.GA: 1, self.EN: 1}
return {self.FR: 1, self.EN: 1}

lex_score, char_score = {}, {}
for lang in [self.GA, self.EN]:
for lang in [self.FR, self.EN]:
lex_score[lang] = math.exp(self.model[lang].lex_score(word))
char_score[lang] = math.exp(self.model[lang].char_score(word))

# Relative scores, only these can be weighted
lex_score_rel, char_score_rel = {}, {}
for lang in [self.GA, self.EN]:
lex_score_rel[lang] = lex_score[lang] / (lex_score[self.GA] +
for lang in [self.FR, self.EN]:
lex_score_rel[lang] = lex_score[lang] / (lex_score[self.FR] +
lex_score[self.EN])
char_score_rel[lang] = char_score[lang] / (char_score[self.GA] +
char_score_rel[lang] = char_score[lang] / (char_score[self.FR] +
char_score[self.EN])

weighted_score = {}
# If neither word is in the lexicon, use only the character model
if (lex_score[self.GA] == math.exp(self.model[self.GA].lex_score(LanguageModel.OOV)) and
if (lex_score[self.FR] == math.exp(self.model[self.FR].lex_score(LanguageModel.OOV)) and
lex_score[self.EN] == math.exp(self.model[self.EN].lex_score(LanguageModel.OOV))):
for lang in [self.GA, self.EN]:
for lang in [self.FR, self.EN]:
weighted_score[lang] = math.log(char_score_rel[lang])
# Else combine both models
else:
for lang in [self.GA, self.EN]:
for lang in [self.FR, self.EN]:
weighted_score[lang] = math.log(self.lex_weight * lex_score_rel[lang] +
(1 - self.lex_weight) * char_score_rel[lang])
#print word
#print("%.15f %.15f" % (lex_score[self.GA], lex_score[self.EN]))
#print("%.15f %.15f" % (char_score[self.GA], char_score[self.EN]))
#print("%.15f %.15f" % (lex_score_rel[self.GA], lex_score_rel[self.EN]))
#print("%.15f %.15f" % (char_score_rel[self.GA], char_score_rel[self.EN]))
#print("%.15f %.15f" % (weighted_score[self.GA], weighted_score[self.EN]))
#print("%.15f %.15f" % (lex_score[self.FR], lex_score[self.EN]))
#print("%.15f %.15f" % (char_score[self.FR], char_score[self.EN]))
#print("%.15f %.15f" % (lex_score_rel[self.FR], lex_score_rel[self.EN]))
#print("%.15f %.15f" % (char_score_rel[self.FR], char_score_rel[self.EN]))
#print("%.15f %.15f" % (weighted_score[self.FR], weighted_score[self.EN]))
return weighted_score
26 changes: 11 additions & 15 deletions LanguageModel.py → word-level-language-id/LanguageModel.py
@@ -1,13 +1,11 @@
# -*- coding: utf-8 -*-

import codecs
import json
import math
import sys

class LanguageModel:
"""Language model based on a lexicon and a character n-gram Markov model.
Description...
"""

# Symbols to mark start and end of a sequence.
Expand All @@ -26,7 +24,7 @@ class LanguageModel:
# but "Is" will rather start an Irish and "A" an English one.
CASE_SENSITIVITY_THRESHOLD = 4

def __init__(self, language, n, lex_file, lex_weight):
def __init__(self, language, n, lex_file, lex_weight=1):
"""Initialises the language model.
Args:
Expand Down Expand Up @@ -54,20 +52,21 @@ def dump(self, file_name=None):
"""Saves the language model to the specified file in JSON format"""
if file_name is None:
file_name = self.language + ".model"
with open(file_name, "wb") as f:
with open(file_name, "w") as f:
json.dump([self.language, self.n,
self.start_prob, self.trans_prob], f)
print("Saved model at: %s" % file_name)

@classmethod
def load(cls, model_file, lex_file, lex_weight):
def load(cls, model_file, lex_file, lex_weight=1):
"""Loads the language model from the specified file
Args:
model_file (str): LanguageModel file name
lex_file (str): Frequency lexicon file name
lex_weight (float): Weight of the lexicon vs. the character model.
"""
with open(model_file, "rb") as f:
with open(model_file) as f:
language, n, start_prob, trans_prob = json.load(f)
model = cls(language, n, lex_file, lex_weight)
model.start_prob = start_prob
Expand All @@ -89,11 +88,10 @@ def load_lexicon(self, lex_file):
"""
lamb = 0.1 # smoothing value

with open(lex_file) as f:
with codecs.open(lex_file, encoding='utf-8') as f:
lex = {}
total = 0
for line in f:
line = line.decode("utf-8")
fields = line.strip().split()
word = fields[0]
if len(fields[0]) >= self.CASE_SENSITIVITY_THRESHOLD:
Expand All @@ -120,7 +118,6 @@ def char_score(self, word, debug=False):
Enabling <debug> allows inspecting individual transition probabilities.
"""
#word = word.decode("utf-8")
ngrams = self.word2ngrams(word, self.n)
logp = 0
# Add starting probability
Expand All @@ -141,7 +138,7 @@ def char_score(self, word, debug=False):
logp += self.trans_prob[ngrams[i]][ngrams[i+1]]
debugstr += " " + ngrams[i] + " " + str(self.trans_prob[ngrams[i]][ngrams[i+1]])
if debug:
print debugstr
print(debugstr)
return logp

def train(self, smooth_lambda=0.001):
Expand All @@ -166,12 +163,11 @@ def train(self, smooth_lambda=0.001):
charset = set()

n = self.n
print "Training " + str(n) + "-gram model for language: " + self.language
print("Training %d-gram model for language: %s" % (n, self.language))

# Calculate counts
with open(self.lex_file) as f:
with codecs.open(self.lex_file, encoding='utf-8') as f:
for line in f:
line = line.decode("utf-8")
fields = line.strip().split()
token = self.START + fields[0] + self.END
token_count = int(fields[1])
Expand Down Expand Up @@ -205,4 +201,4 @@ def train(self, smooth_lambda=0.001):
trans_count[ngram][next_ngram] / denominator)
self.trans_prob[ngram][self.UNKNOWN] = math.log(lamb / denominator)

print "Model trained on " + str(token_total) + " tokens"
print("Model trained on %d tokens" % token_total)
7 changes: 7 additions & 0 deletions word-level-language-id/corpora/LICENSE
@@ -0,0 +1,7 @@
Crúbadán language dataset (c) by Kevin Scannell

This Crúbadán language dataset is licensed under a
Creative Commons Attribution 4.0 International License.

You should have received a copy of the license along with this
work. If not, see <http://creativecommons.org/licenses/by/4.0/>.

0 comments on commit 2b56312

Please sign in to comment.