/
LanguageIdentifier.py
181 lines (148 loc) · 7.03 KB
/
LanguageIdentifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# -*- coding: utf-8 -*-
import math
from LanguageModel import LanguageModel
class LanguageIdentifier:
"""Word level language identification.
"""
FR = "ga"
EN = "en"
# This string should be used for tokens that don't need a language
# assignment, e.g. punctuation.
IGNORE = "##IGNORE##"
def __init__(self,
fr, en,
model_file_fr, model_file_en,
lex_file_fr, lex_file_en,
lex_weight=1):
"""Initialises the language model.
Args:
fr/en: Foreign/English language code.
model_file_fr/en (str): Foreign/English LanguageModel file name.
lex_file_fr/en (str): Foreign/English Lexicon (1 word + frequency per line).
lex_weight (float): Weight of the lexicon vs. the character model.
"""
self.lex_weight = lex_weight
self.model = {}
self.model[self.FR] = LanguageModel.load(model_file_fr, lex_file_fr, lex_weight)
self.model[self.EN] = LanguageModel.load(model_file_en, lex_file_en, lex_weight)
def identify(self, tokens,
method="viterbi",
transition_probability=0.78,
start_probability=0.75):
"""Word level language identification of a token sequence.
Args:
tokens (str[]): A list of tokens
method (str): One of [independent, viterbi]
VITERBI
transition_probability (float):
Probability that the following token will be in the same language.
start_probability (float):
Probability that the first token will be Foreign
Returns:
languages (str[]): List of language assignments matching the token list.
E.g. ["fr", "fr", "en", ...]
"""
# Special treatment for the Irish affirmative "sea" in 1-word sentences.
if self.FR == 'ga' and len(tokens) == 1 and tokens[0].lower() == "sea":
return [self.FR]
if method == "independent":
return self.identify_independent(tokens)
elif method == "viterbi":
return self.identify_viterbi(tokens,
transition_probability,
start_probability)
def identify_independent(self, tokens):
"""Independent word level language identification without any context.
For each token, picks the language with the higher probability according
to the language models.
"""
languages = []
for token in tokens:
scores = self.score(token)
if scores[self.FR] >= scores[self.EN]:
languages.append(self.FR)
else:
languages.append(self.EN)
return languages
def identify_viterbi(self, tokens,
transition_probability=0.9,
start_probability=0.5):
"""Context-dependent word level language identification using Viterbi.
Assigns the most likely language to each token according to both language
models and the likelihood of switching the language or not.
"""
languages = []
V = [{}] # Stores max probability for each token and language
S = [{}] # Stores argmax (most likely language)
# Probability of keeping vs. switching the language
trans_p = {}
trans_p[self.FR] = {}
trans_p[self.EN] = {}
trans_p[self.FR][self.FR] = transition_probability
trans_p[self.EN][self.EN] = transition_probability
trans_p[self.FR][self.EN] = 1 - transition_probability
trans_p[self.EN][self.FR] = 1 - transition_probability
# Initial probabilities for both languages
scores = self.score(tokens[0])
V[0][self.FR] = math.log(start_probability) + scores[self.FR]
V[0][self.EN] = math.log(1 - start_probability) + scores[self.EN]
langs = [self.FR, self.EN]
# Iterate over tokens (starting at second token)
for t in range(1, len(tokens)):
V.append({})
S.append({})
# Iterate over the two languages
scores = self.score(tokens[t])
for lang in langs:
# Get max and argmax for current position
term = (V[t-1][lang2] + math.log(trans_p[lang2][lang]) + scores[lang]
for lang2 in langs)
maxlang, prob = self.max_argmax(term)
V[t][lang] = prob
S[t][lang] = langs[maxlang]
# Get argmax for final token
languages = [0] * len(tokens)
languages[-1] = langs[self.max_argmax(V[-1][lang] for lang in langs)[0]]
# Reconstruct optimal path
for t in range(len(tokens)-1, 0, -1):
languages[t-1] = S[t][languages[t]]
return languages
def max_argmax(self, iterable):
"""Returns the tuple (argmax, max) for a list
"""
return max(enumerate(iterable), key=lambda x: x[1])
def score(self, word):
"""Returns the weighted log probability according to lexicon + character model.
"""
# Punctuation etc. have no influence on the language assignment
if word == self.IGNORE:
return {self.FR: 1, self.EN: 1}
lex_score, char_score = {}, {}
for lang in [self.FR, self.EN]:
lex_score[lang] = math.exp(self.model[lang].lex_score(word))
char_score[lang] = math.exp(self.model[lang].char_score(word))
# Relative scores, only these can be weighted
lex_score_rel, char_score_rel = {}, {}
for lang in [self.FR, self.EN]:
lex_score_rel[lang] = lex_score[lang] / (lex_score[self.FR] +
lex_score[self.EN])
char_score_rel[lang] = char_score[lang] / (char_score[self.FR] +
char_score[self.EN])
weighted_score = {}
# If neither word is in the lexicon, use only the character model
if (lex_score[self.FR] == math.exp(self.model[self.FR].lex_score(LanguageModel.OOV)) and
lex_score[self.EN] == math.exp(self.model[self.EN].lex_score(LanguageModel.OOV))):
for lang in [self.FR, self.EN]:
weighted_score[lang] = math.log(char_score_rel[lang])
# Else combine both models
else:
for lang in [self.FR, self.EN]:
weighted_score[lang] = math.log(self.lex_weight * lex_score_rel[lang] +
(1 - self.lex_weight) * char_score_rel[lang])
#print word
#print("%.15f %.15f" % (lex_score[self.FR], lex_score[self.EN]))
#print("%.15f %.15f" % (char_score[self.FR], char_score[self.EN]))
#print("%.15f %.15f" % (lex_score_rel[self.FR], lex_score_rel[self.EN]))
#print("%.15f %.15f" % (char_score_rel[self.FR], char_score_rel[self.EN]))
#print("%.15f %.15f" % (weighted_score[self.FR], weighted_score[self.EN]))
return weighted_score