backoff_model.py
from math import log
from random import choice, random

from text_utils import TextUtils


class BackoffModel(TextUtils):
    def __init__(self):
        TextUtils.__init__(self)
        self.firsts = {}
        self.lasts = {}
        self.unigram = {}
        self.bigram = {}
        self.trigram = {}
    def generic_train(self, text):
        """Trains the model.

        When instantiating a BackoffModel
        (e.g. 'm = BackoffModel(model="backwards")')
        the choice of model determines the type of
        training that occurs.

        This training method should get replaced by
        classes that inherit from BackoffModel.
        """
        # Clean each sentence, then build the frequency tables.
        text = text.split('.')
        for elem in text:
            elem = self._clean_text(elem)
            elem = [word for word in elem if word != '']
            for i, word in enumerate(elem):
                bigram = elem[i:i + 2]
                trigram = elem[i:i + 3]
                unigram = elem[i]
                if i == 1 and elem[i] != '.':
                    self._train_first(elem[1])
                self._train_unigram(unigram)
                if len(bigram) == 2:
                    self._train_bigram(bigram)
                if len(trigram) == 3:
                    self._train_trigram(trigram)
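
    # Illustrative counts after generic_train("the cat sat"), assuming
    # TextUtils._clean_text simply tokenizes the sentence into words:
    #   unigram: {'the': 1, 'cat': 1, 'sat': 1}
    #   bigram:  {'the': {'cat': 1}, 'cat': {'sat': 1}}
    #   trigram: {'the': {'cat': {'sat': 1}}}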

    ###########################################################################
    ####                       Private Methods ...                         ####
    ###########################################################################
    def _train_first(self, word):
        if word not in self.firsts:
            self.firsts[word] = 0
        self.firsts[word] += 1

    def _train_last(self, word):
        if word not in self.lasts:
            self.lasts[word] = 0
        self.lasts[word] += 1

    def _train_unigram(self, unigram):
        if unigram not in self.unigram:
            self.unigram[unigram] = 0
        self.unigram[unigram] += 1
    def _train_bigram(self, bigram):
        if len(bigram) == 2:
            # Teach the model this bigram via nested counts.
            first, second = bigram[0], bigram[1]
            if first not in self.bigram:
                self.bigram[first] = {}
            if second not in self.bigram[first]:
                self.bigram[first][second] = 0
            self.bigram[first][second] += 1  # was '=+ 1', which pinned the count at 1
    def _train_trigram(self, trigram):
        if len(trigram) == 3:
            # Now teach the model this trigram.
            first, second, third = trigram[0], trigram[1], trigram[2]
            if first not in self.trigram:
                self.trigram[first] = {}
            if second not in self.trigram[first]:
                self.trigram[first][second] = {}
            if third not in self.trigram[first][second]:
                self.trigram[first][second][third] = 0
            self.trigram[first][second][third] += 1
    def _sample(self, d):
        # Rejection-sample a key from the count dict d, working in log space:
        # normalize counts to log-probabilities, propose a key uniformly at
        # random, and accept it with probability equal to its normalized mass.
        m = sum(d[k] for k in d)
        d = dict((k, log(d[k]) - log(m)) for k in d)
        candidate = None
        while candidate is None:
            current_sample = choice(list(d))
            r = log(random())
            if d[current_sample] > r:
                candidate = current_sample
        return candidate

if __name__ == "__main__":
    m = BackoffModel()
    help(m)
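
    # Minimal usage sketch (assumes text_utils.TextUtils provides the
    # _clean_text tokenizer the class relies on; the corpus is made up):
    corpus = "the cat sat. the cat ran. a dog sat."
    m.generic_train(corpus)
    print(m.unigram)             # flat unigram counts
    print(m.bigram)              # nested counts: {first: {second: n}}
    print(m._sample(m.unigram))  # draw a word in proportion to its count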