In [1]:
from src.data_processing import load_data
import itertools
import string
from src.pun_algorithms import *
from src.ngrams import *
from src.string_similarity import levenshtein
import operator
from src.data_processing import print_progress
from nltk import word_tokenize, pos_tag
from src.data_processing import load_cmu
from src.ipatoarpabet import translate
from string import punctuation
from src.pronunciations import phonetic_distance
import os
from pattern.en import lexeme
from src.pronunciations import get_closest_sounding_words as csw
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
task1, task2, task3, min_pairs, strings, pun_strings = load_data()

In [3]:
# model = models.KeyedVectors.load_word2vec_format('/home/doogy/Data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [3]:
with open("data/ngram_searchspace/task1_totals.json") as f:
    search_space = json.load(f)

In [4]:
def score(original_frequency, new_frequency, original_word, new_word, position, ph_penalty=2):
    return ( (new_frequency - original_frequency)
           * ((phonetic_distance(original_word, new_word, translated=True)**ph_penalty) 
           * position)) # pos is normalised

def single_score(original_trigram, new_trigram, ph_penalty=2):
    original_freq = all_frequencies[original_trigram]
    new_freq = all_frequencies[new_trigram]
    og_word, new_word = original_trigram.split()[1], new_trigram.split()[1]
    return score(original_freq, new_freq, og_word, new_word, 1, ph_penalty)

def sort_answers(unsorted_dict):
    sd = {}
    for k, d in unsorted_dict.items():
        sd[k] = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return sd

def sentence_overlap(s1, s2):
    if len(s1) < len(s2):
        return sentence_overlap(s2, s1)
    return len([w for w in s2 if w in s1]) / len(s1)

In [None]:
single_score('to compliment')

In [5]:
accepted_pos = {'ADV', 'ADJ', 'VERB', 'NOUN'}
def rank_substitutions(index):
    
    print(index)
    
    full_path = "results/{}/{}".format(path, index)
    
    
    space = search_space[index]
    context = task1[index]['words']
    
    # takes in list of subs, context is list of words
    res = defaultdict(dict)
    context_length = len(context)

    for trigram, candidate in space.items():

        # No Pos experiment, set to 1
        position = context.index(trigram.split()[1])
        end_position = context_length - position
        
        # take position and normalise it wrt length of context
        if use_position:
            normal_position = position / context_length
        else:
            normal_position = 1
        
        original_freq = candidate['original_frequency']     
        original_word = trigram.split()[1].lower()
        
        if original_word in cmu:
            original_ph = cmu[original_word][0]
        else:
            # skip words not in new cmu
            continue
         
        if use_filter:
            phoneme_filter = set(csw(original_word))
            
        lexemes = lexeme(original_word)
        
        for sub, new_freq in candidate['substitutions'].items():
            
            new_word = sub.split()[1].lower()
            
            if use_filter:
                if new_word not in phoneme_filter:
                    continue
            
            # ignore lexical derivatives
            if new_word in lexemes:
                continue
            
            new_context = [w for w in context]
            new_context[position-1:position+2] = sub.split()
            
            
            if new_word in cmu:
                new_ph = cmu[new_word][0]
            else:
                # skip words not in new cmu
                continue
            
            if any([w in string.punctuation for w in new_word]):
                continue
                
            tags = ([w[1] for w in 
                     pos_tag(new_context, tagset='universal')])
            
            if tags[position] not in accepted_pos:
                continue

            s = score(original_freq, 
                      new_freq, 
                      original_ph,
                      new_ph,
                      normal_position,
                      ph_penalty=penalty)
            
            res[trigram][sub] = s
            
            
    f = open(full_path, 'w')
    json.dump(sort_answers(res), f, indent=4)
    f.close()
    
    return sort_answers(res)

# All Variants

In [None]:
import time
from multiprocessing import Pool

penalty = 8
for use_position in [True, False]:
    for use_filter in [True, False]:
        
        if use_position and use_filter:
            path = "phonetic_filter_with_pos"
        if use_position and not use_filter:
            path = "all_trigram_with_pos"
        if not use_position and use_filter:
            path = "phonetic_filter_no_pos"
        if not use_position and not use_filter:
            path = "all_trigram_no_pos"
        
        before = time.time()
        p = Pool(4)
        ngram_search_space = p.map(rank_substitutions, range(len(task1)))
        length = time.time() - before

        print("Total time taken in seconds: {}".format(length))

0
224
112
225
336
226
227
1
337
338
339
228
2
340
229
113
230
341
114
3
115
4
342
231
343
5
6
116
232
7
233
117
8
344
234
118
119
9
235
345
120
10
236
346
237
121
11
238
12
347
122
239
348
240
349
241
123
13
14
350
15
242
243
124
16
244
125
17
245
18
246
351
352
247
19
126
248
353
20
249
21
354
355
250
22
356
127
251
23
24
128
25
357
252
358
129
26
130
27
359
28
29
253
360
131
30
132
31
361
133
32
254
362
255
256
363
134
257
33
135
364
258
365
136
366
34
367
259
35
260
368
36
369
37
137
261
262
38
138
139
263
370
39
371
372
140
264
141
373
40
41
265
374
42
142
266
143
43
144
267
145
375
376
44
146
45
147
268
148
269
46
149
377
378
270
271
47
48
150
151
379
49
272
50
152
273
51
380
153
52
274
275
381
53
154
276
155
277
156
382
54
383
157
278
158
55
279
384
280
281
385
56
159
282
386
57
160
387
283
58
161
284
162
59
163
388
164
60
61
389
285
165
390
286
62
391
166
287
392
167
63
288
168
393
64
169
170
289
65
171
394
395
290
66
396
291
67
397
172
68
292
293
173
174
69
398
175
399
294
295


37
360
361
362
363
249
250
364
365
366
367
251
38
368
39
369
252
253
254
255
256
257
258
40
41
259
260
261
262
263
42
43
44
264
45
46
47
48
49
265
50
51
52
53
54
370
371
372
55
56
57
58
373
59
374
266
60
61
267
268
62
375
376
269
63
64
377
378
270
271
379
380
272
273
381
274
275
65
66
67
68
69
70
71
72
73
74
276
127
277
128
278
279
280
281
129
130
75
131
132
282
76
77
78
79
80
81
133
283
284
82
83
84
85
86
134
382
383
384
385
386
387
388
135
389
285
390
391
392
393
394
395
286
87
396
397
398
287
288
399
400
401
402
403
404
88
405
89
406
407
408
409
90
91
92
289
290
291
93
94
95
292
293
96
136
97
98
99
294
295
296
297
100
101
102
298
103
104
137
105
410
299
411
300
301
412
302
138
139
413
414
303
304
106
107
140
108
141
415
416
109
305
417
306
307
110
111
418
308
419
142
309
448
449
420
450
310
451
452
311
312
421
422
423
424
425
426
143
144
145
313
314
315
453
427
454
455
456
428
429
457
458
430
316
459
460
461
462
463
431
432
433
434
317
464
318
465
146
147
148
466
467
149
150
151
152

Process ForkPoolWorker-8:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 125, in worker
    put((job, i, result))
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 349, in put
    obj = ForkingPickler.dumps(obj)
  File "/usr/lib/python3.5/multiprocessing/reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
MemoryError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 129, in worker
    wrapped))
MemoryError
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 87, in __str__
    self.exc)
Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", 

1714
1715
1716
1490
1595
1596
1491
1492
1493
1717
1494
1718
1495
1496
1497
1498
1499
1719
1720
1500
1501
1721
1722
1502
1723
1724
1725
1503
1726
1727
1728
1729
1504
1505
1506
1730
1731
1732
1733
1507
1508
1509
1597
1598
1734
1735
1736
1510
1599
1511
1512
1513
1514
1737
1515
1600
1601
1602
1603
1738
1739
1740
1741
1516
1517
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1518
1519
1520
1521
1522
1523
1524
1525
1526
1604
1605
1527
1528
1529
1530
1531
1532
1754
1533
1755
1756
1757
1758
1534
1535
1759
1606
1607
1608
1609
1760
1761
1610
1762
1611
1612
1613
1763
1764
1536
1537
1765
1538
1766
1614
1615
1539
1616
1617
1540
1618
1541
1542
1543
1544
1545
1546
1547
1548
1767
1768
1769
1770
1771
1772
1549
1550
1773
1774
1551
1775
1776
1552
1553
1554
1555
1556
1557
1777
1558
1559
1560
1561
1778
1779
1562
1563
1564
1565
1566
1567
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
