In [5]:
import json
import itertools
import copy
import random


def load_lexicon(lexicon_path, train_path):
    lexicon = json.load(open(lexicon_path))
    inputs = []
    with open(train_path, 'r') as f:
        for line in f:
            inputs.append(line.split('\t')[:2])
    return lexicon, inputs

def filter_uncommon_tokens(lexicon, threshold):
    # Filter uncommon tokens
    deleted_keys = set()
    
    for (k1, v1) in lexicon.items():
        deleted_codes = set()
        
        for c, count in v1.items():
            if count < threshold:
                deleted_codes.add(c)
        
        for k in deleted_codes:
            del v1[k]
            
        if len(v1) == 0:
            deleted_keys.add(k1)
            
    for k in deleted_keys:
        del lexicon[k]
        
    return lexicon


def filter_intersected_tokens(lexicon):
    deleted_keys = set()
    for (k1, v1) in lexicon.items():
        for ci, count in v1.items():
            for (k2, v2) in lexicon.items():
                if k2 == k1:
                    continue
                if ci in v2:
                    deleted_keys.add(k1)
                    deleted_keys.add(k2)
    for k in deleted_keys:
        del lexicon[k]
    return lexicon
    

def get_swapables(lexicon, inputs):
    inputs = copy.deepcopy(inputs)
    random.shuffle(inputs)
    swapables = {k: [] for k in lexicon.keys()}
    for k1 in lexicon.keys():
        for k2 in lexicon.keys():
            if k1 != k2:
                if k1 in swapables[k2]:
                    swapables[k1].append(k2)
                else:   
                    x1s = itertools.islice(filter(lambda x: k1 in x, inputs), 5000)
                    x2s = itertools.islice(filter(lambda x: k2 in x, inputs), 5000)
                    for (x1, x2) in itertools.product(x1s, x2s):
                        if ' ' in x1 and ' ' in x2 and x1.replace(k1, k2) == x2:
                            swapables[k1].append(k2)
                            print(f"Linked {k1} - {k2}")
                            break
    deleted_keys = set()               
    for k, v in swapables.items():
        if len(v) == 0:
            deleted_keys.add(k)
            
    for k in deleted_keys:
        del lexicon[k]
        del swapables[k]
             
    return (lexicon, swapables)

def propagate_swaps(swapables):
    
    for k1, swaps in swapables.items():
        for k2 in swaps:
            swaps2 = swapables[k2]
            if k1 in swaps2 and k2 not in swaps:
                swaps.append(k2)
            elif k2 in swaps and k1 not in swaps2:
                swaps2.append(k1)
    
    for k1, swaps in swapables.items():
        for k2 in swaps:
            for k3 in swapables[k2]:
                if k3 != k2 and k3 not in swaps:
                    swaps.append(k3)

    return swapables
    
  
def filter_lexicon_v2(lexicon, inputs):
    lexicon = copy.deepcopy(lexicon)
    lexicon = filter_uncommon_tokens(lexicon, 0) # len(inputs)/100)
    lexicon = filter_intersected_tokens(lexicon)
    lexicon, swapables = get_swapables(lexicon, inputs)
    return lexicon, propagate_swaps(swapables)

In [6]:
lexicon, inputs = load_lexicon("/raid/lingo/akyurek/git/align/COGS/cogs/alignments/intersect.align.o.json", "/raid/lingo/akyurek/git/align/COGS/cogs/train.tsv")

In [7]:
filtered_lexicon, swapables = filter_lexicon_v2(lexicon, [input[0] for input in inputs])

Linked rose - dog
Linked rose - sailor
Linked rose - boy
Linked rose - teacher
Linked rose - girl
Linked rose - cake
Linked rose - captain
Linked rose - cookie
Linked rose - ring
Linked rose - horse
Linked rose - mouse
Linked rose - strawberry
Linked rose - cat
Linked rose - book
Linked rose - giraffe
Linked rose - donut
Linked rose - box
Linked rose - pencil
Linked rose - deer
Linked rose - pen
Linked rose - butterfly
Linked rose - melon
Linked rose - bottle
Linked rose - priest
Linked rose - lawyer
Linked rose - bat
Linked rose - drink
Linked rose - child
Linked rose - penny
Linked rose - crayon
Linked rose - frog
Linked rose - citizen
Linked rose - sandwich
Linked rose - turtle
Linked rose - game
Linked rose - monkey
Linked rose - chalk
Linked rose - present
Linked rose - baby
Linked rose - chair
Linked rose - raisin
Linked rose - bee
Linked rose - brain
Linked rose - monster
Linked rose - flag
Linked rose - muffin
Linked rose - lion
Linked rose - scientist
Linked rose - balloon
Lin

In [10]:
swapables['baked']

['packed',
 'helped',
 'rolled',
 'disintegrated',
 'studied',
 'stabbed',
 'crumpled',
 'liked',
 'valued',
 'respected',
 'nursed',
 'missed',
 'painted',
 'appreciated',
 'slid',
 'laughed',
 'changed',
 'inflated',
 'decomposed',
 'smiled',
 'loved',
 'poked',
 'floated',
 'cried',
 'held',
 'doubled',
 'adored',
 'improved',
 'shortened',
 'noticed',
 'burned',
 'heard',
 'snapped',
 'found',
 'discovered',
 'pierced',
 'enlarged',
 'collapsed',
 'snored',
 'split',
 'tolerated',
 'examined',
 'reddened',
 'touched',
 'juggled',
 'admired',
 'screamed',
 'gasped',
 'tossed',
 'scoffed',
 'stuttered',
 'observed',
 'worshipped',
 'frowned',
 'snoozed',
 'baked',
 'squeezed',
 'shattered',
 'needed',
 'meant',
 'hoped',
 'preferred',
 'expected',
 'dreamed',
 'wanted',
 'attempted',
 'craved',
 'wished',
 'hated',
 'planned',
 'longed',
 'intended',
 'confessed',
 'tried',
 'itched',
 'enjoyed',
 'yearned',
 'said']

In [11]:
with open("/raid/lingo/akyurek/git/align/COGS/cogs/alignments/lexicon_and_swapables_v2.json","w") as f:
    json.dump({'lexicon': filtered_lexicon, 'swapables': swapables}, f)
    

In [12]:
json.load(open("/raid/lingo/akyurek/git/align/COGS/cogs/alignments/lexicon_and_swapables.json"))

{'lexicon': {'rose': {'rose': 648},
  'helped': {'help': 509},
  'dog': {'dog': 871},
  'sailor': {'sailor': 108},
  'boy': {'boy': 2029},
  'Emma': {'Emma': 4151},
  'rolled': {'roll': 1307},
  'teacher': {'teacher': 424},
  'Evelyn': {'Evelyn': 112},
  'girl': {'girl': 4625},
  'cake': {'cake': 4847},
  'forwarded': {'forward': 182},
  'Levi': {'Levi': 35},
  'Charlotte': {'Charlotte': 244},
  'captain': {'captain': 40},
  'needed': {'need': 115},
  'cookie': {'cookie': 1225},
  'passed': {'pass': 336},
  'ring': {'ring': 54},
  'beside': {'beside': 1801},
  'bed': {'bed': 266},
  'horse': {'horse': 186},
  'table': {'table': 1143},
  'mouse': {'mouse': 250},
  'Amelia': {'Amelia': 145},
  'strawberry': {'strawberry': 298},
  'cat': {'cat': 1139},
  'disintegrated': {'disintegrate': 122},
  'Eleanor': {'Eleanor': 37},
  'sold': {'sell': 693},
  'book': {'book': 242},
  'lended': {'lend': 1155},
  'Benjamin': {'Benjamin': 201},
  'giraffe': {'giraffe': 182},
  'donut': {'donut': 2074}