In [3]:
from collections import Counter

In [4]:
import Levenshtein

In [5]:
import difflib
import pandas as pd
import numpy as np

In [6]:
# Get the edit distance representation
def get_edit_distance_repr(components, blend):
    add=False
    output=[]
    for c in difflib.ndiff(components, blend):
        if c[0]=='-':
            output.append('D')
        elif c[0]=='+':
            add = True
            output.append(c[1:].strip())
        else:
            output.append('C')
    return ''.join(output), add

In [7]:
# Get the string from the edit distance encoding
def get_from_copy_edit(example, copyedits):
    output_str=[]
    for i,s in enumerate(list(example)):
        if copyedits[i] == 'C':
            output_str.append(s)
    return u''.join(output_str)

In [10]:
# Get the evaluation scores
def get_scores(gold_preds_file, output_preds_file):
    gold_preds=[]
    for l in open(gold_preds_file):
        gold_preds.append(l.strip().split(':')[-1])
        
    output_preds=[]
    for l in open(output_preds_file):
        output_preds.append(l.strip().split(':')[-1])
    print(gold_preds[:10], output_preds[:10])
    edit_scores=[Levenshtein.distance(unicode(p),unicode(g)) for p, g in zip(output_preds, gold_preds) if g in gold_preds_ours]
    dfrecs=[(p, g, Levenshtein.distance(unicode(p),unicode(g))) for p, g in zip(output_preds, gold_preds) if g in gold_preds_ours]
    dfr=pd.DataFrame().from_records(dfrecs, columns=['p','g','d'])
    print("Mean Edit Distance", np.mean(edit_scores))

In [11]:
df = pd.read_csv('./data/blends_cmu.txt', header=None, names=['word','c1','c2','dataset'])

In [12]:
bdf=pd.read_csv('./data/components-blends-blind.csv', sep='\t', index_col=0)

In [13]:
gold_preds_ours=[get_from_copy_edit(s, t) for s, t in zip(bdf.source.values, bdf.target.values)]

In [14]:
len(gold_preds_ours)

1078

In [15]:
blind_set=set(df[df.dataset=='other'].word.values) - set(df[df.dataset=='knight'].word.values)

# Best forward (Exhaustive decoding)

In [16]:
get_scores("./data/best_blind_forward_gangal_gold.txt", "./data/best_blind_forward_gangal_preds.txt")

['shopathon', 'fashism', 'brick', 'wikiquette', 'alternawhore', 'companding', 'clownsident', 'enculturement', 'carjack', 'diamat'] ['shopparathon', 'fashism', 'brickberry', 'wikiquette', 'alternore', 'companding', 'clowident', 'enlulture', 'carjack', 'dialerialism']


NameError: name 'unicode' is not defined

# Best backward (Exhaustive decoding)

In [70]:
get_scores("./data/best_blind_backward_gangal_gold.txt", "./data/best_blind_backward_gangal_preds.txt")

['shopathon', 'fashism', 'brick', 'wikiquette', 'alternawhore', 'companding', 'clownsident', 'enculturement', 'carjack', 'diamat'] ['shoathon', 'fashism', 'brickberry', 'wiquette', 'alterhore', 'comprexpanding', 'closident', 'enlighture', 'carjack', 'dialerialism']
Mean Edit Distance 1.7787037037037037


# Knight's FST Baseline

In [71]:
aliya=pd.read_csv("./data/dataAliyaScraped_exact.csv", header=None, names=["w1", "w2", "pred"], sep=' ')
joint_words=[w1+'}'+w2 for w1,w2 in zip(aliya.w1.values, aliya.w2.values)]
aliya["full"]=joint_words
aliya=aliya[aliya.full.isin(set(bdf.source.values))]
bdf["full"]=bdf["source"]
bdf["gold_preds"]=gold_preds_ours
aliya=aliya.merge(bdf, on='full')
edit_scores=[Levenshtein.distance(unicode(p),unicode(g)) for p, g in zip(aliya.pred.values, aliya.gold_preds.values) if g in gold_preds_ours]
print "Baseline Edit Distance", np.mean(edit_scores)

Baseline Edit Distance 2.1076066790352503
