In [None]:
from utils import *

# Load the non-adjudicated SALSA training annotations.
data = load_data('../data/salsa-non-adjudicated/salsa_train.json')

# Distinct system names, and the annotation keys of the first sentence.
systems = {sent['system'] for sent in data}
edit_types = set(data[0]['annotations'].keys())

### Frequency & Composition of Structure Edits

In [None]:
# Structure-edit subtypes and their manually entered shares (each array sums to 100).
subtypes = [
    'Voice Change',
    'Part-of-Speech Change',
    'Tense Change',
    'Grammatical Number',
    'Clausal Change'
]
quality = np.array([35, 25, 25, 10, 5])
error = np.array([25, 30, 25, 15, 5])

# One colour per subtype: colorscale is applied to the base 'structure' colour
# once for every factor.
color = color_mapping['structure']
scalar = [0.8, 1, 1.2, 1.4, 1.6]
colors = [colorscale(color, factor) for factor in scalar]

fig, axs = plt.subplots(1, 2, figsize=(7, 3))
panels = zip(
    axs,
    (quality, error),
    ('Quality Structure Changes', 'Error Structure Changes'),
)
for axis, shares, caption in panels:
    # labeldistance=None keeps the labels off the wedges; they are only used by the legend.
    axis.pie(shares, colors=colors, labels=subtypes, labeldistance=None)
    axis.set_xlabel(caption)

# A single legend, placed to the right of the second pie, serves both panels.
axs[1].legend(subtypes, loc='center left', bbox_to_anchor=(1, 0.5))

out_filename = "../paper/plot/appendix/structure-breakdown.pdf"
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)
plt.show()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(6, 3))

# Thin black outline around every wedge.
wedgeprops = {
    "edgecolor": "black",
    'linewidth': 0.7,
    'antialiased': True
}

# (axis, composite edit type passed to count_dataset_composite_edits, caption under the pie)
panels = [
    (axs[0], 'structure', 'Structure Edits'),
    (axs[1], 'split', 'Split Edits'),
]
for axis, composite_type, caption in panels:
    composition = count_dataset_composite_edits(data, composite_type)
    labels, values = zip(*composition.items())
    axis.pie(
        values,
        colors=[color_mapping[l] for l in labels],
        labels=labels,
        labeldistance=1.2,
        wedgeprops=wedgeprops,
        autopct='%1.f%%',
        pctdistance=0.7,
        textprops={'fontsize': 10},
    )
    axis.set_xlabel(caption, fontsize=14)

out_filename = "../paper/plot/appendix/composite-breakdown.pdf"
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)
plt.show()

### Error Rate Among Systems

In [None]:
# Share of edits that carry an error type, over non-human systems only.
not_human = [sent for sent in data if 'Human' not in sent['system']]

all_edits = []
for sent in not_human:
    all_edits.extend(sent['processed_annotations'])
error_edits = [edit for edit in all_edits if edit['error_type'] is not None]

error_share = len(error_edits) / len(all_edits)
print(f'% all edits that are errors: {error_share:.2f}')

In [None]:
# Number of error edits in each sentence, in the same order as `data`.
count = [
    sum(edit['error_type'] is not None for edit in sent['processed_annotations'])
    for sent in data
]

# Sentences grouped by a substring of their system name.
human, gpt_few, gpt_zero, muss = (
    [sent for sent in data if tag in sent['system']]
    for tag in ('Human', 'few', 'zero', 'Muss')
)

# Percent of all sentences that contain an error
def error_rate(data, exclude_bad_deletion=False):
    """Return the fraction of sentences in `data` that contain at least one error edit.

    Args:
        data: sequence of sentence dicts; each must have a 'processed_annotations'
            list of edit dicts carrying an 'error_type' entry (None = no error).
        exclude_bad_deletion: when True, edits whose error type is
            Error.BAD_DELETION are not counted as errors.

    Returns:
        A float in [0, 1], or NaN when `data` is empty (the rate is undefined
        there; previously this raised ZeroDivisionError).
    """
    def counts_as_error(edit):
        error_type = edit['error_type']
        if error_type is None:
            return False
        # Error is only looked up when the flag is set and the edit has an error type.
        return not exclude_bad_deletion or error_type != Error.BAD_DELETION

    if len(data) == 0:
        return float('nan')

    with_error = sum(
        1 for sent in data
        if any(counts_as_error(edit) for edit in sent['processed_annotations'])
    )
    return with_error / len(data)


# Edit-level error share over every system, then sentence-level error rates per system.
all_edits = [edit for sent in data for edit in sent['processed_annotations']]
error_edits = [edit for edit in all_edits if edit['error_type'] is not None]

edit_error_share = len(error_edits) / len(all_edits)
print(f'% all edits that are errors: {edit_error_share:.2f}')
print(f'Error rate for all systems: {error_rate(data):.2f}')
print(f'Error rate for MUSS: {error_rate(muss):.2f}')
print(f'Error rate for GPT-zero: {error_rate(gpt_zero):.2f}')

human_rate = error_rate(human)
few_rate = error_rate(gpt_few)
print(f'Error rate for human/GPT-few: {human_rate:.2f} / {few_rate:.2f}')

human_rate_no_del = error_rate(human, exclude_bad_deletion=True)
few_rate_no_del = error_rate(gpt_few, exclude_bad_deletion=True)
print(f'Error rate for human/GPT-few excluding bad deletion: {human_rate_no_del:.2f} / {few_rate_no_del:.2f}')

In [None]:
class Tmp(Enum):
    """Extra bar category for sentences that contain no error edit."""
    NO_ERROR = 'No Error'

# Bar width.
# NOTE(review): the original cell read `width` without any earlier visible cell
# assigning it; 0.65 is the value the edit-size cell uses -- confirm.
width = 0.65

cust_systems = [s for s in systems if 'Human' not in s] + ['aggregated/human']

# Copy the shared mapping so that adding the "No Error" colour does not mutate
# color_mapping for the other cells.
cust_mapping = dict(color_mapping)
cust_mapping[Tmp.NO_ERROR] = '#64C466'

fig, ax = plt.subplots(1, len(cust_systems), figsize=(20, 3), sharey=True, gridspec_kw = {'wspace': 0.1, 'hspace':0})
for i, system in enumerate([s for s in all_system_labels if s in cust_systems]):
    # 'aggregated/human' is a pseudo-system pooling every system whose name contains 'Human'.
    if system == 'aggregated/human':
        selected = [sent for sent in data if 'Human' in sent['system']]
    else:
        selected = [sent for sent in data if sent['system'] == system]

    # Sentences grouped by the error types they contain (a sentence may appear in several groups).
    error_segmentation = {}
    for error in Error:
        error_segmentation[error] = [
            sent for sent in selected
            if any(ann['error_type'] == error for ann in sent['processed_annotations'])
        ]
    # Sentences without any error edit. The original compared against the loop
    # variable left over from the loop above, i.e. only against the last Error member.
    error_segmentation[Tmp.NO_ERROR] = [
        sent for sent in selected
        if all(ann['error_type'] is None for ann in sent['processed_annotations'])
    ]

    # (category, average sentence score); categories whose average is 0 are dropped,
    # the rest are sorted by ascending score.
    pts = [(label, avg([s['score'] for s in sents])) for label, sents in error_segmentation.items()]
    pts = sorted([p for p in pts if p[1] != 0], key=lambda x: x[1])

    ax[i].axhline(0, linestyle='-', color='black', linewidth=0.3)

    ax[i].bar([p[0].value for p in pts], [p[1] for p in pts], width, color=[cust_mapping[p[0]] for p in pts])
    ax[i].set_title(system_name_mapping[system])
    ax[i].set_yticks(np.arange(-2, 2.5, 0.5))


# Slant the category labels of every panel so they do not overlap.
for tick in [i for j in [x.get_xticklabels() for x in ax] for i in j]:
    tick.set_rotation(45)
    tick.set_horizontalalignment('right')

ax[0].set_ylabel('Our Score')
out_filename = f'../paper/plot/appendix/error-scores.pdf'
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)
plt.show()

### Character Length of Quality / Error Edits

In [None]:
# Edit families whose average edit size is compared in this section.
families = ['elaboration', 'generalization', 'paraphrase', 'split', 'reorder', 'structure']


class Tmp(Enum):
    """Single-member enum standing in for the "no error" category."""

    NO_ERROR = 'No Error'

import scipy.stats as st
def ci(int_data):
    """Return the half-width of the 95% Student-t confidence interval for the mean of `int_data`.

    Args:
        int_data: sequence of numbers.

    Returns:
        (upper - lower) / 2 of the interval, or 0 when the lower bound is
        <= 0.01 or NaN.
    """
    # The confidence level is passed positionally: SciPy renamed the keyword from
    # `alpha` to `confidence` and later removed `alpha`, so the positional form is
    # the one that works across versions.
    y1, y2 = st.t.interval(0.95, df=len(int_data)-1, loc=np.mean(int_data), scale=st.sem(int_data))
    # If it's too big, we just won't even return it
    if y1 <= 0.01 or math.isnan(y1):
        return 0
    return (y2-y1) / 2

width = 0.65
annotations = [x for y in [sent['processed_annotations'] for sent in data] for x in y]
fig, ax = plt.subplots(1, 2, figsize=(7, 4))

# Left panel: (family, average size, confidence half-width) per edit family.
# Families whose average is 0 are dropped; the rest are sorted by descending size.
fam_size = []
for family in families:
    sizes = [x['size'] for x in get_annotations_by_edit_family(data, family)]
    fam_size.append((family, avg(sizes), ci(sizes)))
pts = sorted([p for p in fam_size if p[1] != 0], key=lambda x: x[1], reverse=True)
ax[0].bar([p[0].capitalize() for p in pts], [p[1] for p in pts], width, yerr=[p[2] for p in pts], color=[color_mapping[p[0]] for p in pts])

# Right panel: the same statistics per error type, plus a "No Error" group.
anns = annotations
error_segmentation = {}
for error in Error:
    error_segmentation[error] = [a for a in anns if a['error_type'] == error]
# Edits without an error type. The original tested the leftover loop variable
# (`error == None`), which is never true, so this group was always empty.
error_segmentation[Tmp.NO_ERROR] = [a for a in anns if a['error_type'] is None]

# Local copy of the colours with an entry for the "No Error" bar; '#64C466' is the
# colour the error-score figure uses for the same category.
bar_colors = dict(color_mapping)
bar_colors[Tmp.NO_ERROR] = '#64C466'

pts = []
for label, group in error_segmentation.items():
    sizes = [s['size'] for s in group]
    pts.append((label, avg(sizes), ci(sizes)))
pts = sorted([p for p in pts if p[1] != 0], key=lambda x: x[1], reverse=True)
ax[1].bar([p[0].value for p in pts], [p[1] for p in pts], width, yerr=[p[2] for p in pts], color=[bar_colors[p[0]] for p in pts])

ax[0].set_ylabel('Avg. Char Length')
ax[1].set_ylabel('')

ax[0].set_yticks(np.arange(0, 0.085, 0.02))
ax[1].set_yticks(np.arange(0, 0.25, 0.04))

# Slant the category labels so they do not overlap.
for tick in ax[0].get_xticklabels() + ax[1].get_xticklabels():
    tick.set_rotation(45)
    tick.set_horizontalalignment('right')

out_filename = f'../paper/plot/appendix/edit-sizes.pdf'
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)
plt.show()

### Sentence-level Scores Between Systems

In [None]:
# & -- & -- & -- & -- & -- & -- & -- & -- \tabularnewline

# Print the avg. sentence scores for each system
# (one LaTeX row per system: mean and sample std of every subscore, then of the overall score).
subscores = ['lexical', 'syntax', 'content', 'error', 'quality']


def std(x):
    """Sample standard deviation (ddof=1) of `x`, rounded to two decimals."""
    return round(np.std(x, ddof=1), 2)


# Every non-human system in display order, followed by the pooled human pseudo-system.
report_systems = [s for s in all_system_labels if s in systems and 'Human' not in s] + ['aggregated/human']

rows = []
for system in report_systems:
    if system == 'aggregated/human':
        selected = [sent for sent in data if 'Human' in sent['system']]
    else:
        selected = [sent for sent in data if sent["system"] == system]

    cells = [f'{system_name_mapping[system]}']

    for score in subscores:
        selected_scores = [s["subscores"][score] for s in selected]
        cells += [f'{avg(selected_scores, 2):.2f}', f'{std(selected_scores):.2f}']

    selected_scores = [s["score"] for s in selected]
    cells += [f'{avg(selected_scores, 2):.2f}', f'{std(selected_scores):.2f}']

    line = ' & '.join(cells)
    rows.append(f'{line} \\tabularnewline\n')

table = ''.join(rows)
print(table)

### Sentence Length Impacts Edit Frequency

In [None]:
# Load annotated ASSET data
# (batches 1-4 with preprocessing enabled, read through the older dataloader module;
# later cells plot `asset` next to `data`).
import utils.dataloader_old as dl
asset = dl.load_data('../data/annotated', batch_num=[1, 2, 3, 4], preprocess=True)

In [None]:
# Store the edit distance between the original and simplified text on every
# sentence of both datasets, under the 'ed' key read by the plotting cells below.
for dataset in (data, asset):
    for sent in dataset:
        sent['ed'] = edit_dist(sent['original'], sent['simplified'])

In [None]:
import random

# Maximum number of sentences drawn from each dataset.
n = 300

# Private, seeded generator: the sampled points (and therefore the saved figure)
# are the same on every run, and the global `random` state is left untouched.
rng = random.Random(0)


def _sample_edit_points(dataset, k):
    """Return up to `k` randomly chosen (number of edits, edit distance) pairs from `dataset`."""
    pts = [(len(sent['processed_annotations']), sent['ed']) for sent in dataset]
    # Sampling more items than exist raises ValueError, so cap the sample size.
    return rng.sample(pts, min(k, len(pts)))


# (dataset, marker colour, legend label, marker shape)
series = [
    (data, 'red', 'SimpEval', '*'),
    (asset, 'blue', 'ASSET', 'P'),
]
for dataset, point_color, series_label, marker in series:
    pts = _sample_edit_points(dataset, n)
    x, y = [p[0] for p in pts], [p[1] for p in pts]
    plt.scatter(x, y, color=point_color, label=series_label, marker=marker, alpha=0.5)

plt.xticks(np.arange(0, 27, 1))

plt.xlabel('Number of Edits')
plt.ylabel('Edit Distance')
plt.legend()
out_filename = f'../paper/plot/appendix/edit-distance-num-edits.pdf'
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)
plt.show()

### Sentence Split Frequency

In [None]:
# Buckets holding this many sentences or fewer are plotted as 0
# (a bucket is only used when it has strictly more sentences than this).
representative_sample_size = 25


def _split_proportions_by_bucket(dataset, size_of, start_amt, end_amt, bucket_size=20):
    """Bucket `dataset` by `size_of(sentence)` and measure how often 1, 2 or 3 splits occur.

    Args:
        dataset: sequence of sentence dicts with an 'edits' list whose items have a 'type'.
        size_of: function mapping a sentence to the number it is bucketed by.
        start_amt, end_amt: range covered; (end_amt - start_amt) // bucket_size
            buckets of width `bucket_size` are created starting at `start_amt`.
        bucket_size: width of each bucket.

    Returns:
        {amt: [(bucket_start, proportion), ...]} for amt in 1..3, where proportion is
        the share of the bucket's sentences having exactly `amt` split edits, or 0
        when the bucket does not exceed `representative_sample_size` sentences.
    """
    pts = {amt: [] for amt in range(1, 4)}
    for i in range((end_amt - start_amt) // bucket_size):
        start_size = start_amt + i * bucket_size
        end_size = start_size + bucket_size

        selected = [s for s in dataset if start_size <= size_of(s) < end_size]
        # Number of split edits in each selected sentence.
        splits_per_sent = [
            len([a for a in sent['edits'] if a['type'] == 'split']) for sent in selected
        ]

        for amt in range(1, 4):
            if len(selected) > representative_sample_size:
                pts[amt].append((start_size, splits_per_sent.count(amt) / len(selected)))
            else:
                pts[amt].append((start_size, 0))
    return pts


def _plot_stacked_split_hist(axis, pts):
    """Draw the proportions from `_split_proportions_by_bucket` on `axis`, stacking 1, 2 and 3 splits."""
    edges = [p[0] for p in pts[1]]
    bottom = [0 for _ in range(len(edges) - 1)]
    for amt in range(1, 4):
        # NOTE(review): the bucket starts serve as both samples and bin edges, so there
        # is one bin fewer than buckets; the weights skip the FIRST bucket, which draws
        # bucket i+1's proportion over bin i. Kept as in the original -- confirm whether
        # this one-bucket offset is intended.
        val = [p[1] for p in pts[amt][1:]]
        axis.hist(edges[:-1], edges,
            bottom=bottom, weights=val,
            color=color_mapping[f'split-{amt}'], edgecolor='black', linewidth=1.2)
        bottom = [b + p for b, p in zip(bottom, val)]


fig, ax = plt.subplots(2, 2, figsize=(5, 5), sharey=True)

# One row per dataset: left column bucketed by edit distance, right column by length.
for k, dataset in enumerate([data, asset]):
    by_edit_distance = _split_proportions_by_bucket(
        dataset, lambda s: s['ed'], start_amt=0, end_amt=205)
    _plot_stacked_split_hist(ax[k, 0], by_edit_distance)

    # NOTE(review): this buckets by the length of the SIMPLIFIED text, while the axis
    # is labelled 'Original Sentence Length' below -- confirm which one is intended.
    by_length = _split_proportions_by_bucket(
        dataset, lambda s: len(s['simplified']), start_amt=75, end_amt=300)
    _plot_stacked_split_hist(ax[k, 1], by_length)

ax[0, 0].set_xticks([])
ax[0, 1].set_xticks([])
ax[1, 0].set_xlabel('Edit Distance')
ax[1, 1].set_xlabel('Original Sentence Length')
ax[0, 0].set_ylabel('% SimpEval w/ Split')
ax[1, 0].set_ylabel('% ASSET w/ Split')
ax[0, 0].set_yticks(np.arange(0, 1.1, 0.2))

fig.legend(labels=[
        '1 Split',
        '2 Splits',
        '3 Splits',
    ], loc='lower center', bbox_to_anchor=(0.54, -0.06), framealpha=1, frameon=False,
    handlelength=1, handleheight=1, ncol=3, handletextpad=0.5, columnspacing=0.7
)

out_filename = f'../paper/plot/appendix/split-edit-sizes.pdf'
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)
plt.show()

### SALSA Dataset Statistics

In [None]:
class Quality_Condition(Enum):
    """Quality-edit rows of the dataset statistics table; each value is the row's display name."""

    ELABORATION = 'Elaboration'
    GENERALIZATION = 'Generalization'
    WORD_REORDER = 'Word-level Reorder'
    COMPONENT_REORDER = 'Component-level Reorder'
    SPLIT = 'Sentence Split'
    STRUCTURE = 'Structure Change'
    PARAPHRASE = 'Paraphrase'

class Error_Condition(Enum):
    """Error-edit rows of the dataset statistics table; each value is the row's display name."""

    BAD_DELETION = 'Bad Deletion'
    COREFERENCE = 'Coreference'
    REPETITION = 'Repetition'
    CONTRADICTION = 'Contradiction'
    FACTUAL_ERROR = 'Factual Error'
    IRRELEVANT = 'Irrelevant'
    BAD_WORD_REORDER = 'Bad Word-level Reorder'
    BAD_COMPONENT_REORDER = 'Bad Component-level Reorder'
    BAD_STRUCTURE = 'Bad Structure Change'
    BAD_SPLIT = 'Bad Sentence Split'
    COMPLEX_WORDING = 'Complex Wording'
    INFORMATION_REWRITE = 'Information Rewrite'
    GRAMMAR = 'Grammar Error'

class Trivial_Condition(Enum):
    """Condition selecting the edits whose type is Quality.TRIVIAL."""

    TRIVIAL = 'Trivial Insertion'

In [None]:
def get_edits_from_condition(edits, condition):
    """Return the edits in `edits` that match `condition`.

    Args:
        edits: list of edit dicts. Depending on the condition the keys 'type',
            'edit_type', 'information_impact', 'reorder_level', 'error_type'
            and 'grammar_error' are read.
        condition: a member of Quality_Condition, Trivial_Condition or
            Error_Condition.

    Returns:
        A new list with the matching edits. Quality conditions require
        type == Quality.QUALITY, the trivial condition requires
        type == Quality.TRIVIAL, and every error condition only considers edits
        with type == Quality.ERROR. None is returned when `condition` matches
        none of the known conditions.
    """
    # (condition, predicate) pairs; the predicates are only evaluated for the
    # condition that matches.
    quality_rules = (
        (Quality_Condition.ELABORATION, lambda e:
            e['information_impact'] == Information.MORE and
            e['type'] == Quality.QUALITY),
        (Quality_Condition.GENERALIZATION, lambda e:
            e['information_impact'] == Information.LESS and
            e['type'] == Quality.QUALITY),
        (Quality_Condition.PARAPHRASE, lambda e:
            e['information_impact'] == Information.SAME and
            e['edit_type'] == 'substitution' and
            e['type'] == Quality.QUALITY),
        (Quality_Condition.SPLIT, lambda e:
            e['edit_type'] == 'split' and
            e['type'] == Quality.QUALITY),
        (Quality_Condition.WORD_REORDER, lambda e:
            e['edit_type'] == 'reorder' and
            e['reorder_level'] == ReorderLevel.WORD and
            e['type'] == Quality.QUALITY),
        (Quality_Condition.COMPONENT_REORDER, lambda e:
            e['edit_type'] == 'reorder' and
            e['reorder_level'] == ReorderLevel.COMPONENT and
            e['type'] == Quality.QUALITY),
        (Quality_Condition.STRUCTURE, lambda e:
            e['edit_type'] == 'structure' and
            e['type'] == Quality.QUALITY),
    )
    for known_condition, keep in quality_rules:
        if condition == known_condition:
            return [e for e in edits if keep(e)]

    if condition == Trivial_Condition.TRIVIAL:
        return [e for e in edits if e['type'] == Quality.TRIVIAL]

    # Everything below only looks at edits marked as errors.
    error_edits = [e for e in edits if e['type'] == Quality.ERROR]

    error_rules = (
        (Error_Condition.CONTRADICTION, lambda e: e['error_type'] == Error.CONTRADICTION),
        (Error_Condition.FACTUAL_ERROR, lambda e: e['error_type'] == Error.FACTUAL),
        (Error_Condition.IRRELEVANT, lambda e: e['error_type'] == Error.IRRELEVANT),
        (Error_Condition.REPETITION, lambda e: e['error_type'] == Error.REPETITION),
        (Error_Condition.BAD_DELETION, lambda e: e['error_type'] == Error.BAD_DELETION),
        (Error_Condition.COREFERENCE, lambda e: e['error_type'] == Error.COREFERENCE),
        (Error_Condition.INFORMATION_REWRITE, lambda e: e['error_type'] == Error.INFORMATION_REWRITE),
        (Error_Condition.BAD_SPLIT, lambda e: e['error_type'] == Error.BAD_SPLIT),
        # Word- and component-level reorder errors share one error type and are
        # told apart by the reorder level.
        (Error_Condition.BAD_WORD_REORDER, lambda e:
            e['error_type'] == Error.BAD_REORDER and
            e['reorder_level'] == ReorderLevel.WORD),
        (Error_Condition.BAD_COMPONENT_REORDER, lambda e:
            e['error_type'] == Error.BAD_REORDER and
            e['reorder_level'] == ReorderLevel.COMPONENT),
        (Error_Condition.BAD_STRUCTURE, lambda e: e['error_type'] == Error.BAD_STRUCTURE),
        (Error_Condition.COMPLEX_WORDING, lambda e: e['error_type'] == Error.COMPLEX_WORDING),
        # Grammar errors are flagged separately from the error type.
        (Error_Condition.GRAMMAR, lambda e: e['grammar_error'] == True),
    )
    for known_condition, keep in error_rules:
        if condition == known_condition:
            return [e for e in error_edits if keep(e)]

    return None
        

In [None]:
# From here on `data` is rebound to the adjudicated train + test annotations.
train_data = load_data('../data/inspection_rating_annotated', preprocess=True, adjudicated=True)
test_data = load_data('../data/test_set_inspection_rating_annotated', preprocess=True, adjudicated=True)
data = train_data + test_data

# Flat list of every processed edit across all sentences.
all_edits = [edit for sent in data for edit in sent['processed_annotations']]
print(f"Total number of edits: {len(all_edits)}")

In [None]:
# Build the LaTeX body of the dataset statistics table: one row per condition with
# the number of edits, the summed token size and the average rating.
table = ''

# Quality edits
table += '\\multirow{7}{*}{\\rotatebox[origin=c]{90}{Quality Evaluation}}'
for condition in Quality_Condition:
    edits = get_edits_from_condition(all_edits, condition)
    table += f' & {condition.value} & {len(edits)} & {sum([x["token_size"] for x in edits])} & {avg([x["rating"] for x in edits], prec=2)} \\\\\n'

# '\\midrule' yields the LaTeX command \midrule. The original literal '\\\midrule'
# relied on the invalid escape '\m' and emitted '\\midrule', i.e. a row break
# followed by the plain text "midrule".
# NOTE(review): \multirow spans 15 rows while Error_Condition has 13 members -- confirm.
table += '\\midrule\n\\multirow{15}{*}{\\rotatebox[origin=c]{90}{Error Evaluation}}'

# Error edits: edits without a rating are skipped and the average is printed with its sign flipped.
for condition in Error_Condition:
    edits = get_edits_from_condition(all_edits, condition)
    table += f' & {condition.value} & {len(edits)} & {sum([x["token_size"] for x in edits])} & {-avg([x["rating"] for x in edits if x["rating"] is not None], prec=2)} \\\\\n'

# Trivial edits get a fixed rating of 0.
edits = get_edits_from_condition(all_edits, Trivial_Condition.TRIVIAL)
table += f'\\midrule\n & Trivial Change & {len(edits)} & {sum([x["token_size"] for x in edits])} & {0} \\\\\n'

print(table)