In [4]:
import json
import pandas as pd

In [None]:
# What is a "change"? Could be any of these:
# 1. Number of edits
# 2. Number of deletions before/after
# 3. Number of paraphrases before/after
# 4. Increase/decrease in the number of edits

In [7]:
def match_changes(filename_before, filename_after, batch=None):
    """Pair each sentence's alignments before and after span-fixing.

    Both files are JSON lists of sentence sets. Every set is read for the keys
    'ID', 'Worker', 'Original', 'Deletions', 'Paraphrases' and 'Splittings'.
    Each entry of the last three lists is a sequence whose element 0 is stored
    as "Simplified", element 1 as "System", and whose remaining elements are
    stored as the alignment.

    Sets are matched across the two files by 'ID'; sentences inside a set are
    matched by their element 1. When several candidates match, the first one
    in the "before" file is used.

    Args:
        filename_before: Path to the JSON annotations before span-fixing.
        filename_after: Path to the JSON annotations after span-fixing.
        batch: Optional label copied into the "Batch" field of every record.

    Returns:
        A list of dicts, one per sentence of the "after" file, with the keys
        "Batch", "Worker", "Original", "Simplified", "System",
        "Alignment Before" and "Alignment After".

    Raises:
        IndexError: If a set or a sentence of the "after" file has no
            counterpart in the "before" file.
    """
    # JSON is read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(filename_before, encoding='utf-8') as f, \
            open(filename_after, encoding='utf-8') as g:
        orig = json.load(f)
        fixed = json.load(g)

    print(f"Matching: {filename_before} ==> {filename_after}")

    # Index the original sets by ID once instead of rescanning the list for
    # every fixed set; setdefault keeps the first occurrence of a repeated ID.
    orig_by_id = {}
    for orig_entry in orig:
        orig_by_id.setdefault(orig_entry['ID'], orig_entry)

    matched_changes = []
    for fixed_set in fixed:
        # Find the matching sentence set between orig and fixed
        set_id = fixed_set['ID']
        if set_id not in orig_by_id:
            raise IndexError(
                f"No sentence set with ID {set_id!r} in {filename_before}"
            )
        orig_entry = orig_by_id[set_id]

        # Combine the sentences within both sets
        orig_set = orig_entry['Deletions'] + orig_entry['Paraphrases'] + orig_entry['Splittings']
        for fixed_sent in fixed_set['Deletions'] + fixed_set['Paraphrases'] + fixed_set['Splittings']:
            # Find the matching sentence between orig and fixed (first match wins)
            orig_sent = next((s for s in orig_set if s[1] == fixed_sent[1]), None)
            if orig_sent is None:
                raise IndexError(
                    f"No sentence for system {fixed_sent[1]!r} in set {set_id!r} "
                    f"of {filename_before}"
                )

            # Append the matched pair to the final list
            matched_changes.append({
                "Batch": batch,
                "Worker": fixed_set['Worker'],
                "Original": fixed_set['Original'],
                "Simplified": fixed_sent[0],
                "System": fixed_sent[1],
                "Alignment Before": orig_sent[2:],
                "Alignment After": fixed_sent[2:]
            })

    return matched_changes

In [43]:
# Match every batch in order and flatten the per-batch records into one list
matched = [
    record
    for before_file, after_file, batch_no in (
        ("data/batch_1_1_span_fixing.json", "data/batch_1_2_marcus_simplification_rating.json", 1),
        ("data/batch_2_1_span_fixing.json", "data/batch_2_2_simplification_rating.json", 2),
        ("data/batch_3_1_span_fixing.json", "data/batch_3_2_simplification_rating.json", 3),
    )
    for record in match_changes(before_file, after_file, batch=batch_no)
]

Matching: batch_1_1_span_fixing.json ==> batch_1_2_marcus_simplification_rating.json
Matching: batch_2_1_span_fixing.json ==> batch_2_2_simplification_rating.json
Matching: batch_3_1_span_fixing.json ==> batch_3_2_simplification_rating.json


In [68]:
# Summary statistics for the number of alignments per matched sentence before span-fixing
pd.Series([len(x['Alignment Before']) for x in matched]).describe()

count    1800.000000
mean        3.584444
std         2.614277
min         0.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        16.000000
dtype: float64

In [46]:
def count_len_changes(matched):
    """Return, per record, how many alignments were gained or lost.

    Each value is len('Alignment After') - len('Alignment Before'), so a
    negative number means the record has fewer alignments afterwards.

    Args:
        matched: Iterable of dicts with 'Alignment Before' and
            'Alignment After' entries.

    Returns:
        A list of integer differences, in the order of `matched`.
    """
    deltas = []
    for record in matched:
        n_after = len(record['Alignment After'])
        n_before = len(record['Alignment Before'])
        deltas.append(n_after - n_before)
    return deltas

In [47]:
def count_avg_changes(matched):
    """Annotate every record with the number of spans that changed.

    Despite the name, no average is computed here: each record gets a
    'Changes' entry holding the number of spans that appear in only one of
    'Alignment Before' / 'Alignment After' (spans removed plus spans added).

    Spans are paired one-to-one, so a span that occurs twice after fixing but
    only once before counts as one added span rather than as two unchanged
    ones (which could previously drive the count below zero).

    Args:
        matched: List of dicts with 'Alignment Before' and 'Alignment After'
            lists. The dicts are modified in place.

    Returns:
        The same `matched` object, with 'Changes' set on every record.
    """
    for edit in matched:
        spans_before = edit['Alignment Before']
        spans_after = edit['Alignment After']

        # Working copy: every original span may be consumed by at most one
        # fixed span, which makes the matching one-to-one.
        unmatched_before = list(spans_before)
        unchanged = 0
        for span_fixed in spans_after:
            if span_fixed in unmatched_before:
                unmatched_before.remove(span_fixed)
                unchanged += 1

        # Each unchanged span is present in both lists, hence subtracted twice;
        # what remains are the deleted and the newly added spans.
        edit['Changes'] = len(spans_after) + len(spans_before) - (unchanged * 2)

    return matched

In [53]:
# Distribution of the number of changes: fraction of matched sentences per change count.
# Note: count_avg_changes adds a 'Changes' key to every dict in `matched` in place.
pd.Series([x['Changes'] for x in count_avg_changes(matched)]).value_counts() / len(matched)

0     0.367778
2     0.146667
1     0.111667
3     0.105556
4     0.090556
5     0.062778
6     0.045556
7     0.022222
8     0.018889
9     0.012222
10    0.008333
14    0.002778
11    0.002222
15    0.001111
13    0.001111
18    0.000556
dtype: float64

In [19]:
def count_no_changes(matched):
    """Count the records whose alignments are identical before and after.

    Two alignment lists are considered identical when they have the same
    length and, position by position, the first three fields of every span
    are equal. The comparison is order-sensitive.

    Args:
        matched: Iterable of dicts with 'Alignment Before' and
            'Alignment After' lists.

    Returns:
        The number of records with no change at all.
    """
    unchanged_total = 0
    for record in matched:
        spans_before = record['Alignment Before']
        spans_after = record['Alignment After']

        if len(spans_before) != len(spans_after):
            continue

        # Every field pair is compared (no early exit) before deciding.
        mismatches = [
            spans_before[row][col] != spans_after[row][col]
            for row in range(len(spans_before))
            for col in range(3)
        ]
        if not any(mismatches):
            unchanged_total += 1

    return unchanged_total

In [20]:
def count_type_changes(matched, type_="del"):
    """Return, per record, the change in the number of spans of one type.

    A span belongs to the requested type when its first field equals the
    type's id (0 for "del", 1 for "par"). Each value is the count in
    'Alignment Before' minus the count in 'Alignment After', so a positive
    number means fewer spans of that type afterwards.

    Args:
        matched: Iterable of dicts with 'Alignment Before' and
            'Alignment After' lists.
        type_: "del" for deletions or "par" for paraphrases.

    Returns:
        A list of integer differences, in the order of `matched`.

    Raises:
        ValueError: If `type_` is neither "del" nor "par". Such a value used
            to match nothing and silently produce all-zero results.
    """
    type_ids = {"del": 0, "par": 1}
    if type_ not in type_ids:
        raise ValueError(
            f"Unknown type_ {type_!r}; expected one of {sorted(type_ids)}"
        )
    type_id = type_ids[type_]

    total_type_change = []
    for edit in matched:
        n_before = sum(1 for change in edit['Alignment Before'] if change[0] == type_id)
        n_after = sum(1 for change in edit['Alignment After'] if change[0] == type_id)
        total_type_change.append(n_before - n_after)
    return total_type_change

In [None]:
# Number of matched sentences whose alignments are unchanged: same number of spans,
# same order, and equal first three fields of every span
count_no_changes(matched)

647

In [57]:
# Statistics on the change in the number of alignments per sentence
# (after minus before, so negative values mean fewer alignments after span-fixing)
pd.Series(count_len_changes(matched)).describe()

count    1800.000000
mean       -0.582222
std         1.341150
min        -6.000000
25%        -1.000000
50%         0.000000
75%         0.000000
max         4.000000
dtype: float64

In [66]:
# Fraction of sentences per change in the number of deletions
# (before minus after, so positive values mean fewer deletions after span-fixing)
pd.Series(count_type_changes(matched, type_="del")).value_counts() / len(matched)

 0    0.672778
 1    0.129444
-1    0.122222
 2    0.030000
-2    0.026667
 3    0.008333
-3    0.005000
-4    0.002222
 4    0.001667
-7    0.001111
-5    0.000556
dtype: float64

In [64]:
# Fraction of sentences per change in the number of paraphrases
# (before minus after, so positive values mean fewer paraphrases after span-fixing)
pd.Series(count_type_changes(matched, type_="par")).value_counts() / len(matched)

 0    0.548889
 1    0.228889
 2    0.100556
 3    0.046667
-1    0.038889
-2    0.015000
 4    0.013889
 5    0.003889
-3    0.001111
-4    0.001111
 6    0.001111
dtype: float64

In [30]:
# For David's project: Recover the highest rated paraphrase sentence in each batch
import json

def extract_sents(filename, batch=None):
    """Extract the first paraphrase of every annotated sentence in a file.

    The file is a JSON list of entries read for the keys 'ID', 'Original' and
    'Paraphrases'. Only the first paraphrase (index 0) of each entry is used;
    its elements 0, 1 and 2 are stored as "Rating", "Simplified" and "System".
    NOTE(review): presumably the paraphrases are already sorted by rating, so
    index 0 is the highest-rated one; this code does not sort or check that.

    Args:
        filename: Path to the JSON annotation file.
        batch: Optional label copied into the "Batch" field of every record.

    Returns:
        A list of dicts with the keys "Batch", "ID", "Original",
        "Simplified", "Rating" and "System", one per entry in the file.

    Raises:
        IndexError: If an entry has an empty 'Paraphrases' list.
    """
    # Read explicitly as UTF-8, matching how this notebook writes its JSON
    # output, instead of relying on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        annotations = json.load(f)

    return [{
        "Batch": batch,
        "ID": sent['ID'],
        "Original": sent['Original'],
        "Simplified": sent['Paraphrases'][0][1],
        "Rating": sent['Paraphrases'][0][0],
        "System": sent['Paraphrases'][0][2]
    } for sent in annotations]

# Gather the first paraphrase of every sentence from the final batch 2 and batch 3 files
extracted = [
    row
    for final_file, batch_no in (
        ("data/batch_2_3_marcus_final.json", 2),
        ("data/batch_3_3_marcus_final.json", 3),
    )
    for row in extract_sents(final_file, batch=batch_no)
]

# Written as UTF-8 with non-ASCII characters kept as-is (not \u-escaped)
with open('../phrase-alignment/code/top_sent_mounica.json', 'w', encoding='utf-8') as f:
    json.dump(extracted, f, indent=4, ensure_ascii=False)