In [None]:
import pandas as pd
import json

# Notebook-wide pandas display options.
# NOTE(review): 0 for max_colwidth is presumably meant to disable truncation of long text cells —
# confirm it behaves that way on the installed pandas version.
pd.set_option("max_colwidth", 0)
# None removes the limit on how many columns / rows are displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## Create a readable csv

In [None]:
# Experiment whose output folder is read; switch by (un)commenting one of the two ids.
# experiment_id = "master_set"
experiment_id = "prod_interface_test"
# Name of the assignments CSV inside that experiment's output folder.
file_name = "20220406-013800_hits_assignments.csv"

# Raw assignments table; the next cell reads its HITId, WorkerId and answers columns.
df = pd.read_csv(f"./union_annotation/scripts/output/{experiment_id}/{file_name}")


In [None]:
# Marker name for workers whose answers should be ignored: map a worker id to this value
# in `known_worker_ids` and all of that worker's assignments are skipped below.
SCAM_CONST = "probably scam"

# Worker ids that are replaced by a readable name in the output.
known_worker_ids = {
    "A2ALTKZTHJC8EF": "eran",
    "A28IB5VURTLDJX": "valentina",
    "A2MLICSDSPTW1T": "aviv_or_royi",
}

# One output row per HIT; each kept assignment contributes a numbered group of columns
# (WorkerId_1, mergedText_1, ..., WorkerId_2, ...).
rows = []
for hit_id, hit in df.groupby('HITId'):
    hit_parsed = {
        "HitId": hit_id,
    }

    worker_count = 0
    for _, assignment in hit.sort_values("WorkerId").iterrows():
        worker_name = known_worker_ids.get(assignment['WorkerId'], assignment['WorkerId'])
        if worker_name == SCAM_CONST:
            # Flagged workers contribute no columns and are not counted.
            continue

        worker_count += 1
        hit_parsed[f"WorkerId_{worker_count}"] = worker_name

        # SECURITY: eval() executes whatever is stored in the CSV cell. Only run this on
        # trusted files. NOTE(review): if the cell is a plain Python literal,
        # ast.literal_eval would be a safe replacement — confirm the format first.
        answers = eval(assignment['answers'])
        task_data = answers['taskData']

        # Task data is stored once per HIT (each assignment overwrites the same keys).
        hit_parsed["sentence1Text"] = task_data['sentence1Text']
        hit_parsed["sentence2Text"] = task_data['sentence2Text']
        hit_parsed["exampleId"] = task_data['exampleId']

        # Per-worker answers.
        hit_parsed[f"chosenSentenceId_{worker_count}"] = answers['chosenSentenceId']
        hit_parsed[f"highlightedPhrases_{worker_count}"] = answers['highlightedPhrases']
        hit_parsed[f"mergedText_{worker_count}"] = answers['mergedText']
        hit_parsed[f"feedbackText_{worker_count}"] = answers['feedbackText']
        hit_parsed[f"skipped_{worker_count}"] = answers['skipped']

    hit_parsed['count_workers'] = worker_count

    rows.append(hit_parsed)

mdf = pd.DataFrame(rows)
# Alphabetical column order, HITs with the most kept assignments first.
mdf = mdf.reindex(sorted(mdf.columns), axis=1)
mdf = mdf.sort_values('count_workers', ascending=False)
mdf.to_csv(f"{experiment_id}_analyzed_{file_name}", index=False)
mdf

## Score turkers' responses

### Read turkers' responses

In [None]:
import pandas as pd

# Turker answers, one row per HIT.
# NOTE(review): the file name suggests a spreadsheet export of the CSV written by the first section — confirm.
turkers_df = pd.read_csv("prod_interface_test_analyzed - prod_interface_test_analyzed_20220406-013800_hits_assignments.csv", skiprows=1)  # skiprows=1 because first row is file metadata (file_name)
turkers_df

### Read master set

In [None]:
# Master-set annotations: rows without a merged text are dropped and only the
# first row of every exampleId is kept.
master_df = (
    pd.read_csv("master_set - master_set_consolidated_3.csv")
    .dropna(subset=['mergedText'])
    .drop_duplicates(subset=['exampleId'])
)
master_df

### Create clusters of spans from merged text and highlighted phrases

In [None]:
import spacy
from scipy.sparse.csgraph import connected_components
import numpy as np
import itertools
from copy import deepcopy
from typing import List, Tuple, Any


# Shared spaCy pipeline; find_new_spans and align_new_and_highlighted_phrases call it to tokenize text.
nlp = spacy.load("en_core_web_sm")

def find_new_spans(text, orig_text, max_gap=2):
    """
    Finds spans in text that weren't in the original text.

    Both texts are tokenized with the module-level `nlp` pipeline. A token of `text` is
    "new" when its exact text does not occur among the tokens of `orig_text`.
    Consecutive new tokens are grouped into one span while the distance between their
    token indices is at most `max_gap` (so one shared token may sit between them with
    the default of 2).

    param text: the text to search for new tokens
    param orig_text: the text whose tokens count as already known
    param max_gap: largest token-index distance between neighbouring tokens of one span
    return: list of {"span": [tokens], "id": running index of the span}
    """

    orig_words = {token.text for token in nlp(orig_text)}
    # Tokens are yielded in document order, so the new tokens are already sorted by position.
    new_tokens = [token for token in nlp(text) if token.text not in orig_words]

    grouped_tokens = []
    for token in new_tokens:
        if grouped_tokens and token.i - grouped_tokens[-1][-1].i <= max_gap:
            grouped_tokens[-1].append(token)
        else:
            grouped_tokens.append([token])

    return [{"span": span, "id": span_id} for span_id, span in enumerate(grouped_tokens)]

def undo_tokenization(tokenized_text):
    """
    tokenization can make it harder to compare texts, especially since it is not consistent
    (turkers might decide not to tokenize), try undoing some stuff like "2 : 09 a . m ." or "4 . 6"

    Repeatedly finds the first match of one of the patterns below and deletes the spaces
    inside that match, until no pattern matches any more.

    param tokenized_text: the text to clean up
    return: the text with the matched spaces removed
    """

    import re

    # Raw strings: in a normal string literal "\d", "\w" and "\." are invalid escape sequences.
    patterns = (
        re.compile(r"\d+ [.:] \d+"),  # spaced-out numbers and times, e.g. "4 . 6", "2 : 09"
        re.compile(r"\w \."),  # a space between a word character and a period, e.g. "m ."
    )

    while True:
        for pattern in patterns:
            match = pattern.search(tokenized_text)
            if match is not None:
                start, end = match.span()
                tokenized_text = (
                    tokenized_text[:start]
                    + tokenized_text[start:end].replace(" ", "")
                    + tokenized_text[end:]
                )
                # Start over from the first pattern, which keeps its priority.
                break
        else:
            # No pattern matched: nothing left to undo.
            return tokenized_text
    

def create_highlighted_phrase_key(highlighted_phrase):
    """Build the dictionary key of a highlighted phrase from its 'id' entry."""
    return "highlighted_span_{}".format(highlighted_phrase['id'])

def create_new_span_key(new_span):
    """Build the dictionary key of a new span from its 'id' entry."""
    return "new_span_{}".format(new_span['id'])

def equal_by_word(new_span, highlighted_phrase):
    """
    Return True when at least one token of the new span has the same text
    (case-insensitively) as a token of the highlighted phrase.

    param new_span: dict whose 'span' entry is a list of tokens (objects with a .text)
    param highlighted_phrase: dict whose 'phrase_parsed' entry is an iterable of tokens
    """
    phrase_words = {token.text.lower() for token in highlighted_phrase['phrase_parsed']}
    # Test the membership itself rather than the truthiness of the matching token objects.
    return any(token.text.lower() in phrase_words for token in new_span['span'])


def similarity_by_rouge(new_span, highlighted_phrase):
    """
    Score how similar a new span is to a highlighted phrase: the rougeL f-measure
    (times 100) between the two, after dropping stop-word and punctuation tokens.
    Uses the module-level `rouge_metric`.
    """

    def content_text(tokens):
        # Space-joined text of the tokens that are neither stop words nor punctuation
        return " ".join(token.text for token in tokens if not (token.is_stop or token.is_punct))

    scores = rouge_metric.compute(
        predictions=[content_text(new_span['span'])],
        references=[content_text(highlighted_phrase['phrase_parsed'])],
    )

    return scores['rougeL'].mid.fmeasure * 100

def keep_max_from_each_row(a):
    """
    Turn a 2D array into a 0/1 integer array that marks, per row, the position(s) of the
    row maximum; entries equal to 0 are never marked.
    See https://stackoverflow.com/questions/20295046/numpy-change-max-in-each-row-to-1-all-other-numbers-to-0
    """

    row_max = a.max(axis=1)
    equals_row_max = a == row_max[:, None]
    # a zero must stay 0 even when it is the maximum of its row (e.g. an all-zero row)
    non_zero = a != 0
    marked = equals_row_max & non_zero
    return marked.astype(int)

def create_idx_per_item(items_and_funcs: List[Tuple[Any, Any]]):
    """
    Assign a running index to every item so the items can address rows/columns of a matrix.

    param items_and_funcs: list of (items, key function) tuples; the key function turns an
        item into a key that can be used in a dictionary
    return: (key -> index, index -> item)
    """

    item_to_idx = {}
    idx_to_item = {}
    for items, make_key in items_and_funcs:
        for item in items:
            # The next index is the number of keys registered so far.
            next_idx = len(item_to_idx)
            idx_to_item[next_idx] = item
            item_to_idx[make_key(item)] = next_idx

    return item_to_idx, idx_to_item
    

def create_clusters_from_new_and_highlighted_spans(highlighted_spans, new_spans):
    """
    New spans are spans that were found in the merged sentence and not in the chosen sentence.
    Highlighted phrases are those that the turker highlighted.
    Create clusters combining the two.

    Every (new span, highlighted phrase) pair is scored with similarity_by_rouge; each new
    span keeps only its best non-zero match, and the connected components of the resulting
    graph are the clusters.

    return: dict of component number -> {"items": [spans / phrases in that component]};
        empty when there are no spans at all
    """

    # The index mapping has to exist before it can be checked for emptiness.
    unique_span_to_idx, idx_to_span = create_idx_per_item([
        (highlighted_spans, create_highlighted_phrase_key),
        (new_spans, create_new_span_key),
    ])

    clusters = {}
    if not unique_span_to_idx:
        return clusters

    size = len(unique_span_to_idx)
    matrix = np.zeros((size, size))

    for new_span, highlighted_phrase in itertools.product(new_spans, highlighted_spans):
        row = unique_span_to_idx[create_new_span_key(new_span)]
        col = unique_span_to_idx[create_highlighted_phrase_key(highlighted_phrase)]
        matrix[row, col] = similarity_by_rouge(new_span, highlighted_phrase)

    # Some words (e.g., miles) can show in multiple spans, but we want to match the span to the place it belongs best (max score)
    matrix = keep_max_from_each_row(matrix)
    n_components, labels = connected_components(csgraph=matrix, directed=False, return_labels=True)

    for component in range(n_components):
        clusters[component] = {
            "items": [idx_to_span[i] for i, label in enumerate(labels) if label == component]
        }

    return clusters
    

def align_new_and_highlighted_phrases(chosen_sent, merged_sent, highlighted_spans):
    """
    Clusters new spans and the highlighted spans.

    param chosen_sent: the sentence the merge started from
    param merged_sent: the merged sentence
    param highlighted_spans: list of dicts with a 'phrase' entry; the list is not modified
    return: the clusters built by create_clusters_from_new_and_highlighted_spans
    """

    # Work on a copy: the phrases are annotated below and the caller's dicts must stay untouched.
    highlighted_spans = deepcopy(highlighted_spans)

    new_spans = find_new_spans(undo_tokenization(merged_sent), undo_tokenization(chosen_sent))

    # create unique ids to each highlighted phrase for matrix
    for i, highlighted_phrase in enumerate(highlighted_spans):
        highlighted_phrase['type'] = 'highlighted_phrase'
        highlighted_phrase['id'] = i
        highlighted_phrase['phrase_parsed'] = nlp(undo_tokenization(highlighted_phrase['phrase']))

    return create_clusters_from_new_and_highlighted_spans(highlighted_spans, new_spans)


def test_align_new_and_highlighted_phrases__by_word():
    """
    In this test, even a comparison by word will work because there is no overlap in spans in word
    """

    # Imported here because the notebook only imports pprint in a later cell.
    from pprint import pprint

    sent2_example = "The quake occurred at 2 : 09 a . m . about 14 miles north - northeast of Healdsburg and had a depth of 1 . 2 miles ."
    merged_sent_example = "A 4.6 magnitude earthquake occurred at 2 : 09 a . m . in Northern California overnight Thursday, centered about 14 miles north - northeast of Healdsburg at a depth of 1 . 2 miles, shaking residents from San Francisco to north of Sacramento."
    highlighted_phrases_example = [{'phrase': '4 . 6 - magnitude', 'start': 2, 'end': 19, 'sentenceId': 1}, {'phrase': 'Northern California overnight Thursday', 'start': 39, 'end': 77, 'sentenceId': 1}, {'phrase': 'from San Francisco to north of Sacramento ', 'start': 98, 'end': 140, 'sentenceId': 1}]

    result = align_new_and_highlighted_phrases(sent2_example, merged_sent_example, highlighted_phrases_example)
    pprint(result)
    assert len(result) == 3

# NOTE: the alignment scores spans with `rouge_metric`, which is only created in the
# "Calculate turker's score" cell, so that cell has to run before this call succeeds.
test_align_new_and_highlighted_phrases__by_word()

def test_align_new_and_highlighted_phrases__by_rouge():
    """
    In this test, there is overlap in spans in word and punctuation
    """

    # Imported here because the notebook only imports pprint in a later cell.
    from pprint import pprint

    # The trailing tab of the original fixture is kept, written as an explicit escape.
    sent1_example = "A 4 . 6 - magnitude earthquake rattled Northern California overnight Thursday , shaking residents from San Francisco to north of Sacramento .\t"
    highlighted_phrases_example = [{"phrase":"at 2 : 09 a . m","type":"new"},{"phrase":"about 14 miles north - northeast of Healdsburg","type":"new"},{"phrase":"had a depth of 1 . 2 miles ","type":"new"}]
    merged_sent_example = "A 4 . 6 - magnitude earthquake, with a depth of 1.2 miles, rattled Northern California overnight at 2 : 09 a . m . on Thursday, about 14 miles north - northeast of Healdsburg , shaking residents from San Francisco to north of Sacramento ."

    result = align_new_and_highlighted_phrases(sent1_example, merged_sent_example, highlighted_phrases_example)
    pprint(result)
    assert len(result) == 3

# test_align_new_and_highlighted_phrases__by_rouge()

### Calculate turker's score

In [None]:
from dataclasses import dataclass
from dataclasses_json import dataclass_json
from datasets import load_dataset, load_metric
import json
from pprint import pprint

# ROUGE scorer shared by similarity_by_rouge and by the comparison helpers in this cell.
rouge_metric = load_metric("rouge")
ROUGE_SCORES = ["rouge1", "rouge2", "rougeL"]

# One dict per (example, turker) pair; compare_merged_row appends to it.
comparison_results = []
# Number of turker answer groups per row (columns suffixed _1 .. _num_turkers).
num_turkers = 3

# Variables to control the different algorithms tried (otherwise it takes a long time to run)
align_new_and_highlighted_clusters = False
compare_master_merge_to_turker_merge = False
# This flag must not share its name with the compare_entailing_to_entailed function below:
# the `def` would overwrite the flag and the check would always be truthy.
should_compare_entailing_to_entailed = True


def remove_overlapping_words(text_to_remove_from, text_to_check_for_overlap):
    """Return the set of space-separated words of the first text that do not occur in the second."""
    return set(text_to_remove_from.split(" ")).difference(text_to_check_for_overlap.split(" "))


def get_sentence_from_row(row, sentence_id) -> str:
    """Return row['sentence1Text'] when sentence_id equals 1, otherwise row['sentence2Text']."""
    return row['sentence1Text'] if sentence_id == 1 else row['sentence2Text']


def compare_entailing_to_entailed(entailing_sent, entailed_sents: List[str], prefix: str):
    """
    We would like to check the entailing sentence (e.g., merged sentence)
    1. entails both base sentences.
    2. does not hallucinate information not in the source sentence.

    param entailing_sent: the sentence that should entail the others
    param entailed_sents: the base sentences
    param prefix: prepended to every key of the result (e.g. "master" or "turker")
    return: dict with "<prefix>_<rouge key>_precision_hallucinating" (precision against the
        joined base sentences) and "<prefix>_<rouge key>_recall_entailing" (recall averaged
        over the base sentences)
    """

    recall_per_sentence = []
    for entailed_sent in entailed_sents:
        rouge_results = rouge_metric.compute(predictions=[entailing_sent], references=[entailed_sent])
        recall_per_sentence.append(
            {f"{prefix}_{key}_recall_entailing": value.mid.recall for key, value in rouge_results.items()}
        )

    recall_results = pd.DataFrame(recall_per_sentence).mean().to_dict()

    # For calculating precision we want to join the two base sentences
    rouge_results = rouge_metric.compute(predictions=[entailing_sent], references=[" ".join(entailed_sents)])
    precision_results = {f"{prefix}_{key}_precision_hallucinating": value.mid.precision for key, value in rouge_results.items()}

    return {**precision_results, **recall_results}


def _parse_highlighted_phrases(raw_value):
    """Parse a highlightedPhrases cell; anything that is not a string (e.g. NaN) gives []."""
    if not isinstance(raw_value, str):
        return []
    # SECURITY: eval() executes the cell content; only use on trusted CSV files.
    # NOTE(review): ast.literal_eval / json.loads would be safer if the format allows — confirm.
    return eval(raw_value)


def _count_uniform_clusters(clusters, highlighted: bool) -> int:
    """Count clusters whose items are all highlighted phrases (True) or all not (False)."""
    return sum(
        1
        for cluster in clusters.values()
        if all((span.get('type') == 'highlighted_phrase') == highlighted for span in cluster['items'])
    )


def compare_merged_row(merged_row):
    """
    Compare the master annotation of one example with each turker's annotation.

    Which comparisons run is controlled by the module-level flags
    align_new_and_highlighted_clusters, compare_master_merge_to_turker_merge and
    should_compare_entailing_to_entailed. One result dict per turker is appended to the
    module-level `comparison_results`; nothing is returned.

    param merged_row: a row holding the master columns (mergedText, chosenSentenceId,
        highlightedPhrases), the shared sentence1Text / sentence2Text / exampleId and the
        per-turker columns suffixed _1 .. _num_turkers
    """
    base_sentences = [merged_row['sentence1Text'], merged_row['sentence2Text']]

    master_merge = merged_row['mergedText'] if isinstance(merged_row['mergedText'], str) else ""
    master_chosen_sentence_id = merged_row['chosenSentenceId']
    master_highlighted_phrases = _parse_highlighted_phrases(merged_row['highlightedPhrases'])
    master_orig = get_sentence_from_row(merged_row, master_chosen_sentence_id)
    filtered_master_merge = remove_overlapping_words(master_merge, master_orig)

    master_clusters = None
    if align_new_and_highlighted_clusters:
        master_clusters = align_new_and_highlighted_phrases(master_orig, master_merge, master_highlighted_phrases)

    master_entailing_results = {}
    if should_compare_entailing_to_entailed:
        master_entailing_results = compare_entailing_to_entailed(master_merge, base_sentences, prefix="master")

    for turker_number in range(1, num_turkers + 1):
        # Check the turker's own column: a missing turker merge (NaN) becomes "".
        raw_turker_merge = merged_row[f'mergedText_{turker_number}']
        turker_merge = raw_turker_merge if isinstance(raw_turker_merge, str) else ""
        turker_chosen_sentence_id = merged_row[f'chosenSentenceId_{turker_number}']
        turker_highlighted_phrases = _parse_highlighted_phrases(merged_row[f'highlightedPhrases_{turker_number}'])
        turker_orig = get_sentence_from_row(merged_row, turker_chosen_sentence_id)
        filtered_turker_merge = remove_overlapping_words(turker_merge, turker_orig)

        align_new_and_highlighted_clusters_results = {}
        if align_new_and_highlighted_clusters:
            turker_clusters = align_new_and_highlighted_phrases(turker_orig, turker_merge, turker_highlighted_phrases)

            align_new_and_highlighted_clusters_results = {
                "master_clusters": master_clusters,
                "turker_clusters": turker_clusters,
                "len_cluster_diff": len(master_clusters) - len(turker_clusters),
                "master_highlighted_not_merged": _count_uniform_clusters(master_clusters, highlighted=True),
                "master_merged_not_highlighted": _count_uniform_clusters(master_clusters, highlighted=False),
                "turker_highlighted_not_merged": _count_uniform_clusters(turker_clusters, highlighted=True),
                "turker_merged_not_highlighted": _count_uniform_clusters(turker_clusters, highlighted=False),
                "turker_num_clusters": len(turker_clusters)
            }

        compare_master_merge_to_turker_merge_results = {}
        if compare_master_merge_to_turker_merge:
            # Compare only the words each merge added to its chosen sentence by rouge
            # NOTE(review): the filtered merges are sets of words, not strings — confirm
            # that rouge_metric.compute accepts them.
            rouge_results_filtered = rouge_metric.compute(predictions=[filtered_master_merge], references=[filtered_turker_merge])
            rouge_results_filtered = {f"{key}_filtered": value.mid.fmeasure * 100 for key, value in rouge_results_filtered.items()}

            # Compare the final merged sentences by rouge
            rouge_results = rouge_metric.compute(predictions=[master_merge], references=[turker_merge])
            rouge_results = {key: value.mid.fmeasure * 100 for key, value in rouge_results.items()}

            compare_master_merge_to_turker_merge_results = {**rouge_results_filtered, **rouge_results}

        turker_entailing_results = {}
        if should_compare_entailing_to_entailed:
            turker_entailing_results = compare_entailing_to_entailed(turker_merge, base_sentences, prefix="turker")

            # Compare turker results to master results (master value minus turker value)
            turker_compared_to_master_entailing_results = {}
            for turker_entailing_key, turker_value in turker_entailing_results.items():
                key_without_prefix = "_".join(turker_entailing_key.split('_')[1:])
                master_value = master_entailing_results[f"master_{key_without_prefix}"]
                turker_compared_to_master_entailing_results[f"compare_{key_without_prefix}"] = master_value - turker_value

            turker_entailing_results = {**turker_compared_to_master_entailing_results, **turker_entailing_results}

        comparison_result = {
            "example_id": merged_row['exampleId'],
            "sentence1Text": merged_row['sentence1Text'],
            "sentence2Text": merged_row['sentence2Text'],
            # get existing score if already started scoring
            "turker_score": merged_row.get(f'turker score {turker_number}'),
            "turker_feedback": merged_row.get(f'turker feedback {turker_number}'),
            "turker_chosen_sentence_id": turker_chosen_sentence_id,
            "turker_highlighted_phrases": turker_highlighted_phrases,
            "turker_merge": turker_merge,
            "master_chosen_sentence_id": master_chosen_sentence_id,
            "master_highlighted_phrases": master_highlighted_phrases,
            "master_merge": master_merge,
            "filtered_master_merge": filtered_master_merge,
            "filtered_turker_merge": filtered_turker_merge,
        }

        # Later dicts win on duplicate keys, so the base fields above take precedence.
        comparison_results.append({
            **turker_entailing_results,
            **master_entailing_results,
            **compare_master_merge_to_turker_merge_results,
            **align_new_and_highlighted_clusters_results,
            **comparison_result,
        })
        
# Join the turker answers with the master set on exampleId; the suffixes are applied to
# column names that exist in both frames.
merged_df = pd.merge(turkers_df, master_df, left_on="exampleId", right_on="exampleId", suffixes=("_turkers", "_master"))

# Filter specific (for testing)
# merged_df = merged_df[merged_df['exampleId'] == "10_17ecbplus_2__10_2ecbplus_2"]
# merged_df = merged_df.iloc[:1]
# merged_df = merged_df.iloc[1:2]

# Run: apply is used for its side effect — compare_merged_row appends to comparison_results.
merged_df.apply(compare_merged_row, axis=1)

In [None]:
# Show every comparison result. Built from `comparison_results` directly, because the
# shuffled copy and its size are only defined in a later cell and do not exist yet on a
# fresh top-to-bottom run.
pd.DataFrame(comparison_results)

### Filter turkers

In [None]:
import random
import matplotlib.pyplot as plt
from collections import Counter

# Shuffle a copy with a fixed seed so that taking a subset (see size_to_fit) is reproducible.
comparison_results_copy = comparison_results.copy()
random.Random(42).shuffle(comparison_results_copy)
size_to_fit = len(comparison_results_copy) # * 0.8


comparison_df = pd.DataFrame(comparison_results_copy[:int(size_to_fit)])
# comparison_df = comparison_df.dropna(subset='turker_score')


# Column that is plotted against the turker score and thresholded below.
# metric = 'rouge1'
# metric = 'turker_rouge1_precision_hallucinating'
# metric = 'master_rouge1_precision_hallucinating'
# metric = 'turker_rouge1_recall_entailing'
# metric = 'master_rouge1_recall_entailing'
# metric = 'compare_rouge1_precision_hallucinating'
metric = 'compare_rouge1_recall_entailing'

# Rows whose absolute metric value is at most this threshold are kept for inspection.
METRIC_THRESHOLD = 0.1

# should_filter_different_chosen_sentence = True
should_filter_different_chosen_sentence = False
# should_filter_empty_merge_both_sides = True
should_filter_empty_merge_both_sides = False

if should_filter_different_chosen_sentence:
    comparison_df = comparison_df[comparison_df['turker_chosen_sentence_id'] == comparison_df['master_chosen_sentence_id']]

if should_filter_empty_merge_both_sides:
    comparison_df = comparison_df[(comparison_df['filtered_master_merge'] != comparison_df['filtered_turker_merge']) & (comparison_df['filtered_turker_merge'] == "")]

# plot: marker size grows with the number of rows that share the same (metric, score) point
x = comparison_df[metric]
y = comparison_df['turker_score']
c = Counter(zip(x,y))
s = [10*c[(xx,yy)] for xx,yy in zip(x,y)]
plt.scatter(x, y, s=s)
plt.xlabel(metric)
plt.ylabel('turker_score')
plt.show()

print(comparison_df.shape)
print(comparison_df['turker_score'].value_counts())

# .copy() so that the column assignments below modify an independent frame instead of a
# slice of the unfiltered one (avoids SettingWithCopyWarning).
comparison_df = comparison_df[abs(comparison_df[metric]) <= METRIC_THRESHOLD].copy()
# comparison_df = comparison_df[abs(comparison_df[metric]) > METRIC_THRESHOLD]
# comparison_df = comparison_df[comparison_df['turker_score'] == 1.0]

print(comparison_df.shape)
print(comparison_df['turker_score'].value_counts())


# comparison_df[['example_id', 'turker_score', metric, 'turker_feedback', 'len_cluster_diff', 'turker_clusters', 'turker_merge', 'filtered_turker_merge', 'master_clusters', 'master_merge', 'filtered_master_merge']]
# The filtered merges are sets of words; sort them so they are displayed in a stable order.
comparison_df['filtered_turker_merge'] = comparison_df['filtered_turker_merge'].apply(sorted)
comparison_df['filtered_master_merge'] = comparison_df['filtered_master_merge'].apply(sorted)
comparison_df[['example_id', 'turker_score', metric, 'turker_feedback', 'turker_merge', 'turker_chosen_sentence_id', 'filtered_turker_merge', 'master_merge', 'filtered_master_merge', 'turker_highlighted_phrases']].sort_values('example_id')



In [None]:
def create_cluster_key(cluster_key_and_value: Tuple[int, dict], prefix: str):
    """Build a dictionary key "<prefix>_<cluster key>" from a (cluster key, cluster) tuple."""
    cluster_key = cluster_key_and_value[0]
    return "{}_{}".format(prefix, cluster_key)

def create_cluster_key_master(cluster_key_and_value: Tuple[int, dict]):
    """Key of a master cluster: create_cluster_key with the "master" prefix."""
    return create_cluster_key(cluster_key_and_value, prefix="master")

def create_cluster_key_turker(cluster_key_and_value: Tuple[int, dict]):
    """Key of a turker cluster: create_cluster_key with the "turker" prefix."""
    return create_cluster_key(cluster_key_and_value, prefix="turker")

def similarity_by_highlighted_phrases(cluster_one, cluster_two):
    """
    Compare clusters by the highlighted phrases, which should be consistent if they chose the same start sentence.

    Each argument is a (cluster key, cluster) tuple; the cluster dict has a
    'chosen_sentence_id' and an 'items' list, and items with a string 'phrase' entry count
    as highlighted phrases.

    NOTE(review): the original body stopped at an unfinished `if cluster_one` (a syntax
    error). The score below — percentage of shared lower-cased words among the highlighted
    phrases of both clusters — is a provisional completion; confirm the intended measure.

    return: -100 when the clusters started from different sentences, 0.0 when either
        cluster has no highlighted phrase, otherwise the word overlap in the range (0, 100]
    """

    def highlighted_words(cluster):
        # Lower-cased, whitespace-separated words of all highlighted phrases of the cluster
        words = set()
        for item in cluster[1]['items']:
            phrase = item.get('phrase')
            if isinstance(phrase, str):
                words.update(phrase.lower().split())
        return words

    if cluster_one[1]['chosen_sentence_id'] != cluster_two[1]['chosen_sentence_id']:
        # probably should default to the rouge of the two sentences
        return -100  # TODO: change

    words_one = highlighted_words(cluster_one)
    words_two = highlighted_words(cluster_two)
    if not words_one or not words_two:
        # Nothing highlighted on one side: no evidence that the clusters belong together.
        return 0.0

    return 100.0 * len(words_one & words_two) / len(words_one | words_two)

def create_clusters_from_clusters(master_clusters, turker_clusters):
    """
    A good turker merge, although having the same new spans and highlighted phrases, might have a different number of clusters than the master.
    For example, master highlighted separately the phrases and turker highlighted continuously.
    We want to create a joint cluster so we can properly compare if the turker missed anything or not.

    Both arguments are sequences of (cluster key, cluster) tuples. Every master/turker pair
    is scored with similarity_by_highlighted_phrases, each row keeps only its best non-zero
    match, and the connected components of that graph are returned as
    {component number: {"items": [(cluster key, cluster), ...]}}.
    """

    cluster_to_idx, idx_to_cluster = create_idx_per_item([
        (master_clusters, create_cluster_key_master),
        (turker_clusters, create_cluster_key_turker),
    ])

    if not cluster_to_idx:
        return {}

    size = len(cluster_to_idx)
    matrix = np.zeros((size, size))

    for master_cluster, turker_cluster in itertools.product(master_clusters, turker_clusters):
        row_idx = cluster_to_idx[create_cluster_key_master(master_cluster)]
        col_idx = cluster_to_idx[create_cluster_key_turker(turker_cluster)]
        matrix[row_idx, col_idx] = similarity_by_highlighted_phrases(master_cluster, turker_cluster)

    matrix = keep_max_from_each_row(matrix)
    n_components, labels = connected_components(csgraph=matrix, directed=False, return_labels=True)

    return {
        component: {
            "items": [idx_to_cluster[i] for i, label in enumerate(labels) if label == component]
        }
        for component in range(n_components)
    }

# Take the second row of comparison_df and tag each of its clusters with the sentence it
# started from and with its origin.
# NOTE(review): the *_clusters columns are only produced by compare_merged_row when
# align_new_and_highlighted_clusters is enabled — confirm the flag was on for this run.
row = comparison_df.iloc[1]
for clusters_column, chosen_column, origin in (
    ('master_clusters', 'master_chosen_sentence_id', 'master'),
    ('turker_clusters', 'turker_chosen_sentence_id', 'turker'),
):
    for cluster in row[clusters_column].values():
        cluster['chosen_sentence_id'] = row[chosen_column]
        cluster['type'] = origin

def test_create_clusters_from_clusters():
    """
    In this test, master annotated 3 and turker 1, but these are the same and should result in 1 merged cluster
    """

    from pprint import pprint

    turker_clusters = {0: {'items': [{'span': ["killing", "5", "while", "leaving", "dozens", "injured"], 'id': 0}], 'chosen_sentence_id': 1, 'type': 'turker'}}
    master_clusters = {0: {'items': [{'phrase': 'Five dead', 'type': 'highlighted_phrase', 'id': 0, 'phrase_parsed': "Five dead"}], 'chosen_sentence_id': 1.0, 'type': 'master'}, 1: {'items': [{'phrase': 'dozens injured', 'type': 'highlighted_phrase', 'id': 1, 'phrase_parsed': "dozens injured"}, {'span': ["dozens"], 'id': 1}], 'chosen_sentence_id': 1.0, 'type': 'master'}, 2: {'items': [{'span': ["killing"], 'id': 0}], 'chosen_sentence_id': 1.0, 'type': 'master'}}

    # Use the fixtures defined above (not the global `row`) and pass them in the order of
    # the signature: master clusters first, turker clusters second.
    result = create_clusters_from_clusters(list(master_clusters.items()), list(turker_clusters.items()))
    pprint(result)
    # NOTE(review): whether a single cluster comes out depends on how
    # similarity_by_highlighted_phrases scores these fixtures — confirm.
    assert len(result) == 1

test_create_clusters_from_clusters()