# Analyze scenarios from two raters

This notebook parses scenarios annotated using a pre-agreed coding frame to compute the inter-rater agreement Kappa statistic for above-chance agreement, and to review agreements and disagreements.

The notebook also creates a file that includes the labeled words to identify code mismatches between the two raters.

In [3]:
%pip install spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from dataclasses import dataclass
from lib_analysis import read_raw_sample

read_raw_sample("../datasets/scenarios1.json", 10, "../datasets/sample3.txt")

In [4]:
from lib_analysis import read_and_parse_data, is_consistent
from lib_analysis import read_data


data1 = read_and_parse_data('../datasets/sample1-TH.json')
data2 = read_and_parse_data('../datasets/sample1-vk-reann.json')

In [12]:
# extract all square brackets and write to excel
import re
# print(data1['MAS-G-0027'])
# print(type(data1))
# data1 = read_and_parse_data('../datasets/sample1-TH.json')
# data2 = read_and_parse_data('../datasets/sample2-TH.txt')
# data3 = read_and_parse_data('../datasets/sample3-TH.txt')
# data = data1 | data2 | data3
data = read_and_parse_data('../datasets/scenarios2_ex-TH.txt')
print(len(data))
phrases = []
for key, value in data.items():
    text = value['text']
    # find and extract all patterns in square brackets
    # phrases.append(re.findall(r'\[(.*?)\]', text))
    phrases += re.findall(r'\[(.*?)\]', text)
# write to excel
import pandas as pd
df = pd.DataFrame(phrases)
df.to_excel('scenario2.xlsx', index=False)


10


In [2]:
from lib_analysis import read_and_parse_data, is_consistent
from lib_analysis import read_data
data1 = read_and_parse_data('../datasets/scenarios1_ex-TH.txt')
print(data1['MAS-G-0040'])

{'scenario_id': 'MAS-G-0040', 'text': "This is the Reddit Is Fun mobile app for Reddit. The page is from the [subreddit] r/all where all the [hot posts of the day] are posted. My goal is to either get the [most important news] or [funny content] for the day or hour. On r/all, it would be for the day. If I switch the tab to top, there's [timeframes] that I can pick. I will usually pick 1 hour which will show me the [top content within the hour]. If I don't have time, I only scroll r/all. \n \n \n I can skim the [titles] for the ones I'm interested in reading further details about. Sometimes there may be a lot of [news] if it's a newsworthy day, and sometimes there isn't much [news] and it will be full of [entertainment posts]. I do this to keep up with [news] in one place while also being entertained. It's nice to just have one place to go to instead of a miriad of sites.\n \n", 'clean_text': "This is the Reddit Is Fun mobile app for Reddit. The page is from the subreddit r/all where al

In [26]:
import json, os
f1, f2 = open("../datasets/scenarios2.json",'r'), open("../scenarios2_new.json",'r')
s1, s2= json.load(f1), json.load(f2)
exist_set = []
for k in s1:
    exist_set.append(k['text'])
to_write = ""
id_prefix = "MAS-P-"
id_num = 51
for k in s2:
    if k['text'] not in exist_set:
        to_write += k['app_url'] + "\n" + id_prefix + f'{id_num:04}' + ": " + k['text'] + "\n\n"
        id_num += 1
if not os.path.exists('../scenarios2_new.txt'):
    with open('../scenarios2_new.txt', 'w') as f:
        f.write(to_write)
print(exist_set)
# import random, json, os
# def split_sample(filename, sample_size, dest, exclude_list):
#     data = ""
#     f = open(filename, 'r')
#     sample_list = json.load(f)
#     for sample in sample_list:
#         if sample['scenario_id'] not in exclude_list:
#             data += sample["app_url"] + "\n" + sample['scenario_id'] + ": " + sample["text"] + "\n\n"
#     if not os.path.exists(dest):
#         with open(dest, 'w') as f:
#             f.write(data)
# split_sample("../scenarios2_new.json", 103, "../scenarios2_new.txt", [])
# split_sample("../datasets/scenarios1.json", 100, "../datasets/scenarios1_ex.txt", sample)
# split_sample("../datasets/scenarios2.json", 50, "../datasets/scenarios2_ex.txt", sample)


["I purchase products and will sometimes leave reviews for them on this page, so some of the items I have purchased are on here and what I thought about them along with my name. They can click on my profile and see other things that I have bought and things that I rate good or bad. I share some personal information on these reviews like my height and weight ETC because some products arent accurate with this and I want other people to know that someone with similar build can use the product or not. Like for example there's an ab machine that I only purchased because someone had a similar build to me said that it worked fine with them, taller people need different machines because they have longer bodies. So I just want people to know how I feel about the products and if I have similar situations to them.", 'I use this screen to view past grocery orders I\'ve made and view/change my account settings. To get to this screen, I open the Instacart app on my phone. From the home page of the a

In [2]:
is_consistent(data1, data2)

Scenario IDs matched.


In [11]:
from sklearn.metrics import cohen_kappa_score
import csv

scenario_ids = list(data1.keys())
all_codes1 = [c for d in data1.values() for c in d['codes']]
all_codes2 = [c for d in data2.values() for c in d['codes']]
x = 0
for c in all_codes1:
    if c == 'b-i':
        x += 1
print(x)
# uncomment to compute kappa on non-BIO code format
#all_codes1 = ['o' if len(c) == 1 else c[2:] for d in data1.values() for c in d['codes']]
#all_codes2 = ['o' if len(c) == 1 else c[2:] for d in data2.values() for c in d['codes']]

kappa = cohen_kappa_score(all_codes1, all_codes2)
print('Cohen\'s Kappa, All Codes: %0.4f' % kappa)

# write the words and simplified codes for both datasets
# simplified codes: the b/i prefixes are removed
with open('coded_data.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['word','rater1','rater2'])
    for scenario_id in scenario_ids:
        words = data1[scenario_id]['words']
        codes1 = ['o' if len(c) == 1 else c[2:] for c in data1[scenario_id]['codes']]
        codes2 = ['o' if len(c) == 1 else c[2:] for c in data2[scenario_id]['codes']]
        for i in range(len(words)):
            writer.writerow([words[i], codes1[i], codes2[i]])

99
Cohen's Kappa, All Codes: 0.7120


In [None]:
# count code distributions
possible_codes = sorted(list(set(all_codes1).union(set(all_codes2))))
tally = {'r1': {c:0 for c in possible_codes},
        'r2': {c:0 for c in possible_codes}}
for c in all_codes1:
    tally['r1'][c] += 1
for c in all_codes2:
    tally['r2'][c] += 1
print('\tRater1\tRater2')
for c in possible_codes:
    print('%s\t%s\t%s' % (c, tally['r1'][c], tally['r2'][c]))

In [4]:
# use simplified codes
flow_only1 = ['o' if len(c) == 1 else c[2:] for d in data1.values() for c in d['codes']]
flow_only1 = [c if c != 'i' else 'o' for c in all_codes1]

flow_only2 = ['o' if len(c) == 1 else c[2:] for d in data2.values() for c in d['codes']]
flow_only2 = [c if c != 'i' else 'o' for c in all_codes2]

kappa = cohen_kappa_score(flow_only1, flow_only2)
print('Cohen\'s Kappa, Flow, Only: %0.4f' % kappa)

Cohen's Kappa, Flow, Only: 0.6086


In [None]:
# index information types into tuples: i, j, score, phrase                      
def index_infotype(data):
    info = []
    phrase = []
    j = -1
    for i, (word, code) in enumerate(zip(data['words'], data['codes'])):
        if code == 'b-i':
            phrase = [word]
            j = i
        elif code == 'i-i':
            phrase.append(word)
        elif code == 'o' and j >= 0:
            info.append((j, j+len(phrase), data['scores'][len(info)], ' '.join(phrase)))
            phrase = []
            j = -1
    return info

# identify risk scores for overlapping information types
def overlaps(i1, j1, i2, j2):
    return len(set(range(i1, j1)).intersection(set(range(i2, j2)))) > 0

def find_overlaps(info1, info2):
    overlap = []
    for i1, j1, score1, phrase1 in info1:
        for i2, j2, score2, phrase2 in info2:
            if overlaps(i1, j1, i2, j2):
                overlap.append([(score1, phrase1), (score2, phrase2)])
    return overlap

agreed = 0
disagreed = 0
for scenario_id in data1.keys():
    info1 = index_infotype(data1[scenario_id])
    info2 = index_infotype(data2[scenario_id])
    overlap = find_overlaps(info1, info2)

    for i, ((s1, p1), (s2, p2)) in enumerate(overlap):
        print('\n%s, match %i: score %i, %s' % (scenario_id, i, int(s1), p1))
        print('%s, match %i: score %i, %s' % (scenario_id, i, int(s2), p2))
        
    agreed += len(overlap)
    disagreed += len(info1) - len(overlap) + len(info2) - len(overlap)

print('\nAgreed: %i' % agreed)
print('Disagreed: %i' % disagreed)

scores1 = [int(s) for d in data1.values() for s in d['scores']]
scores2 = [int(s) for d in data2.values() for s in d['scores']]  
print('\nScore average for Rater 1: %0.4f' % (sum(scores1) / len(scores1)))
print('Score average for Rater 2: %0.4f' % (sum(scores2) / len(scores2)))

In [None]:
# write the disagreements out to a file for inspection
disagreed = []

# format of data1/data2: sent_id, word_id, code
for x1, x2 in zip(data1, data2):
    if x1[2] != x2[2]:
        # record the sent_id, word_id, word, codes, plus the sentence
        sent = list(sentences[x1[0]])
        sent[x1[1]] = '[' + sent[x1[1]] + ']'
        disagreed.append([
            x1[0], x1[1], x1[2], x2[2], ' '.join(sent)
        ])

with open('disagreements.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['sent_id', 'word_id', 'code1', 'code2', 'sentence'])
    for row in disagreed:
        writer.writerow(row)