In [16]:
from lib_analysis import read_and_parse_data, is_consistent
from lib_analysis import read_data

In [17]:
import os, json

# Convert every non-JSON file in samples/ into a JSON file next to it.
# For each scenario the 'clean_text' entry is renamed to 'text' and the
# 'scores' entry is removed before writing.
files = sorted(os.listdir('samples'))  # sorted so the run order is reproducible
for f in files:
    if f.endswith('json'):
        # this cell writes its own .json output into the same directory,
        # so JSON files must not be fed back in as input
        continue
    path = os.path.join('samples', f)
    data = read_and_parse_data(path)
    print(path)
    for scenario_id, scenario in data.items():
        # pop() renames the key in one step (raises KeyError if it is absent,
        # exactly like the original lookup did)
        scenario['text'] = scenario.pop('clean_text')
        del scenario['scores']
    # splitext strips only the final extension, so a dot elsewhere in the
    # path (e.g. in a directory or base name) no longer truncates the name
    newf = os.path.splitext(path)[0] + '.json'
    # context manager guarantees the file is flushed and closed before the
    # next cell reads it back
    with open(newf, 'w') as out:
        json.dump(data, out)

samples/sample3-rater1.txt
samples/sample3-rater2.txt
samples/sample1-rater1.txt
samples/sample1-rater1-reann.xlsx
samples/sample1-rater2.txt
samples/sample2-rater2-reann.txt
samples/sample1-rater2-reann.xlsx
samples/sample2-rater1.txt
samples/sample2-rater2.txt
samples/sample2-rater1-reann.txt


# Calculate Cohen's Kappa for Interrater Reliability

The coding frame development was conducted in three rounds by two raters, who coded three sets of ten scenarios during each round. 

The first round only used a single code 'information' and yielded a Cohen's Kappa of 0.2971. After this round, the raters met and developed the initial coding frame, consisting of three sub-codes: simple, complex and question.

In the second round, the raters used the new coding frame to yield a Cohen's Kappa of 0.5746. After this round, the raters met to discuss differences and to refine their understanding of the sub-codes.

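The third round evaluated their refined understanding and yielded a Cohen's Kappa of 0.7745. (The code below also scores the `sample2-*-reann` files, apparently a re-annotation of the second-round sample, which gives 0.6941; its output therefore lists four values, and the third round's result is printed as "Round 4".)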

In [23]:
from sklearn.metrics import cohen_kappa_score

# Pairs of annotation files (rater 1, rater 2); one Kappa is computed per pair.
files = [
    ['samples/sample1-rater1.json', 'samples/sample1-rater2.json'],
    ['samples/sample2-rater1.json', 'samples/sample2-rater2.json'],
    ['samples/sample2-rater1-reann.json', 'samples/sample2-rater2-reann.json'],
    ['samples/sample3-rater1.json', 'samples/sample3-rater2.json']
]

# for each round of annotations, compute Cohen's Kappa
# NOTE: the counter is named round_num (not `round`) so the builtin round()
# is not shadowed for the rest of the notebook, and it comes from enumerate
# so a skipped pair does not shift the labels of the pairs that follow it.
for round_num, (f1, f2) in enumerate(files, start=1):
    # context manager closes both files even if parsing fails
    with open(f1, 'r') as fh1, open(f2, 'r') as fh2:
        data1 = json.load(fh1)
        data2 = json.load(fh2)

    # check that each dataset contains the same number of scenarios
    scenario_ids1 = list(data1.keys())
    scenario_ids2 = list(data2.keys())
    if len(scenario_ids1) != len(scenario_ids2):
        print('Scenario ID length mismatch: %s, %s' % (f1, f2))
        continue

    # check that each dataset contains the same scenario IDs
    d1 = set(scenario_ids1) - set(scenario_ids2)
    if len(d1) > 0:
        print('Missing IDs from %s: %s' % (f2, d1))
        continue
    d2 = set(scenario_ids2) - set(scenario_ids1)
    if len(d2) > 0:
        print('Missing IDs from %s: %s' % (f1, d2))
        continue

    # compile an ordered list of codes for computing Kappa
    all_codes1 = []
    all_codes2 = []
    misaligned_ids = []
    for scenario_id in scenario_ids1:
        codes1 = data1[scenario_id]['codes']
        codes2 = data2[scenario_id]['codes']
        # Kappa pairs labels by position, so the counts must agree within
        # every scenario; comparing only the totals would let differences
        # in two scenarios cancel out and silently misalign the pairs
        if len(codes1) != len(codes2):
            misaligned_ids.append(scenario_id)
        all_codes1.extend(codes1)
        all_codes2.extend(codes2)

    # verify the number of codes is the same between datasets
    if misaligned_ids:
        print('Code length mismatch: %s, %s' % (f1, f2))
        print('Scenarios with differing code counts: %s' % misaligned_ids)
        continue

    # compute Kappa
    kappa = cohen_kappa_score(all_codes1, all_codes2)
    print('Cohen\'s Kappa Round %i: %0.4f' % (round_num, kappa))

Cohen's Kappa Round 1: 0.2971
Cohen's Kappa Round 2: 0.5746
Cohen's Kappa Round 3: 0.6941
Cohen's Kappa Round 4: 0.7745
