In [None]:
import pandas as pd
import krippendorff
import itertools

# Notebook-wide switches; change these to control how much feedback is printed.
# VERBOSE gates the summary print statements in the cells below.
VERBOSE = True
# DEBUG gates the per-case / per-pair trace printed by the agreement helpers.
DEBUG = False
# Columns of `annotations` for which Krippendorff's alpha and pairwise
# agreement are computed, in the order they appear in the results table.
SELECT_CODES = ["Relevance", "1", "2", "5", "3", "4"]

In [None]:
# Load the final annotation sheet. The merge cell below renames its
# 'Relevant?' column to 'Relevance' and, after combining it with the
# initial-20 sheet, keeps 'For IAA', 'uid', 'Person', 'Certain Codes',
# 'Uncertain Codes' and 'Relevance'.
annotations = pd.read_csv('../data/external/final-annotations.csv')
annotations

In [None]:
# Load the initial-20 sheet; the next cell flags every one of its rows as
# inter-annotator-agreement data and appends it to `annotations`.
initial_20 = pd.read_csv('../data/external/initial-20.csv')
initial_20

In [None]:
# Every initial-20 row counts towards inter-annotator agreement (IAA).
initial_20['For IAA'] = 1
initial_20['initial-20'] = 1

# Harmonise the relevance column name, then stack both sheets.
# NOTE: re-running this cell without re-running the load cell appends the
# initial-20 rows a second time, because `annotations` is overwritten here.
# NOTE(review): only `annotations` is renamed; this assumes initial_20 already
# names the column 'Relevance' - confirm against the CSV.
annotations = annotations.rename(columns={'Relevant?': 'Relevance'})
# ignore_index=True gives the combined frame unique row labels; without it the
# labels of the two sheets collide.
annotations = pd.concat([annotations, initial_20], ignore_index=True)
# .copy() makes the column subset an independent frame, so the assignments
# below (and in later cells) do not write into a derived slice.
annotations = annotations[['For IAA', 'uid', 'Person', 'Certain Codes', 'Uncertain Codes', 'Relevance']].copy()
# Rows whose 'For IAA' value is missing are treated as non-IAA rows.
annotations['For IAA'] = annotations['For IAA'].fillna(0)
missing_relevance = annotations[annotations['Relevance'].isna()]

if VERBOSE:
  print(f"There are {missing_relevance.shape[0]} assignments with missing relevance values.")

# Assignments without a relevance judgement are excluded from everything below.
annotations = annotations.dropna(subset=['Relevance'])
# A "case" is one distinct uid.
num_cases = annotations["uid"].nunique()
annotations

In [None]:
def _parse_codes(raw) -> set:
  """Turn one raw codes cell into a set of code strings.

  The cell is split on commas, dots are removed (so '0.1' becomes '01') and
  surrounding whitespace is stripped. Missing cells give an empty set.
  """
  # Already parsed (the cell was re-run): keep the codes instead of
  # stringifying the set.
  if isinstance(raw, (set, frozenset)):
    return set(raw)
  if pd.isna(raw):
    return set()
  # A numeric cell holding a whole number (3.0) would otherwise become '30'
  # once the dot is removed; convert it to its integer form first.
  if isinstance(raw, float) and raw.is_integer():
    raw = int(raw)
  tokens = (token.strip() for token in str(raw).replace('.', '').split(','))
  # Drop empty tokens produced by trailing or doubled commas.
  return {token for token in tokens if token}


for col in ['Certain Codes', 'Uncertain Codes']:
  annotations[col] = annotations[col].apply(_parse_codes)

# 'codes' is every code an annotator assigned, certain or not.
annotations['codes'] = annotations.apply(
  lambda row: row['Certain Codes'] | row['Uncertain Codes'], axis=1)
annotations

In [None]:
# Every code that appears in at least one annotation, in sorted order.
all_unique_codes = sorted(set.union(*annotations['codes']))

# Add one 0/1 indicator column per code: 1 when the row's code set contains it.
for code in all_unique_codes:
  annotations[code] = [
    1 if code in assigned_codes else 0
    for assigned_codes in annotations['codes']
  ]

# Persist the expanded table for downstream steps.
annotations.to_csv('../data/interim/annotations.csv', index=False)
annotations

In [None]:
# Columns that are not summed per annotator.
excluded_columns = ['uid', 'Certain Codes', 'Uncertain Codes', 'codes', 'Relevance']
summable_columns = annotations.drop(columns=excluded_columns)

# Sum the remaining columns for each annotator and save the table.
total_codes_per_annotator = summable_columns.groupby('Person').sum()
total_codes_per_annotator.to_csv('../data/interim/annotator-codes.csv')
total_codes_per_annotator

In [None]:
# Annotation rows on which the annotator assigned disclosure code '0' or '01'
# ('0.1' once the parsing cell has stripped the dot).
disclosure_mask = (annotations['0'] == 1) | (annotations['01'] == 1)
coded_disclosure = annotations[disclosure_mask]

# Attach the 'DisclosedToSocialMedia' flag from the NVDRS file to each row.
nvdrs = pd.read_csv("../data/raw/nvdrs-youth-restricted.csv")[
    ['uid', 'DisclosedToSocialMedia']]
coded_disclosure = coded_disclosure.merge(nvdrs, on='uid', how='left')

# Several annotators can code the same case, so count every uid once;
# counting rows would inflate both numbers reported below.
disclosure_cases = coded_disclosure.drop_duplicates(subset='uid')
count_coded_disclosure = disclosure_cases.shape[0]
count_provided_dislosure = disclosure_cases['DisclosedToSocialMedia'].sum()
# With no coded case there is no proportion to report; avoid dividing by zero.
proportion_agreement = (
  count_provided_dislosure / count_coded_disclosure
  if count_coded_disclosure else float('nan'))

if VERBOSE:
  print(f"{count_coded_disclosure} cases were coded for disclosure (0 or 0.1) by at least one person.")
  print(f"From these cases, {int(count_provided_dislosure)} were marked for 'DisclosedToSocialMedia'.")
  print(f"{(1 - proportion_agreement) * 100}% of cases that we found to include social media disclosure were not marked for 'DisclosedToSocialMedia'.")

In [None]:
# Rows flagged for inter-annotator agreement, one rating per (annotator, case).
# Computed once because it does not depend on the code being scored.
iaa_annotations = annotations[annotations['For IAA'] == 1].drop_duplicates(
  subset=['Person', 'uid'])

# Collect one record per code and build the table once at the end, instead of
# growing a DataFrame row by row through the private `_append` method.
alpha_records = []

for code in SELECT_CODES:
  # Rows are annotators, columns are cases; a case an annotator did not rate
  # is left as NaN by the pivot.
  reliability_data = iaa_annotations.pivot(
    index='Person', columns='uid', values=code)

  # Alpha needs at least two distinct observed values. A column that is
  # constant (all 0 or all 1) is reported as missing rather than passed on.
  if iaa_annotations[code].nunique() > 1:
    alpha = krippendorff.alpha(
      reliability_data=reliability_data.values.tolist(),
      level_of_measurement='nominal')
  else:
    if VERBOSE:
      print(
        f"Code {code} does not have enough data to calculate Krippendorff's Alpha.")
    alpha = pd.NA

  alpha_records.append({'Code': code, "Krippendorff's Alpha": alpha})

unique_codes = pd.DataFrame(
  alpha_records, columns=['Code', "Krippendorff's Alpha"])

unique_codes

In [None]:
def count_agreements(case: pd.DataFrame, code: str, majority: bool = False, debug=None):
  """Compare the ratings that different annotators gave one case for one code.

  Args:
    case: The annotation rows of a single case. Must contain a 'Person'
      column and a column named `code`.
    code: Name of the column holding the rating to compare.
    majority: When True, return the majority vote instead of pair counts.
    debug: Print a trace of the comparison. Defaults to the notebook-level
      DEBUG switch when left as None.

  Returns:
    With `majority=False`, a tuple `(agreed, disagreed)` counting the
    unordered pairs of rows whose ratings are equal / different.
    With `majority=True`, 1 if the sum of the `code` column exceeds the
    number of remaining rows, -1 if it is smaller and 0 on a tie.
  """
  if debug is None:
    debug = DEBUG

  case = case.reset_index(drop=True)

  if debug:
    print()
    print(case)

  if majority:
    # The vote only needs the totals, so the pairwise comparison is skipped.
    count_true = case[code].sum()
    count_false = case.shape[0] - count_true
    if count_true == count_false:
      return 0
    return 1 if count_true > count_false else -1

  # Pull the two needed columns out once instead of doing a row lookup per pair.
  ratings = list(zip(case['Person'], case[code]))

  if debug:
    print("Combinations: ", list(itertools.combinations(range(len(ratings)), 2)))

  agreed = 0
  disagreed = 0
  for (person1, val1), (person2, val2) in itertools.combinations(ratings, 2):
    if val1 == val2:
      if debug:
        print(f"{person1} agrees with {person2} ({val1},{val2})")
      agreed += 1
    else:
      if debug:
        print(f"{person1} disagrees with {person2} ({val1},{val2})")
      disagreed += 1

  return agreed, disagreed


def prop_pairwise_agreement(annotations: pd.DataFrame, code: str):
  """Proportion of annotator pairs that agree on `code` across the IAA cases.

  Args:
    annotations: Annotation rows with 'For IAA', 'uid', 'Person' and `code`
      columns.
    code: Name of the column holding the rating to compare.

  Returns:
    agreed / (agreed + disagreed), summed over all cases (uids) flagged with
    'For IAA' == 1. Returns pd.NA when the code column sums to zero over
    those rows, or when no case has two ratings to compare.
  """
  iaa_rows = annotations[annotations['For IAA'] == 1]
  # Keep one rating per annotator per case, as the Krippendorff's alpha
  # computation does; a duplicated row would otherwise be compared with itself
  # and counted as an agreement.
  cases = iaa_rows[['uid', 'Person', code]].drop_duplicates(
    subset=['Person', 'uid'])

  if cases[code].sum() == 0:
    return pd.NA

  agreed = 0
  disagreed = 0

  for uid, case in cases.groupby('uid'):
    case_agreed, case_disagreed = count_agreements(case, code)

    if DEBUG:
      print(f"For code {code}, case {uid} has {case_agreed} agreements and {case_disagreed} disagreements.")

    agreed += case_agreed
    disagreed += case_disagreed

  if VERBOSE:
    print(f"Code {code} has {agreed} agreements and {disagreed} disagreements.")

  total_pairs = agreed + disagreed
  # Every case had a single rating: nothing to compare, so no proportion.
  if total_pairs == 0:
    return pd.NA

  return agreed / total_pairs


# Empty list; nothing in this cell fills it.
tied_cases = []

# Score every code already listed in the alpha table.
pairwise_scores = unique_codes['Code'].map(
  lambda selected_code: prop_pairwise_agreement(annotations, selected_code))
unique_codes['Pairwise Agreement'] = pairwise_scores

# The 'Count' column is currently disabled:
# unique_codes['Count'] = unique_codes['Code'].apply(
  # lambda code: annotations[code].sum())

if VERBOSE:
  print("Number of cases that were annotated:", num_cases)

# The 'Prevalence' column is currently disabled (it depends on 'Count'):
# unique_codes['Prevalence'] = unique_codes['Count'] / num_cases

# Save the per-code agreement table.
unique_codes.to_csv("../data/processed/codes.csv", index=False)
unique_codes