In [15]:
#!pip install simpledorff
import simpledorff as sf
import pandas as pd
import json
import numpy as np
from glom import glom
pd.__version__
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.options.mode.chained_assignment = None  # default='warn'

In [16]:
def create_dataframe(json_path, annotator_int):
    # load data using Python JSON module
    with open(json_path,'r') as f:
        data = json.loads(f.read())
    # Flatten data, keep entry ID and person who completed the data
    df_base_list = pd.json_normalize(data, record_path =['annotations', 'result'], meta = [
        'id',
        ['annotations', 'completed_by'],
        ['annotations', 'id'],
    ], record_prefix = '_',
        errors = 'ignore'
                                      )

    # load original data separately
    with open(json_path,'r') as f:
        data = json.loads(f.read())
    # Flatten data, keep entry ID and person who completed the data
    df_original_data = pd.json_normalize(data, max_level=1, meta = ['id'], record_prefix = '_',
        errors = 'ignore'
                                      )

    # Only keep relevant columns from df_original_data
    series_id = df_original_data['id']
    series_data = df_original_data['data.text']
    df_original_data = pd.concat([series_id, series_data], axis=1)


    # Add original data to dataframe
    df_base_list = pd.merge(df_base_list, df_original_data, how='inner', left_on=['id'], right_on=['id'])

    # Only keep relevant columns from df_base_list
    series_datatype = df_base_list['_from_name']
    series_start = df_base_list['_value.start']
    series_end = df_base_list['_value.end']
    series_text = df_base_list['_value.text']
    series_label = df_base_list['_value.labels']
    series_speakerid = df_base_list['_meta.text']
    series_annotator = df_base_list['annotations.completed_by']
    series_id = df_base_list['annotations.id']
    series_original = df_base_list['data.text']
    df_base_list = pd.concat([series_datatype, series_start, series_end, series_text, series_label, 
                              series_speakerid, series_annotator, series_id, series_original], axis=1)

    # Rename remaining columns
    df_base_list.columns = ['data_type', 'start', 'end', 'text', 'label', 'speaker_id', 'annotator', 'id', 'original_data']

    # Anonymise annotator by giving them a number
    df_base_list['annotator'] = annotator_int

    # Remove useless lists in labels and convert to string
    df_base_list['label'] = df_base_list['label'].str[0]
    df_base_list['label'] = df_base_list['label'].astype(str)

    # Separate speakers and speeches
    df_speakers = df_base_list[df_base_list['label'] == "Speaker"]
    df_speeches = df_base_list[df_base_list['label'] != "Speaker"]

    #Clean speakers dataframe by removing unused columns
    df_speakers = df_speakers.drop(['data_type', 'start', 'end', 'label', 'annotator', 'id', 'original_data'], axis=1)
    # Remove useless list in speaker_id
    df_speakers['speaker_id'] = df_speakers['speaker_id'].str[0]

    # Replace NaN values with -1 values, so we can convert the columns to int
    df_speeches['id'] = df_speeches['id'].fillna(-1)
    df_speakers['speaker_id'] = df_speakers['speaker_id'].fillna(-1)

    # Convert both columns to integer so we can merge later
    df_speeches['id'] = df_speeches['id'].astype(int)
    df_speakers['speaker_id'] = df_speakers['speaker_id'].astype(int)

    # Merge speakers and speeches dataframes based on matching IDs
    df_final = pd.merge(df_speeches, df_speakers, how='inner', left_on=['id'], right_on=['speaker_id'])

    # Delete unused columns and rename new ones
    df_final = df_final.drop(['speaker_id_x', 'speaker_id_y'], axis=1)
    df_final.columns = ['data_type', 'start', 'end', 'text', 'label', 'annotator', 'id', 'original_data', 'speaker']

    # Remove speech lines and only keep those with emotions
    df_final = df_final[df_final['data_type'] == 'emotion']

    return df_final

In [17]:
def merge_dataframes(json_path_list):
    dataframes = []
    # Create the dataframes from json_path_list
    for idx, file in enumerate(json_path_list):
        df = create_dataframe(file, idx)
        dataframes.append(df)

    # Concatenate all dataframes into one
    df = pd.concat(dataframes)
    
    # Create rounded length of text to use as a margin of error when grouping
    df['rounded_length'] = (df['end'] - df['start']).round(-1)
    
    # Calculate Krippendorff's alpha
    kripp = sf.calculate_krippendorffs_alpha_for_df(df,experiment_col='rounded_length', annotator_col='annotator', class_col='label')
    print(kripp)
    
    # Group rows by original_data and length of text
    df = df.groupby(['original_data', 'rounded_length'])

    return df

In [20]:
json_list_1 = ['Jsonfiles/ch1_1.json', 'Jsonfiles/ch1_2.json', 'Jsonfiles/ch1_3.json']
json_list_16 = ['Jsonfiles/ch16_1.json', 'Jsonfiles/ch16_2.json', 'Jsonfiles/ch16_3.json']
json_list_17 = ['Jsonfiles/ch17_1.json', 'Jsonfiles/ch17_3.json', 'Jsonfiles/ch17_4.json']
json_list_18 = ['Jsonfiles/ch18_1.json', 'Jsonfiles/ch18_2.json']
json_list_19 = ['Jsonfiles/ch19_1.json', 'Jsonfiles/ch19_2.json', 'Jsonfiles/ch19_3.json', 'Jsonfiles/ch19_4.json']
json_list_20 = ['Jsonfiles/ch20_1.json', 'Jsonfiles/ch20_2.json', 'Jsonfiles/ch20_3.json']
json_list_21 = ['Jsonfiles/ch21_1.json', 'Jsonfiles/ch21_2.json', 'Jsonfiles/ch21_3.json', 'Jsonfiles/ch21_3.json']
json_list_22 = ['Jsonfiles/ch22_1.json', 'Jsonfiles/ch22_2.json', 'Jsonfiles/ch22_3.json']
json_list_23 = ['Jsonfiles/ch23_1.json', 'Jsonfiles/ch23_2.json', 'Jsonfiles/ch23_3.json', 'Jsonfiles/ch23_4.json']
json_list_24 = ['Jsonfiles/ch24_1.json', 'Jsonfiles/ch24_2.json', 'Jsonfiles/ch24_3.json']

chapter_1 = merge_dataframes(json_list_1)
chapter_16 = merge_dataframes(json_list_16)
chapter_17 = merge_dataframes(json_list_17)
chapter_18 = merge_dataframes(json_list_18)
chapter_19 = merge_dataframes(json_list_19)
chapter_20 = merge_dataframes(json_list_20)
# chapter_21 = merge_dataframes(json_list_21)
chapter_22 = merge_dataframes(json_list_22)
chapter_23 = merge_dataframes(json_list_23)
chapter_24 = merge_dataframes(json_list_24)

# use this line to display dataframe   
chapter_17.apply(lambda a: a[:])

chapters_dict = {
    "chapter_1": chapter_1.apply(lambda a: a[:]),
    "chapter_16": chapter_16.apply(lambda a: a[:]),
    "chapter_17": chapter_17.apply(lambda a: a[:]),
    "chapter_18": chapter_18.apply(lambda a: a[:]),
    "chapter_19": chapter_19.apply(lambda a: a[:]),
    "chapter_20": chapter_20.apply(lambda a: a[:]),
    # "chapter_21": chapter_21.apply(lambda a: a[:]),
    "chapter_22": chapter_22.apply(lambda a: a[:]),
    "chapter_23": chapter_23.apply(lambda a: a[:]),
    "chapter_24": chapter_24.apply(lambda a: a[:])
}

# Export dataframes as pickle files
for name, df in chapters_dict.items():
    df.to_pickle("pickled_files/" + name + ".pkl")

0.13510140405616222
0.14919852034525272
0.09013785790031814
0.4722222222222222
0.32815890502420286
0.291497975708502
0.1682926829268293
0.20194884287454318
0.16258351893095768


In [21]:
chapter_17.apply(lambda a: a[:])

Unnamed: 0,data_type,start,end,text,label,annotator,id,original_data,speaker,rounded_length
1,emotion,103,209,"Pendant sa vie , il était plein de douceur pou...",Dégoût,1,81,"— Aias , chefs des Argiens , et toi , Mèrionès...",Ménélaos,110
1,emotion,2,249,"Ainéias , prince des Troiens cuirassés , je vo...",Ardeur,2,39,"— Ainéias , prince des Troiens cuirassés , je ...",Hektôr,250
1,emotion,127,218,"J'irais et je défendrais Patroklos , car , en ...",Tristesse,0,52,"— Phoinix , mon père , vieillard vénérable , p...",Ménélaos,90
2,emotion,2,102,"Aias , chefs des Argiens , et toi , Mèrionès ,...",Colère,1,81,"— Aias , chefs des Argiens , et toi , Mèrionès...",Ménélaos,100
2,emotion,219,340,Mais la vigueur de Hektôr est comme celle du f...,Peur,0,52,"— Phoinix , mon père , vieillard vénérable , p...",Ménélaos,120
4,emotion,2,607,Ô dieux ! le plus insensé comprendrait mainten...,Tristesse,1,77,— Ô dieux ! le plus insensé comprendrait maint...,Télamônien Aias,600
4,emotion,2,67,Ô malheureux ! tu ne songes point à la mort qu...,Tristesse,0,20,— Ô malheureux ! tu ne songes point à la mort ...,Zeus,60
5,emotion,2,335,"Ainéias , comment sauveriez-vous la sainte Ili...",Dégoût,2,25,"— Ainéias , comment sauveriez-vous la sainte I...",Apollôn,330
5,emotion,608,679,Plût aux dieux qu'un de nous annonçât promptem...,Colère,1,77,— Ô dieux ! le plus insensé comprendrait maint...,Télamônien Aias,70
5,emotion,178,300,tu as tué son compagnon si doux et si courageu...,Colère,0,20,— Ô malheureux ! tu ne songes point à la mort ...,Zeus,120
