In [1]:
import json
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer

## Analysis of specific stories

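Aggregate the BookNLP mentions of each named character (merging all character ids that refer to the same character) into one record per story.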

In [18]:
# Folder (relative to the working directory) that the cells below list to
# discover the per-story sub-directories of this translation.
translation = 'Kline_translation'

In [9]:
# For every story folder: merge the BookNLP entities that belong to the same
# named character and write the result to <story>/<story>_char_info.json.
for story_name in os.listdir(translation):
    # 'full_text', 'medea' and loose .csv files are not processed here
    if story_name in ('full_text', 'medea') or story_name.endswith('.csv'):
        continue

    curr_dir = translation + '/' + story_name

    # BookNLP output for this story
    with open(curr_dir + '/' + story_name + '.book') as f:
        book = json.load(f)

    # Mapping of character name -> list of BookNLP character ids
    with open(curr_dir + '/' + story_name + '_char_ids.json') as f:
        char_to_idx = json.load(f)

    # Reverse lookup: BookNLP character id -> character name
    idx_to_char = {
        idx: char_name
        for char_name, indices in char_to_idx.items()
        for idx in indices
    }

    # Every named character starts with empty relation lists and no gender
    story_char_info = {
        char_name: {'agent': [], 'patient': [], 'mod': [], 'poss': [], 'gender': ''}
        for char_name in char_to_idx
    }

    for char in book['characters']:
        # Entities whose id is not listed for any named character are ignored
        if char['id'] not in idx_to_char:
            continue
        char_name = idx_to_char[char['id']]
        merged = story_char_info[char_name]

        # Pool the relations of all ids that refer to this character
        for relation in ('agent', 'patient', 'mod', 'poss'):
            merged[relation].extend(char[relation])

        # Keep the first available gender guess among the character's ids
        if merged['gender'] == '' and char['g'] is not None:
            merged['gender'] = char['g']['argmax']

    # Persist the merged record next to the story's other files
    with open(curr_dir + '/' + story_name + '_char_info.json', 'w') as f:
        json.dump(story_char_info, f)

Extend the Agency/Power Frames lexicon with verbs that occur in the stories but are missing from it

In [15]:
# Load the original agency/power lexicon; the cells below read its 'verb' column.
lexicon_path = '../lexicon/FramesAgencyPower/agency_power.csv'
lexicon = pd.read_csv(lexicon_path)

In [20]:
# Lemmatize every lexicon verb so it can be compared with the story verbs.
# lexicon_verbs stays a list with one entry per lexicon row, because a later
# cell writes it back into the 'verb' column.
lemmatizer = WordNetLemmatizer()
lexicon_verbs = lexicon['verb'].tolist()
lexicon_verbs = [lemmatizer.lemmatize(verb, pos='v') for verb in lexicon_verbs]

# Set copy used only for membership tests: checking against the list would
# scan the whole lexicon once for every verb occurrence in every story.
known_verbs = set(lexicon_verbs)

# Collect the lemmatized story verbs that have no lexicon entry
# (duplicates are kept here).
missing_verbs = []
for story_name in os.listdir(translation):
    # same entries skipped as when the *_char_info.json files were written
    if story_name == 'full_text' or story_name == 'medea' or story_name.endswith('.csv'):
        continue

    with open(translation + '/' + story_name + '/' + story_name + '_char_info.json') as f:
        story_char_info = json.load(f)

    # every verb a character is the agent or the patient of
    all_verbs = []
    for char_name in story_char_info:
        all_verbs += [verb['w'] for verb in story_char_info[char_name]['agent']]
        all_verbs += [verb['w'] for verb in story_char_info[char_name]['patient']]

    # lower-case and lemmatize as verbs, matching the lexicon side
    all_verbs = [lemmatizer.lemmatize(verb.lower(), pos='v') for verb in all_verbs]

    # keep the verbs of this story that the lexicon does not know
    missing_verbs += [verb for verb in all_verbs if verb not in known_verbs]

In [36]:
# Store the lemmatized verb forms in the lexicon (lexicon_verbs has one entry per row)
lexicon['verb'] = lexicon_verbs

# One row per distinct missing verb, with blank agency/power annotations.
# The verbs are sorted so that the row order -- and the CSV written from it --
# is identical on every run; iterating a set of strings has no stable order
# across interpreter sessions.
missing_verbs = sorted(set(missing_verbs))
missing_verbs = pd.DataFrame(missing_verbs, columns=['verb'])
missing_verbs['agency'] = ''
missing_verbs['power'] = ''

# Append the new rows below the existing lexicon entries
lexicon = pd.concat([lexicon, missing_verbs], ignore_index=True)

In [38]:
# Save the extended lexicon. The path is handed to pandas so that it opens the
# file itself (UTF-8, newline='') rather than relying on the platform defaults
# of open(), which can yield blank lines between rows on Windows and a
# locale-dependent encoding. The row index is still written as the first
# column, as before.
lexicon.to_csv('../lexicon/FramesAgencyPower/agency_power_MODIFIED.csv')

Compute power and agency scores for selected characters

In [13]:
lemmatizer = WordNetLemmatizer()

# load power agency lexicon
lexicon_path = '../lexicon/FramesAgencyPower/agency_power_MODIFIED.csv'
lexicon = pd.read_csv(lexicon_path)

# verb -> (agency label, power label), built once instead of scanning the
# DataFrame for every verb occurrence. setdefault keeps the first row when a
# verb appears more than once in the lexicon.
verb_labels = {}
for lex_verb, agency_type, power_type in zip(lexicon['verb'], lexicon['agency'], lexicon['power']):
    verb_labels.setdefault(lex_verb, (agency_type, power_type))

# Score contributed by one verb, keyed by its label. Any other label (including
# a blank annotation) contributes 0, but the verb still counts towards the mean.
AGENCY_AS_AGENT = {'agency_pos': 1, 'agency_neg': -1}
POWER_AS_AGENT = {'power_agent': 1, 'power_theme': -1}
# when the character is the patient, the power roles are swapped
POWER_AS_PATIENT = {'power_theme': 1, 'power_agent': -1}


def _mean_score(total, count):
    """Return total / count, or NaN when no verb was counted (avoids ZeroDivisionError)."""
    return total / count if count else float('nan')


# One entry per (story, character), keyed by a running integer
agency_power_scores = {}
char_idx = 0
for story_name in ['tereus_philomena', 'dis_proserpina_cyane']:
    with open(translation + '/' + story_name + '/' + story_name + '_char_info.json') as f:
        story_char_info = json.load(f)

    for char in story_char_info:
        agency_total = 0
        power_total = 0
        agency_count = 0
        power_count = 0

        # verbs the character is the agent of: contribute to agency and power
        for mention in story_char_info[char]['agent']:
            lemma = lemmatizer.lemmatize(mention['w'].lower(), pos='v')
            if lemma not in verb_labels:
                continue
            agency_type, power_type = verb_labels[lemma]

            agency_total += AGENCY_AS_AGENT.get(agency_type, 0)
            agency_count += 1

            power_total += POWER_AS_AGENT.get(power_type, 0)
            power_count += 1

        # verbs the character is the patient of: contribute to power only
        for mention in story_char_info[char]['patient']:
            lemma = lemmatizer.lemmatize(mention['w'].lower(), pos='v')
            if lemma not in verb_labels:
                continue
            power_type = verb_labels[lemma][1]

            power_total += POWER_AS_PATIENT.get(power_type, 0)
            power_count += 1

        # mean score per counted verb; NaN if the character has no lexicon verbs
        agency_power_scores[char_idx] = {
            'name': char,
            'story': story_name,
            'translation': translation,
            'agency': _mean_score(agency_total, agency_count),
            'power': _mean_score(power_total, power_count)
        }

        char_idx += 1
    

In [14]:
# One row per scored character (dict keys become the row index)
agency_power_df = pd.DataFrame.from_dict(agency_power_scores, orient='index')

# Pass the path so pandas opens the file itself (UTF-8, newline='') instead of
# depending on the platform defaults of open(); the index is still written.
agency_power_df.to_csv('agency_power_scores_more.csv')

## General analysis of the whole text

In [5]:
# Base folder for the whole-text files; NOTE(review): looks like a mounted
# Google Drive location -- adjust to the local environment.
path = 'drive/MyDrive/2022-2023/Ovid_Metamorphoses/'
# Parse the whole-text .book file as JSON; the next cell reads book['characters'].
with open(path + 'ovid.book', 'r') as f:
  book = json.load(f)

In [76]:
# Summarise every character that has at least one proper or common-noun mention
# and a gender guess, keyed by the character's id.
char_info = {}
for char in book['characters']:
  mentions = char['mentions']
  # skip if character is only referred to by pronouns
  if not (mentions['proper'] or mentions['common']):
    continue
  # skip characters without gender information
  if char['g'] is None:
    continue

  # called char_id (not id) so the builtin id() is not shadowed in the kernel
  char_id = char['id']
  # proper-name mentions first, then common-noun mentions
  names = [name['n'] for name in mentions['proper']] + [name['n'] for name in mentions['common']]
  char_info[char_id] = {
      'names': names,
      'gender': char['g']['argmax'],
      'total_agent': len(char['agent']),
      'total_object': len(char['patient']),
      # the first listed mention serves as the display name
      'name': names[0]
  }

In [84]:
# One row per character: transpose the id-keyed dict and move the ids out of
# the row index into a regular 'index' column.
char_df = pd.DataFrame(char_info).T.reset_index()

In [85]:
# Discard characters with fewer than 10 agent + patient relations in total,
# then renumber the remaining rows from 0.
too_few_relations = (char_df.total_agent + char_df.total_object) < 10
char_df = char_df[~too_few_relations].reset_index(drop=True)

In [86]:
# Show the filtered character table (the notebook truncates long frames)
char_df

Unnamed: 0,index,names,gender,total_agent,total_object,name
0,5864,"[the goddess, The goddess]",she/her,98,25,the goddess
1,499,"[Jupiter, Ammon, mighty Jupiter, Jupiter Ammon...",he/him/his,76,21,Jupiter
2,605,"[Jove, Hector, Ajax, mighty Jove, brave Ajax, ...",he/him/his,71,20,Jove
3,1482,"[the god, The god]",he/him/his,99,12,the god
4,1538,"[the gods, The gods]",they/them/their,56,25,the gods
...,...,...,...,...,...,...
187,6252,"[the priest, The priest]",he/him/his,9,1,the priest
188,7104,[the Colchian witch],she/her,10,0,the Colchian witch
189,1023,[Lelex],he/him/his,9,1,Lelex
190,1685,[Venus ’s son],he/him/his,10,1,Venus ’s son


In [87]:
# Fraction of a character's agent + patient relations in which they are the agent
total_relations = char_df['total_agent'] + char_df['total_object']
char_df['perc_agent'] = char_df['total_agent'] / total_relations

In [88]:
# Save the per-character table (including perc_agent) next to the input data
char_df.to_csv(path + 'gender_info_2.csv')

In [89]:
# Pooled agent share per gender: all agent relations of a gender's characters
# divided by all of their agent + patient relations (not a mean of the
# per-character perc_agent values).
genders = char_df['gender'].unique()


def _pooled_agent_share(rows):
  """Return sum(total_agent) / (sum(total_agent) + sum(total_object)) over rows."""
  agent_sum = rows['total_agent'].sum()
  object_sum = rows['total_object'].sum()
  return agent_sum / (agent_sum + object_sum)


avg_gender_agent = {
    gender: _pooled_agent_share(char_df[char_df['gender'] == gender])
    for gender in genders
}

In [90]:
# Show the agent share computed for each gender label
avg_gender_agent

{'she/her': 0.7945454545454546,
 'he/him/his': 0.8196847366397539,
 'they/them/their': 0.7614942528735632}