In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import networkx

In [2]:
# load spacy model (name entity recognition)
NER = spacy.load('en_core_web_sm')

In [3]:
# Load chapters
import os

# get all chapters in data dir
all_chapters = [b for b in os.scandir('data') if '.txt' in b.name]

In [4]:
#all_chapters

In [5]:
chapter = all_chapters[0]
chapter_text = open(chapter).read()
chap_doc = NER(chapter_text)

In [6]:
# viz identified entities (most wont be correct names or anything, but its a start)
displacy.render(chap_doc[0:1000], style='ent')

In [7]:
# Read Char names
character_df = pd.read_csv('charDF.csv')
character_df

Unnamed: 0.1,Unnamed: 0,character
0,0,Zorian Kazinski
1,1,Zach Noveda
2,2,Red Robe
3,3,Cikan Kazinski
4,4,Andir Kazinski
...,...,...
82,82,Kilnfather
83,83,Stonechild
84,84,Violeteye
85,85,Panaxeth


In [8]:
# create a first name column
import re
character_df['character_firstname'] = character_df['character'].apply(lambda x: x.split(" ", 1)[0])

In [9]:
# get named entities per sentence
sent_entity_df = []

#loop thru each sentence, store named entity list for each
for sent in chap_doc.sents:
    sent_entities = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": sent_entities})

sent_entity_df = pd.DataFrame(sent_entity_df)

In [10]:
#filter out non characters

def filter_char(ent_list, char_df):
    return [ent for ent in ent_list
            if ent in list(char_df.character) 
            or ent in list(char_df.character_firstname)
    ]

In [11]:
filter_char(["Zorian", "2", "Xvim"], character_df)

['Zorian', 'Xvim']

In [12]:
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_char(x, character_df))

#remove sentences without character entities
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0]
sent_entity_df_filtered

Unnamed: 0,sentence,entities,character_entities
0,"(Chapter, 100Sacrifice, \n, Standing, in, Rea,...","[Chapter 100Sacrifice\nStanding, Zorian]",[Zorian]
5,"(Why, not, go, after, all, of, his, classmates...",[Raynie],[Raynie]
7,"(It, was, n’t, until, my, family, discovered, ...","[Cyoria, several days later, Raynie]",[Raynie]
13,"(Zorian, tried, .)",[Zorian],[Zorian]
24,"(“, No, offense, ,, Zorian, ,, but, I, ’m, sti...",[Zorian],[Zorian]
...,...,...,...
416,"(Raynie, loudly, gasped, and, swayed, unsteadi...","[Raynie, dozens]",[Raynie]
420,"(The, dimensional, gates, floating, above, the...","[Haslush, three]",[Haslush]
425,"(If, Raynie, was, doing, this, alone, ,, she, ...",[Raynie],[Raynie]
434,"(\n, -, break, -, \n, Sitting, next, to, a, ta...","[Xvim, Zorian]","[Xvim, Zorian]"


In [13]:
# only first names
sent_entity_df_filtered['character_firstname'] = sent_entity_df_filtered['character_entities'].apply(lambda x: [ent.split(" ", 1)[0] for ent in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered['character_firstname'] = sent_entity_df_filtered['character_entities'].apply(lambda x: [ent.split(" ", 1)[0] for ent in x])


In [14]:
pd.reset_option('^display.', silent=True)
sent_entity_df_filtered

Unnamed: 0,sentence,entities,character_entities,character_firstname
0,"(Chapter, 100Sacrifice, \n, Standing, in, Rea,...","[Chapter 100Sacrifice\nStanding, Zorian]",[Zorian],[Zorian]
5,"(Why, not, go, after, all, of, his, classmates...",[Raynie],[Raynie],[Raynie]
7,"(It, was, n’t, until, my, family, discovered, ...","[Cyoria, several days later, Raynie]",[Raynie],[Raynie]
13,"(Zorian, tried, .)",[Zorian],[Zorian],[Zorian]
24,"(“, No, offense, ,, Zorian, ,, but, I, ’m, sti...",[Zorian],[Zorian],[Zorian]
...,...,...,...,...
416,"(Raynie, loudly, gasped, and, swayed, unsteadi...","[Raynie, dozens]",[Raynie],[Raynie]
420,"(The, dimensional, gates, floating, above, the...","[Haslush, three]",[Haslush],[Haslush]
425,"(If, Raynie, was, doing, this, alone, ,, she, ...",[Raynie],[Raynie],[Raynie]
434,"(\n, -, break, -, \n, Sitting, next, to, a, ta...","[Xvim, Zorian]","[Xvim, Zorian]","[Xvim, Zorian]"


In [15]:
#create relationships
window_size = 5
relationships = []

for i in range(sent_entity_df_filtered.index[-1]):
    end_i = min(i+5, sent_entity_df_filtered.index[-1])
    char_list = sum((sent_entity_df_filtered.loc[i: end_i].character_entities), [])

    #remove duplicates if found
    char_unique = [char_list[i] for i in range(len(char_list)) if (i==0) or char_list != char_list[i-1]]

    if len(char_unique)>1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({'source': a, 'target': b})

In [16]:
relationship_df = pd.DataFrame(relationships)
relationship_df

Unnamed: 0,source,target
0,Zorian,Raynie
1,Raynie,Raynie
2,Raynie,Raynie
3,Raynie,Raynie
4,Raynie,Raynie
...,...,...
529,Zorian,Zorian
530,Xvim,Zorian
531,Zorian,Zorian
532,Xvim,Zorian
