### Installations, if needed

In [None]:
!pip install spacy==2.0.12 # Above 2.0.12 doesn't seem work with the neuralcoref resolution (at least 2.0.13 and 2.0.16 don't)
!pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz # This is the coref language model
!pip install networkx
!pip install pydot # To draw our graphs in graphviz
!pip install graphviz

### Importing Libraries

In [1]:
import spacy
from spacy import displacy
from collections import Counter
import re
import os
import pandas as pd
import networkx as nx
import sys
import pydot
import matplotlib.pyplot as plt
#import graphviz
from graphviz import Source

### Loading the Text

In [23]:
TEXT_FILENAME = 'HP1.txt'
FILEPATH_TO_TEXTS = "../texts/"

with open(FILEPATH_TO_TEXTS + TEXT_FILENAME, mode='r', encoding='utf-8') as raw_text:
    text = raw_text.read()

### Dividing by chapters

We use the line breaks in identifying chapters and their chapter titles.

In [24]:
chapters = re.split(r"CHAPTER [A-Z]*[\n\r\s]*[A-Z\s]*[\n\r]",text)

Then we clean up extraneous line breaks.

In [25]:
for i, chapter in enumerate(chapters):
    cleaned_text = str(re.sub(r'[\s\r\n\t]+',' ', chapter))
    cleaned_text2 = cleaned_text.replace('\\','')
    chapters[i] = cleaned_text.strip("\\")

### Loading and running SpaCy English Medium-Sized Pipeline

You only need to download this once, and maybe not at all if we're just using the coreferenced version.

In [None]:
# Download the english medium-sized pipeline
! python -m spacy download en_core_web_md

Then we load the coreferenced pipeline.

In [26]:
nlp_coref = spacy.load('en_coref_md')
doc = nlp_coref(chapters[1])

In [27]:
doc._.coref_clusters

[Mr. and Mrs. Dursley, of number four, Privet Drive: [Mr. and Mrs. Dursley, of number four, Privet Drive, they, They, they],
 Mr. Dursley: [Mr. Dursley, He, he, Mrs. Dursley, she, her],
 The Dursleys: [The Dursleys, The Dursleys],
 their: [their, they, they, their, They, they, they],
 it: [it, it],
 Mrs. Dursley: [Mrs. Dursley, Mrs. Dursley, she, her, her],
 The Dursleys: [The Dursleys, The Dursleys],
 the Potters: [the Potters, the Potters, the Potters],
 the Potters had a small son, too: [the Potters had a small son, too, they, they],
 a small son: [a small son, him, This boy],
 Dudley: [Dudley, Dudley, Dudley, he, Dudley],
 Mrs. Dursley: [Mrs. Dursley, Mr. Dursley, he, his, Mrs. Dursley, she, his, Mr. Dursley, his, Mrs. Dursley, Mr. Dursley, he, He, his, he, Mr. Dursley, he, he, his, he, Mr. Dursley, It, Mr. Dursley, he, his, Mr. Dursley, himself, his, he, he, he, his, he, he, Mrs. Dursley, She, him, her, Mrs. Dursley, He, her, He, his, he, Mrs. Dursley, she, she, Mrs. Dursley, her,

In [None]:
coreferenced_text = doc._.coref_resolved

### Viewing the text, with highlighted named entities

In [6]:
displacy.render(doc, jupyter=True, style='ent')

That is cool, but it won't help us in this project, since the main characters are not named, and thus aren't noticed. We'd be better off building for this case something that looks for a DT NN* pattern in the POS tags, and then keeps it if it's referenced x number of times.

### spaCy extension with coreference resolution

Using the NeuralCoref library: https://github.com/huggingface/neuralcoref
(If I can ever get it to work, which so far I cannot.)

This needs to happen earlier, before the graph. Lots of unexpected pronoun confusion happening.

In [None]:
# Use that file to process the text into a doc.
nlp = spacy.load('en_core_web_md')
doc = nlp(coreferenced_text)

In [None]:
print(doc)

### Viewing dependencies

In [7]:
sentence_spans = list(doc.sents)
displacy.render(sentence_spans[1], jupyter=True, style='dep')

### Creating a Graph with a node for each noun phrase

In [8]:
# Establish our graph using Networkx
G = nx.Graph()

### Sorting prepositional phrases
This needs to happen along with the adjective removal before we've totally decimated the text.


In [10]:
last_span_end = 0
simplified_text = []

for chunk in doc.noun_chunks:
    G.add_node(chunk.root.text)
    for token in chunk:
        if (token.text != chunk.root.text) and (token.pos_ in {'ADJ', 'NOUN', 'PROPN'}):
            G.add_node(token.text)
            G.add_node(token.head.text)
            G.add_edge(token.text, token.head.text, label=token.pos_)
    this_span_start = chunk.start
    simplified_text.append("{0} {1}".format(doc[last_span_end:this_span_start].text,chunk.root.text))
    last_span_end = chunk.end
    
simplified_text.append(doc[last_span_end:].text)

In [11]:
simplified_text = " ".join(simplified_text)
simplified_text_filename = TEXT_FILENAME.split('.')[0] + "_simplified.txt"

with open(simplified_text_filename, 'w') as file:
    text = file.write(simplified_text.strip())

### Extracting Triples with ReVerb
From http://reverb.cs.washington.edu/ (and used in the paper "Information retrieval in folktales using natural
language processing": https://arxiv.org/pdf/1511.03012.pdf)

In [28]:
# Writing cleaned text to file to remove tabs prior to applying reverb.

chapter_cleaned_filename = TEXT_FILENAME.split('.')[0] + "_clean_chapter.txt"

with open(FILEPATH_TO_TEXTS + chapter_cleaned_filename, 'w') as file:
    text = file.write(chapters[1].strip())

# Preparing the filename for the tab-separated values output of ReVerb
tsv_filename = TEXT_FILENAME.split('.')[0] + "_ReVerb.tsv"

# Run the Java package. Java JDK is required to be installed from
# https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html
os.system("java -Xmx512m -jar reverb-latest.jar -a {0} > {1}".format(FILEPATH_TO_TEXTS + chapter_cleaned_filename, tsv_filename))

# Reading the results from the Java executable into a Pandas dataframe.
reverb_results = pd.read_csv(tsv_filename, header=None, sep='\t')
reverb_results.columns = ['filename','Sentence_Num','Arg1','Rel','Arg2','Arg1_StartInd','Arg1_EndInd','Rel_StartInd','Rel_EndInd', 'Arg2_StartInd','Arg2_EndInd', 'Confidence','Sent_Text', 'Sent_POS','Sent_ChunkTags', 'Arg1_Norm', 'Rel_Norm', 'Arg2_Norm']

# Set the option to not truncate the text in longer cells
pd.set_option('display.max_colwidth', -1)

ParserError: Error tokenizing data. C error: Expected 18 fields in line 335, saw 26


In [None]:
# Exploring the dataframe
reverb_results

### Graphing

#### Converting triples to graph nodes and edges

In [None]:
def create_nodes_and_edge(row):
    arg1_text = row['Arg1'].split(' ')
    arg1_pos = row['Sent_POS'].split(' ')[int(row['Arg1_StartInd']):int(row['Arg1_EndInd'])]
    print(arg1_text)
    print(arg1_pos)
    G.add_node(row['Arg1'])
    G.add_node(row['Arg2'])
    G.add_edge(row['Arg1'], row['Arg2'], label=row['Rel'])

In [None]:
_ = reverb_results.apply(lambda x: create_nodes_and_edge(x), axis=1)

#### Draw the graph

In [None]:
# Write our graph to DOT format to be read and visualized by GraphViz
nx.drawing.nx_pydot.write_dot(G,'graph_dot.txt')

graph_filename = TEXT_FILENAME.split('.')[0] + '_Graph'

# Load the saved DOT format
graph_visualized = Source.from_file('graph_dot.txt')

# Save it to a png
graph_visualized.render(filename=graph_filename, format='png')

# View it in the notebook
graph_visualized