![image alt ><](https://i1.sndcdn.com/artworks-000058062617-qqgpfp-t500x500.jpg)

Inscrição no Anel:

![Picture](https://upload.wikimedia.org/wikipedia/commons/thumb/2/23/One_Ring_inscription.svg/300px-One_Ring_inscription.svg.png)

Em Inglês:

One ring to rule them all,
   one ring to find them,
One ring to bring them all
   and in the darkness bind them.

In [None]:
# Fetch NLTK's 'punkt' data package; the notebook later splits the novels
# into sentences with nltk.tokenize.sent_tokenize.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# carregando as bibliotecas exigidas

import voila
import csv
from colour import Color
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
import codecs
import requests
import os
import spacy
import json
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import matplotlib as plt
from tqdm import tqdm
from pathlib import Path
from afinn import Afinn
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display
from matplotlib.colors import ListedColormap, LinearSegmentedColormap, Colormap


In [None]:
def flatten(input_list):
    '''
    Flatten an arbitrarily nested list into a single flat list.

    Only ``list`` instances (including subclasses of ``list``) are expanded;
    every other element, such as a string or a tuple, is kept as one leaf.
    :param input_list: the (possibly nested) list to flatten.
    :return: a new flat list holding the leaves in their original order.
    '''

    flat_list = []
    for item in input_list:
        # isinstance (rather than an exact type comparison) also expands
        # list subclasses instead of silently keeping them nested.
        if isinstance(item, list):
            flat_list.extend(flatten(item))
        else:
            flat_list.append(item)

    return flat_list


In [None]:
def common_words(path=None, timeout=30):
    '''
    Download the list of common words and return it as a set.

    The remote file is parsed as JSON and its items are collected into a set.
    :param path: URL of the common-words file; when omitted, the copy hosted
                 in the chicodias/character-network repository is used.
    :param timeout: seconds to wait for the server before giving up.
    :return: a set with the common words.
    :raises requests.HTTPError: if the server answers with an error status.
    '''
    if path is None:
        path = "https://raw.githubusercontent.com/chicodias/character-network/master/common_words.txt"

    response = requests.get(path, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()

    return set(json.loads(response.text))

In [None]:
# Lendo os livros
def read_novel(novel_list, timeout=60):
    '''
    Download every book in ``novel_list`` and return them as one string.

    Carriage returns and newlines are replaced by spaces, and each book is
    preceded by a single space in the result. A progress bar with one step
    per book is displayed while downloading.
    :param novel_list: iterable with the URLs of the books to download.
    :param timeout: seconds to wait for each server response.
    :return: the concatenated text of all the books.
    :raises requests.HTTPError: if any download answers with an error status.
    '''
    urls = list(novel_list)

    # Size the bar from the number of books actually requested, so it also
    # completes when a single book is loaded.
    progresso = widgets.IntProgress(
        value=0,
        min=0,
        max=len(urls),
        step=1,
        description='Carregando: ',
        bar_style='success',  # 'success', 'info', 'warning', 'danger' or ''
        orientation='horizontal'
    )
    display(progresso)

    parts = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Do not let an HTTP error page end up inside the novel text.
        response.raise_for_status()
        data = response.text.replace('\r', ' ').replace('\n', ' ')
        parts.append(' ' + data)
        progresso.value += 1

    return ''.join(parts)


In [None]:
def name_entity_recognition(sentence):
    '''
    Extract candidate name words from a single sentence.

    The sentence is run through the global spaCy pipeline ``nlp``; entities
    labelled PERSON or ORG are lower-cased, stripped of "'s" and split on
    spaces into single words.
    :param sentence: the sentence to search for names.
    :return: a list with the name words found, in order of appearance.
    '''
    wanted_labels = ['PERSON', 'ORG']
    found = []

    for entity in nlp(sentence).ents:
        if entity.label_ not in wanted_labels:
            continue
        # 'Harry Potter's' -> 'harry potter' -> ['harry', 'potter']
        pieces = str(entity).lower().replace("'s", "").split(' ')
        for piece in pieces:
            # drop words shorter than 3 letters and words listed in the
            # global set of common words
            if len(piece) >= 3 and piece not in words:
                found.append(piece)

    return found


In [None]:

def iterative_NER(sentence_list, threshold_rate=0.0005):
    '''
    Run named-entity recognition over every sentence and keep the frequent names.

    Each sentence is passed to ``name_entity_recognition``; the name words are
    counted and only those seen at least ``threshold_rate * len(sentence_list)``
    times are kept, which discards sporadic recognition errors.
    :param sentence_list: list with the sentences of the book.
    :param threshold_rate: minimum frequency, per sentence, a word needs in
                           order to be kept.
    :return: list of the names kept, ordered by first appearance.
    '''
    from collections import Counter

    # One step per sentence, so the bar reflects the real amount of work.
    progresso = widgets.IntProgress(
        value=0,
        min=0,
        max=len(sentence_list),
        step=1,
        description='NER: ',
        bar_style='success',  # 'success', 'info', 'warning', 'danger' or ''
        orientation='horizontal'
    )
    display(progresso)

    counts = Counter()
    for sentence in sentence_list:
        progresso.value += 1
        counts.update(name_entity_recognition(sentence))
    print('\nNER Calculado;')

    minimum_count = threshold_rate * len(sentence_list)

    return [name for name, count in counts.items() if count >= minimum_count]


In [None]:
def top_names(name_list, novel, top_num=25):
    '''
    Return the most frequent names in a novel together with their counts.

    :param name_list: the non-duplicate list of names of a novel.
    :param novel: the novel text.
    :param top_num: the number of names to return.
    :return: a tuple ``(name_frequency, names)``: the list of counts of the
             top names and the list of those names, both ordered from the
             most to the least frequent.
    '''

    vect = CountVectorizer(vocabulary=name_list, stop_words='english')
    counts = vect.fit_transform([novel.lower()])

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    name_frequency = pd.DataFrame(counts.toarray(), columns=feature_names)
    # one row per name, with the count in column 0
    name_frequency = name_frequency.T
    name_frequency = name_frequency.sort_values(by=0, ascending=False)
    name_frequency = name_frequency[0:top_num]
    names = list(name_frequency.index)
    name_frequency = list(name_frequency[0])

    return name_frequency, names


In [None]:

def calculate_align_rate(sentence_list):
    '''
    Compute the sentiment alignment rate of the whole text.

    Every sentence is scored with Afinn; the rate is the sum of the scores
    divided by the number of sentences with a non-zero score, times -2.
    :param sentence_list: list with the sentences of the book.
    :return: the alignment rate, or 0.0 when no sentence has a non-zero score.
    '''
    # One step per sentence, so the bar reflects the real amount of work.
    progresso = widgets.IntProgress(
        value=0,
        min=0,
        max=len(sentence_list),
        step=1,
        description='Alinhamento:',
        bar_style='success',  # 'success', 'info', 'warning', 'danger' or ''
        orientation='horizontal'
    )
    display(progresso)

    afinn = Afinn()
    sentiment_score = []
    for sentence in sentence_list:
        sentiment_score.append(afinn.score(sentence))
        progresso.value += 1

    scored_sentences = len(np.nonzero(sentiment_score)[0])
    if scored_sentences == 0:
        # Nothing to average: avoid a division by zero that would yield NaN.
        return 0.0

    align_rate = np.sum(sentiment_score) / scored_sentences * -2

    return align_rate


In [None]:
def calculate_matrix(name_list, sentence_list, align_rate):
    '''
    Compute the co-occurrence and sentiment matrices of the top characters.

    Entry ``[i, j]`` of the co-occurrence matrix counts the sentences in which
    names ``i`` and ``j`` both appear. The same entry of the sentiment matrix
    is the sum of the Afinn scores of those sentences plus ``align_rate``
    times the co-occurrence count. Only the strictly lower triangle of both
    matrices is kept; the diagonal and the upper triangle are zero.
    :param name_list: list with the names of the top characters.
    :param sentence_list: list with the sentences of the book.
    :param align_rate: sentiment alignment rate.
    :return: a tuple ``(cooccurrence_matrix, sentiment_matrix)``.
    '''

    # One step per sentence, so the bar reflects the real amount of work.
    progresso = widgets.IntProgress(
        value=0,
        min=0,
        max=len(sentence_list),
        step=1,
        description='Sentimentos',
        bar_style='success',  # 'success', 'info', 'warning', 'danger' or ''
        orientation='horizontal'
    )
    display(progresso)

    # calculate a sentiment score for each sentence in the novel
    afinn = Afinn()
    sentiment_score = []
    for sentence in sentence_list:
        sentiment_score.append(afinn.score(sentence))
        progresso.value += 1

    # binary (sentences x names) matrix: 1 when the name appears in the sentence
    name_vect = CountVectorizer(vocabulary=name_list, binary=True)
    occurrence_each_sentence = name_vect.fit_transform(sentence_list).toarray()

    cooccurrence_matrix = np.dot(occurrence_each_sentence.T, occurrence_each_sentence)
    # weight each sentence's occurrences by its score before multiplying
    sentiment_matrix = np.dot(occurrence_each_sentence.T, (occurrence_each_sentence.T * sentiment_score).T)
    sentiment_matrix += align_rate * cooccurrence_matrix

    # k=-1 keeps only what lies below the diagonal: a name co-occurring with
    # itself is meaningless and each pair needs to be stored only once
    cooccurrence_matrix = np.tril(cooccurrence_matrix, k=-1)
    sentiment_matrix = np.tril(sentiment_matrix, k=-1)

    return cooccurrence_matrix, sentiment_matrix

In [None]:
def matrix_to_edge_list(matrix, mode, name_list):
    '''
    Convert a (co-occurrence/sentiment) matrix into the edge list of the network graph.

    The matrix is normalized by its largest absolute value and one edge is
    produced for every position strictly below the diagonal, carrying the
    'weight' and 'color' attributes used to draw it.
    :param matrix: co-occurrence matrix or sentiment matrix (square numpy array).
    :param mode: 'co-occurrence' or 'sentiment'
    :param name_list: the list of names of the top characters in the novel.
    :return: list of ``(name_row, name_col, {'weight': ..., 'color': ...})`` tuples.
    :raises ValueError: if ``mode`` is not one of the two supported values.
    '''
    if mode not in ('co-occurrence', 'sentiment'):
        raise ValueError("mode must be 'co-occurrence' or 'sentiment', got %r" % (mode,))

    shape = matrix.shape[0]

    # An empty or all-zero matrix has no scale to normalize by; treat it as
    # all zeros instead of dividing by zero and spreading NaN to every edge.
    peak = np.max(np.abs(matrix)) if matrix.size else 0
    if peak:
        normalized_matrix = matrix / peak
    else:
        normalized_matrix = np.zeros(matrix.shape, dtype=float)

    if mode == 'co-occurrence':
        weight = np.log(2000 * normalized_matrix + 1) * 0.7
        color = np.log(2000 * normalized_matrix + 1) / 8
    else:
        weight = np.log(np.abs(1000 * normalized_matrix) + 1) * 0.7
        color = 2000 * normalized_matrix / 8

    # positions strictly below the diagonal, in row-major order
    return [
        (name_list[row], name_list[col], {'weight': weight[row, col], 'color': color[row, col]})
        for row in range(shape)
        for col in range(row)
    ]




def _colormap_for_mode(mode):
    '''Return the matplotlib colormap used to paint the edges of the given graph mode.'''
    cmap_names = {'co-occurrence': 'Blues', 'sentiment': 'viridis'}
    if mode not in cmap_names:
        raise ValueError("mode must be 'co-occurrence' or 'sentiment', got %r" % (mode,))

    # matplotlib.cm.get_cmap was removed from recent Matplotlib releases;
    # prefer the colormap registry and fall back on older versions.
    registry = getattr(plt, 'colormaps', None)
    if registry is not None:
        return registry[cmap_names[mode]]
    return plt.cm.get_cmap(cmap_names[mode])


def plot_graph(name_list, name_frequency, matrix, plt_name, mode):
    '''
    Plot the network graph (co-occurrence network or sentiment network) with plotly.

    Characters are laid out on a circle; each node is sized by the square
    root of its relative frequency and each edge is drawn with the width and
    colour computed by ``matrix_to_edge_list``.
    :param name_list: the list of top character names in the novel.
    :param name_frequency: the list containing the frequencies of the top names.
    :param matrix: co-occurrence matrix or sentiment matrix.
    :param plt_name: the title shown on the figure.
    :param mode: 'co-occurrence' or 'sentiment'
    :raises ValueError: if ``mode`` is not one of the two supported values.
    '''
    cmap = _colormap_for_mode(mode)
    edge_list = matrix_to_edge_list(matrix, mode, name_list)
    normalized_frequency = np.array(name_frequency) / np.max(name_frequency)

    G = nx.Graph()
    G.add_nodes_from(name_list)
    G.add_edges_from(edge_list)
    pos = nx.circular_layout(G)  # the graph layout can be changed here

    traceRecode = []  # one trace per edge, followed by the node trace

    # NOTE(review): in 'sentiment' mode the edge 'color' values produced by
    # matrix_to_edge_list are not confined to [0, 1]; the colormap saturates
    # outside that range -- confirm this is the intended colouring.
    for u, v in G.edges:
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge = G[u][v]
        # rgb() colour strings use 0-255 channels, while matplotlib returns
        # floats in [0, 1]; scale before formatting.
        red, green, blue = (
            int(round(255 * channel))
            for channel in plt.colors.colorConverter.to_rgb(cmap(edge['color']))
        )
        pal = 'rgb({}, {}, {})'.format(red, green, blue)
        trace = go.Scatter(x=tuple([x0, x1]), y=tuple([y0, y1]),
                           mode='lines',
                           line={'width': edge['weight']},
                           marker=dict(color=pal),
                           line_shape='spline',
                           hoverinfo="text",
                           text=str(u) + '-' + str(v),
                           opacity=1)
        traceRecode.append(trace)

    # node coordinates and labels, in the same order as the marker sizes
    nodes = list(G.nodes())
    node_trace = go.Scatter(x=[pos[node][0] for node in nodes],
                            y=[pos[node][1] for node in nodes],
                            hovertext=["Entity: " + str(node) for node in nodes],
                            text=nodes,
                            mode='markers+text', textposition="bottom center",
                            hoverinfo="text",
                            marker={'size': np.sqrt(normalized_frequency) * 60, 'color': 'SkyBlue'})
    traceRecode.append(node_trace)

    figure = go.Figure(
            data=traceRecode,
            layout=go.Layout(title=plt_name, showlegend=True,
                             margin={'b': 40, 'l': 40, 'r': 40, 't': 40},
                             xaxis={'showgrid': False, 'zeroline': False, 'showticklabels': False},
                             yaxis={'showgrid': False, 'zeroline': False, 'showticklabels': False},
                             height=600
                             )
            )

    figure.show()

### Favor Aguardar a análise dos livros.



In [None]:
# Global setup: the names defined in this cell (nlp, novel_list, words,
# novel, sentence_list, align_rate, preliminary_name_list) are read as
# globals by the functions and cells of this notebook.

# spaCy pipeline used by name_entity_recognition
nlp = spacy.load('en_core_web_sm')
# URLs of the three books, in reading order
novel_list = ['https://github.com/chicodias/tolkien/raw/master/datasets/01%20-%20The%20Fellowship%20Of%20The%20Ring.txt', 
              'https://github.com/chicodias/tolkien/raw/master/datasets/02%20-%20The%20Two%20Towers.txt',
              'https://github.com/chicodias/tolkien/raw/master/datasets/03%20-%20The%20Return%20Of%20The%20King.txt']

## load the common words
words = common_words()

## read the books into a single string
novel = read_novel(novel_list)
print('Livros Carregados;')

## split the text into a list of sentences
sentence_list = sent_tokenize(novel)
print('\nLivros Tokenizados;')

# alignment rate: hard-coded here rather than obtained by calling
# calculate_align_rate(sentence_list) -- presumably a previously computed
# value kept to avoid re-scoring every sentence; TODO confirm
align_rate = -0.13462798594150235
print('\nTaxa de Alinhamento Calculada;')

# preliminary list of names: the NER pass below is left disabled and the
# list is read from a CSV file instead
# preliminary_name_list = iterative_NER(sentence_list)

preliminary_name_list = pd.read_csv('preliminary_name_list.csv', header=None).T[0].values

IntProgress(value=0, bar_style='success', description='Carregando: ', max=3)

Livros Carregados;

Livros Tokenizados;

Taxa de Alinhamento Calculada;


In [None]:
# final list: counts and names of the 20 most frequent names in the whole text

name_frequency, name_list = top_names(preliminary_name_list, novel, 20)
print('\nLista Final pronta;')

print('\nConcluído!')


Lista Final pronta;

Concluído!


In [None]:
# Bloco utilizado apenas para gerar os arquivos ".csv"
# Não descomentar
# align_rate = -0.13462798594150235
# with open("preliminary_name_list.csv","w") as csv_file:
#   wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
#    wr.writerow(preliminary_name_list)

### Selecione o Livro:

In [None]:
# cria as análises conforme os livros selecionados
def create_graph(select_book):
    '''
    Build and show the sentiment and co-occurrence graphs for the selected option.

    :param select_book: 0, 1 or 2 to analyse a single book (index into the
                        global ``novel_list``), 4 to analyse the whole plot
                        using the already tokenized ``sentence_list``, or 5
                        to analyse every book separately.
    '''
    name = "LoR"

    def render_book(url, number):
        # download one book, split it into sentences and plot both graphs
        text = read_novel([url])
        sentences = sent_tokenize(text)
        cooccurrence, sentiment = calculate_matrix(name_list, sentences, align_rate)
        plot_graph(name_list, name_frequency, sentiment, name + ' Grafo de Sentimentos - Livro ' + str(number), 'sentiment')
        plot_graph(name_list, name_frequency, cooccurrence, name + ' Grafo de Co-ocorrência - Livro ' + str(number), 'co-occurrence')

    if select_book == 4:
        # whole plot: reuse the sentences that were tokenized earlier
        cooccurrence, sentiment = calculate_matrix(name_list, sentence_list, align_rate)
        plot_graph(name_list, name_frequency, sentiment, name + ' Grafo de Sentimentos - Trama Inteira', 'sentiment')
        plot_graph(name_list, name_frequency, cooccurrence, name + ' Grafo de Co-ocorrência - Trama Inteira', 'co-occurrence')
        return

    if select_book == 5:
        # every book, one after the other, numbered from 1
        for number, url in enumerate(novel_list, start=1):
            render_book(url, number)
        return

    # indices 0, 1 or 2: only the selected book
    render_book(novel_list[select_book], select_book + 1)

In [None]:
# (label, value) options for the book selector. Values 0-2 are indices into
# novel_list; 4 and 5 are the special cases handled by create_graph
# (whole plot / every book separately). The value 3 is not used.
selected_book = [('The Fellowship of the Ring', 0), 
                ('The Two Towers', 1),
                ('The Return of the King', 2),
                ('Todos os Livros (Trama Inteira)', 4),
                ('Todos os Livros (Separadamente)', 5)]

In [None]:
# Show a dropdown with the options above and call create_graph with the
# value of the selected entry.
interact(create_graph, select_book=selected_book)

interactive(children=(Dropdown(description='select_book', options=(('The Fellowship of the Ring', 0), ('The Tw…

<function __main__.create_graph>