# Character Network Analysis Jupyter Notebook
This notebook replicates the `characterNetwork-iterative.py` script originally developed by Ken Huang. It uses NLP, sentiment analysis, and graph theory to extract and visualize character relationships from a novel.

In [2]:
import codecs
import os
import spacy
import json
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from pathlib import Path
from afinn import Afinn
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

ModuleNotFoundError: No module named 'networkx'

## Load spaCy Model and Common Words List

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline once at module level;
# name_entity_recognition() calls this shared `nlp` object for every sentence.
nlp = spacy.load('en_core_web_sm')

def load_common_words(path):
    """Return the entries of a UTF-8 JSON file as a set.

    Args:
        path: Location of a JSON document whose top-level value is iterable
            (the notebook uses it for a list of common words).

    Returns:
        A ``set`` built from the decoded JSON value.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return set(json.loads(raw_text))

## Load and Read Novel

In [None]:
def read_novel(book_name, path):
    """Concatenate every file in ``path`` whose name contains ``book_name``.

    Args:
        book_name: Substring that a file name must contain to be included.
        path: Directory holding the text files; a ``str`` or a ``Path``.

    Returns:
        One string in which each matching file's text is preceded by a single
        space and has its carriage returns / newlines replaced by spaces.
        An empty string when no file matches.
    """
    # Coerce so that plain string paths work with the `/` join below.
    directory = Path(path)
    # os.listdir() returns names in arbitrary order; sort them so multi-file
    # books are always stitched together in the same sequence.
    matching = sorted(name for name in os.listdir(directory) if book_name in name)
    pieces = []
    for file_name in matching:
        with open(directory / file_name, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read().replace('\r', ' ').replace('\n', ' ')
        pieces.append(' ' + text)
    return ''.join(pieces)

## Flatten Function

In [None]:
def flatten(input_list):
    """Return the non-list leaves of an arbitrarily nested list, in order.

    Only ``list`` instances are descended into; tuples, strings and other
    values are kept as single items.

    Args:
        input_list: Iterable that may contain nested lists.

    Returns:
        A new flat list of the leaf items in depth-first, left-to-right order.
    """
    flat_list = []
    # Each stack entry is a partially consumed iterator over one nesting level.
    pending = [iter(input_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator keeps its position for later.
                pending.append(iter(item))
                break
            flat_list.append(item)
        else:
            # Current level exhausted: resume its parent.
            pending.pop()
    return flat_list

## Named Entity Recognition

In [None]:
def name_entity_recognition(sentence, words):
    """Extract candidate name tokens from one sentence.

    Runs the module-level spaCy pipeline ``nlp`` on the sentence, keeps
    entities labelled PERSON or ORG, lower-cases them, strips ``'s``, and
    splits them on spaces.

    Args:
        sentence: Text to analyse.
        words: Collection of words to exclude (membership is tested with
            ``in``).

    Returns:
        List of tokens that are at least three characters long and not in
        ``words``; duplicates are kept.
    """
    wanted_labels = ('PERSON', 'ORG')
    tokens = []
    for entity in nlp(sentence).ents:
        if entity.label_ not in wanted_labels:
            continue
        cleaned = str(entity).lower().replace("'s", "")
        for token in cleaned.split(' '):
            if len(token) >= 3 and token not in words:
                tokens.append(token)
    return tokens

def iterative_NER(sentence_list, words, threshold_rate=0.0005):
    """Collect name tokens that occur often enough across all sentences.

    Args:
        sentence_list: Sentences to run ``name_entity_recognition`` on.
        words: Words to exclude, forwarded to ``name_entity_recognition``.
        threshold_rate: Minimum occurrences, expressed as a fraction of the
            number of sentences.

    Returns:
        Tokens whose total count is at least
        ``threshold_rate * len(sentence_list)``, in first-seen order.
    """
    tally = Counter()
    for sentence in sentence_list:
        # An empty result simply adds nothing to the tally.
        tally.update(name_entity_recognition(sentence, words))
    minimum = threshold_rate * len(sentence_list)
    return [name for name, seen in tally.items() if seen >= minimum]

## Top Character Names

In [None]:
def top_names(name_list, novel, top_n=20):
    """Rank the given names by how often they appear in the novel.

    Args:
        name_list: Vocabulary of names to count.
        novel: Full text; it is lower-cased before counting.
        top_n: Maximum number of names to return.

    Returns:
        Tuple ``(counts, names)`` of two parallel lists sorted by descending
        count and truncated to ``top_n`` entries.
    """
    counter = CountVectorizer(vocabulary=name_list, stop_words='english')
    # A single document, so the count matrix has exactly one row.
    counts = counter.fit_transform([novel.lower()]).toarray()
    table = pd.DataFrame({'count': counts[0]}, index=counter.get_feature_names_out())
    ranked = table.sort_values(by='count', ascending=False).head(top_n)
    return ranked['count'].tolist(), ranked.index.tolist()

## Sentiment Alignment Rate

In [None]:
def calculate_align_rate(sentence_list):
    """Compute the sentiment offset applied per co-occurrence.

    The rate is ``-2`` times the mean AFINN score of the sentences whose score
    is non-zero.

    Args:
        sentence_list: Sentences to score with AFINN.

    Returns:
        The offset as a float, or ``0.0`` when no sentence has a non-zero
        score (including an empty ``sentence_list``).
    """
    afinn = Afinn()
    scores = np.array([afinn.score(x) for x in sentence_list], dtype=float)
    nonzero = scores[scores != 0]
    # Averaging an empty selection yields NaN (with a RuntimeWarning), which
    # would then spread through every cell of the sentiment matrix.
    if nonzero.size == 0:
        return 0.0
    return -2 * np.mean(nonzero)

## Compute Co-occurrence and Sentiment Matrices

In [None]:
def calculate_matrix(name_list, sentence_list, align_rate):
    """Build pairwise co-occurrence and sentiment matrices for the names.

    Args:
        name_list: Names used as the vectorizer vocabulary; they index both
            axes of the returned matrices.
        sentence_list: Sentences to scan and to score with AFINN.
        align_rate: Offset added to the sentiment matrix once per
            co-occurrence.

    Returns:
        Tuple ``(co_matrix, sent_matrix)``. Both are strictly lower-triangular
        (diagonal and upper triangle are zero).
    """
    scorer = Afinn()
    per_sentence_score = np.array([scorer.score(s) for s in sentence_list])
    # Binary presence: one row per sentence, one column per name.
    presence = CountVectorizer(vocabulary=name_list, binary=True).fit_transform(sentence_list).toarray()
    presence_t = presence.T
    # Number of sentences in which each pair of names appears together.
    pair_counts = np.dot(presence_t, presence)
    # Same pairing, but each sentence contributes its sentiment score.
    pair_sentiment = np.dot(presence_t, (presence_t * per_sentence_score).T)
    pair_sentiment += align_rate * pair_counts
    # k=-1 drops the diagonal together with the upper triangle.
    return np.tril(pair_counts, k=-1), np.tril(pair_sentiment, k=-1)

## Generate Network Graph

In [None]:
def matrix_to_edge_list(matrix, mode, name_list):
    """Convert a lower-triangular matrix into weighted, coloured edges.

    The matrix is scaled by its largest absolute value (left untouched when
    that value is zero), then one edge is produced for every cell below the
    diagonal.

    Args:
        matrix: Square NumPy array; only entries with row > column are read.
        mode: ``'co-occurrence'`` or ``'sentiment'``; selects the weight and
            colour formulas.
        name_list: Node names indexed like the matrix axes.

    Returns:
        List of ``(name_list[i], name_list[j], {'weight': ..., 'color': ...})``
        tuples for all ``j < i``. Empty when the matrix has no rows.
    """
    size = matrix.shape[0]
    # np.max raises on a zero-size array, so handle the empty matrix up front.
    if size == 0:
        return []

    peak = np.max(np.abs(matrix))
    normalized = matrix / peak if peak > 0 else matrix

    edge_list = []
    for i in range(size):
        for j in range(i):
            value = normalized[i, j]
            if mode == 'co-occurrence':
                weight = np.log(2000 * value + 1) * 0.7
            else:
                # abs() keeps the logarithm defined for negative sentiment.
                weight = np.log(abs(1000 * value) + 1) * 0.7
            if mode == 'sentiment':
                # Signed value so a diverging colormap can separate signs.
                color = 2000 * value
            else:
                color = np.log(2000 * value + 1)
            edge_list.append((name_list[i], name_list[j], {'weight': weight, 'color': color}))
    return edge_list

def plot_graph(name_list, name_frequency, matrix, title, mode, path=''):
    """Draw the character network on a circular layout, save it, and show it.

    Args:
        name_list: Node names, in the same order as the matrix axes.
        name_frequency: Per-name counts, parallel to ``name_list``; they set
            the node sizes.
        matrix: Pairwise matrix passed to ``matrix_to_edge_list``.
        title: Figure title; also used as the PNG file name.
        mode: Forwarded to ``matrix_to_edge_list``.
        path: Directory in which ``<title>.png`` is written.
    """
    edges_with_attrs = matrix_to_edge_list(matrix, mode, name_list)
    # Scale frequencies relative to the most frequent name.
    relative_freq = np.array(name_frequency) / np.max(name_frequency)

    graph = nx.Graph()
    graph.add_nodes_from(name_list)
    graph.add_edges_from(edges_with_attrs)
    layout = nx.circular_layout(graph)

    plt.figure(figsize=(14, 14))
    line_widths = []
    line_colors = []
    for _, _, attrs in graph.edges(data=True):
        line_widths.append(attrs['weight'])
        line_colors.append(attrs['color'])

    nx.draw(
        graph,
        layout,
        node_color='#A0CBE2',
        node_size=np.sqrt(relative_freq) * 4000,
        edge_color=line_colors,
        width=line_widths,
        edge_cmap=plt.cm.coolwarm,
        with_labels=True,
        font_size=10,
    )
    plt.title(title)
    plt.savefig(Path(path) / f"{title}.png")
    plt.show()