# Install dependencies

Run this cell only on Google Colab. If you run it locally, the repository will be cloned and the dependencies will be installed on your local machine.

In [None]:
# Clone the repo and move into it
!git clone https://github.com/ddeepak95/social-network-analysis-novels.git
%cd social-network-analysis-novels
%pip install ebooklib
%pip install networkx
%pip install matplotlib
%pip install numpy
%pip install beautifulsoup4
%pip install html2text
%pip install spacy
%pip install textblob
%pip install wordcloud
%pip install tabulate
%pip install python-louvain
%pip install pandas
!python -m spacy download en_core_web_trf

# Setting up
Upload the book's EPUB file into the `social-network-analysis-novels` directory and set the `epub_path` variable below to its filename.

In [1]:
# Enter book name here.
# Path to the EPUB file to analyse, relative to the current working directory;
# change the filename to match the book you uploaded.
epub_path = "./book.epub"

# Import dependencies

In [3]:
import os
import json
import spacy
from utils.epub_functions import create_flattened_book_json, html_to_markdown
from utils.nlp_functions import extract_character_names_with_variations, character_counter, plot_character_wordcloud, bar_plot_character_counts, extract_character_interactions
from utils.network_functions import plot_character_network_with_layout, plot_ego_network, summarize_character_network

# Extract contents from Epub

In [None]:
# Folder that holds every file generated by this notebook; created if it is missing.
working_folder = "data"
os.makedirs(working_folder, exist_ok=True)

# Flatten the EPUB into a JSON file. The value returned by
# create_flattened_book_json() is used as the path of that JSON file when the
# section titles are listed.
output_json = os.path.join(working_folder, "book_content.json")
flattened_json_path = create_flattened_book_json(epub_path, output_json)
print(f"Created flattened book content JSON at: {flattened_json_path}")

# Verify the book content and the titles of the sections in the book file

In [None]:
# List every section of the flattened book together with its index, so that the
# chapter range for the analysis can be chosen by index.

with open(flattened_json_path, mode='r', encoding='utf-8') as book_file:
    book_content = json.load(book_file)

for index, entry in enumerate(book_content):
    print(f"Index {index}: {entry['title']}")


## Extract character names with variations

In [6]:
# Set the start chapter and end chapter based on the section indices printed above.
# The slice that builds the full book text uses end_chapter + 1, so both bounds are
# inclusive there.
# NOTE(review): the values below were presumably chosen for one specific book - adjust them for yours.

start_chapter = 7
end_chapter = 90
# Location of the character names JSON file inside the working folder.
character_names_loc = f"{working_folder}/character_names.json"

In [None]:
# IMPORTANT: Extraction overwrites the character names JSON file, including any
# manual edits made to it. When the file already exists this cell therefore asks
# for confirmation first; answer "n" to keep the existing file.

file_exists = os.path.exists(character_names_loc)
if file_exists:
    response = input(f"File {character_names_loc} already exists. Overwrite? (y/n): ")
    # Tolerate surrounding whitespace, any letter case, and a spelled-out "yes".
    should_extract = response.strip().lower() in ("y", "yes")
else:
    # Nothing to lose: no file has been created yet.
    should_extract = True

if should_extract:
    extract_character_names_with_variations(book_content, character_names_loc, start_chapter, end_chapter)
    if file_exists:
        print(f"Overwrote {character_names_loc}")
    else:
        print(f"Created {character_names_loc}")
else:
    print("Skipping extraction - using existing file")

# IMPORTANT: Verify the character names json file to ensure that the character names are correct.
After extracting the character names, edit the character names JSON file (`data/character_names.json`) to add the name variations for each character. This is important for building a correct social network.

# Start Analysis

In [7]:
# Convert each selected section (start_chapter through end_chapter, inclusive) with
# html_to_markdown() and concatenate the results. str.join builds the final string
# in one pass instead of growing it with repeated `+=`.
full_book_text = "".join(
    html_to_markdown(chapter["content"])
    for chapter in book_content[start_chapter:end_chapter + 1]
)



# Character Counter

In [8]:
# JSON file for the character count data: handed to character_counter() and then
# loaded back for the plots.
character_counter_loc = f"{working_folder}/character_counter.json"

In [None]:
# Read the character names (with their variations) back from disk, so that manual
# edits to the JSON file are picked up.
with open(character_names_loc, mode="r", encoding="utf-8") as names_file:
    character_names = json.load(names_file)

# Run the counter over the full book text; the output location is passed as the
# third argument.
character_counter(full_book_text, character_names, character_counter_loc)

In [None]:
# Load the character count data inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open(character_counter_loc, "r", encoding="utf-8") as f:
    character_counter_data = json.load(f)

# NOTE(review): top_n=20 presumably limits the cloud to the 20 most frequent characters - confirm in utils.nlp_functions.
plot_character_wordcloud(character_counter_data, scale_type='linear', top_n=20)


In [None]:
# Plot the character counts with bar_plot_character_counts(), reusing the
# character_counter_data loaded for the word cloud.
bar_plot_character_counts(character_counter_data)

# Generate Character Interactions

Character interaction data is needed to generate the social networks of the characters. We will use spaCy to extract the character interactions based on the co-occurrence of character names within a line or a paragraph (depending on the chosen granularity).

In [10]:
# JSON file for the character interaction data: passed as output_path to
# extract_character_interactions() and loaded again in the network analysis section.
character_interactions_loc = f"{working_folder}/character_interactions.json"

In [None]:
# Extract the interactions from the full book text.
# granularity="line" extracts interactions per line; switch it to "para" to
# extract interactions based on paragraphs instead.
extract_character_interactions(
    book_content=full_book_text,
    character_variations_path=character_names_loc,
    granularity="line",
    output_path=character_interactions_loc,
)

# Network Analysis

In [16]:
# Set thresholds for filtering the character co-occurrence network:
# - min_cooccurrence: minimum number of times two characters must appear together to be considered connected
# - min_character_degree: minimum number of connections (edges) a character must have to be included in the analysis
# Both values are passed unchanged to the overall network plot, the ego network
# plot and the network summary, so all three use the same filtering.

min_cooccurrence = 3
min_character_degree = 2


## Overall Character Social Network Plot

In [None]:
# Read the interactions data inside a context manager so the file handle is closed
# as soon as the JSON has been parsed.
with open(character_interactions_loc, "r", encoding="utf-8") as f:
    interactions_data = json.load(f)

# Plot the character network using the thresholds configured above.
# NOTE(review): focus_top_n / top_n_labels presumably restrict labelling to the top 25 characters - confirm in utils.network_functions.
plot_character_network_with_layout(
    interactions_data,
    min_character_degree=min_character_degree,
    min_cooccurrence=min_cooccurrence,
    focus_top_n=True,
    top_n_labels=25,
)

## Social Network Plot Specific to a particular character

In [None]:
# Network centred on a single character. The second argument is the character to
# focus on; replace it with a character from your own book.
plot_ego_network(
    interactions_data,
    "Digby Kilgour",
    label_size=5,
    min_cooccurrence=min_cooccurrence,
    min_character_degree=min_character_degree,
)

## Detailed Social Network Analysis Information pertaining to Centrality

In [None]:
# Summarise the network using the same filtering thresholds as the plots.
summarize_character_network(
    interactions_data,
    min_cooccurrence=min_cooccurrence,
    min_character_degree=min_character_degree,
)