In [2]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

import powerlaw
import re
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


For building the network, the relationships between the characters will be used, as created in the data building notebook. Moreover, in order to identify and keep only the most important characters, the characters dataframe is also loaded.

In [3]:
# Remote CSV files used in this analysis: the character relationships (edge list)
# and the table of characters.
tot_relationships_url = 'https://raw.githubusercontent.com/gpanagioto/projects_socialgraphs22/main/Project2/Relationships/total_relationships.csv'
characters_df_url = 'https://raw.githubusercontent.com/gpanagioto/projects_socialgraphs22/main/Project2/characters_df.csv'

# Read both tables; the relationships file uses its first CSV column as the index
characters_df = pd.read_csv(characters_df_url)
relationships_pd = pd.read_csv(tot_relationships_url, index_col=0)

Now we can create a network by [using the relationship dataframe as edgelist](https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html) and the weight as an edge attribute to show relationship strength:

In [4]:
# Undirected graph with one edge per row of the relationships table;
# the 'weight' column is attached to each edge as an attribute.
G = nx.from_pandas_edgelist(
    relationships_pd, 'source', 'target',
    edge_attr='weight', create_using=nx.Graph(),
)

Initial network statistics, number of nodes, edges, etc:

In [5]:
# Report the size of the full graph
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")

Number of nodes: 2728
Number of edges: 17324


### Connected component  

Since the graph is undirected, we use the function [`connected_components`](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html)

In [6]:
# Connected components as sets of nodes, ordered from largest to smallest
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
print(f"Number of connected components: {len(Gcc)}")

Number of connected components: 19


A number of connected components exist. Their size in regards to nodes is the following:

In [7]:
# One line per component, in the size-sorted order of Gcc
for idx, component in enumerate(Gcc):
    n_members = len(component)
    print(f"Component {idx}: {n_members}")

Component 0: 2685
Component 1: 5
Component 2: 4
Component 3: 3
Component 4: 3
Component 5: 2
Component 6: 2
Component 7: 2
Component 8: 2
Component 9: 2
Component 10: 2
Component 11: 2
Component 12: 2
Component 13: 2
Component 14: 2
Component 15: 2
Component 16: 2
Component 17: 2
Component 18: 2


Since the first component holds the vast majority of the nodes, only this component is kept as the network.

In [8]:
# Keep only the largest connected component (Gcc is sorted by size, largest first).
# G.subgraph() returns a read-only view that still references the full graph;
# .copy() materialises it as an independent, modifiable graph.
G = G.subgraph(Gcc[0]).copy()

### Centrality measures to decrease the number of nodes to the most important ones

This is a relatively large network, especially when trying to analyze interactions between characters. Not all of the characters are central to the plot: some are ancillary or recurring, while others appear in only one or two episodes. The latter are not valuable in the following analysis, since they are also bound to have little text in their dialogues. Finally, there is the case of aggregated characters, like `Man`, `Woman`, `Officer` etc. While these generic names could have been removed during the character dataframe building phase, they are included to avoid biasing the analysis with prior knowledge of the show.

In [9]:
# Node centralities of G, each returned by networkx as a {node: score} dictionary.
# All four are computed with the library defaults (no weight argument is passed).
betweenness = nx.betweenness_centrality(G)
degree_centr = nx.degree_centrality(G)
closeness_centr = nx.closeness_centrality(G)
eigen_centr = nx.eigenvector_centrality(G)

In [None]:
# Build one row per character with all four centrality scores.
# Every score is looked up by node name, so the columns are aligned explicitly
# instead of relying on the four dictionaries sharing the same iteration order.
# A node missing from any dictionary raises a KeyError instead of silently misaligning rows.
node_names = list(betweenness.keys())

data = {'name': node_names,
        'betweenness': [betweenness[node] for node in node_names],
        'eigenvector': [eigen_centr[node] for node in node_names],
        'degree': [degree_centr[node] for node in node_names],
        'closeness': [closeness_centr[node] for node in node_names],
       }

centralities_df = pd.DataFrame.from_dict(data)

The centrality measure used to remove some characters is betweenness centrality. Eigenvector centrality can favor characters that are not as important but are connected to major characters. First, the characters with betweenness centrality equal to $0$ are removed:

In [None]:
# Drop every character whose betweenness centrality is not strictly positive
characters_reduced_df = centralities_df[centralities_df['betweenness'] > 0]

Afterwards, the characters can be further reduced by selecting those that belong to the upper percentiles of the same centrality measure.

In [None]:
percentile = 50 # Percentile over which the characters are selected

cols = characters_reduced_df.columns
# cols[1] is the column right after 'name', i.e. the betweenness scores
threshold_scores = characters_reduced_df[cols[1]]
centralities_perc = np.percentile(threshold_scores, percentile)

In [None]:
# Histogram of the positive betweenness values, with the percentile threshold marked in red
hist, edges = np.histogram(characters_reduced_df['betweenness'].to_numpy(), bins=200)

fig, ax = plt.subplots()
# Draw each bar from its left edge with the true bin width, so the bars cover exactly
# the bins they count and line up with the threshold line on the x-axis
ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge')
ax.plot([centralities_perc, centralities_perc], [0, hist.max()], 'r--')
ax.set_yscale('log')
ax.set_xlabel('Betweenness centrality')
ax.set_ylabel('Number of characters')
ax.set_title(f'Betweenness centrality distribution\n for positive betweenness values\n{percentile}\'th percentile: {np.round(centralities_perc, 5)}')
plt.show()

In [None]:
# Keep the characters whose score in the threshold column lies strictly above the percentile value
is_above_threshold = characters_reduced_df[cols[1]] > centralities_perc
reduced_centralities_df = characters_reduced_df.loc[is_above_threshold]

In [None]:
# Display the remaining characters, highest betweenness first
reduced_centralities_df.sort_values(by='betweenness', ascending=False)

Now save the reduced characters list for further use:

In [None]:
# Renumber the rows from 0 (the old index is discarded), then store only the names
reduced_centralities_df = reduced_centralities_df.reset_index(drop=True)
reduced_centralities_df.loc[:, ['name']].to_csv('reduced_characters_df.csv', index=False)

### With the reduced characters, build the relationships again:

In [None]:
# Input and output folders, both resolved against the current working directory
scripts_dir = Path.cwd().joinpath('Scripts')
relationships_path = Path.cwd().joinpath('Relationships')

In [None]:
# Names of the characters that survived the centrality filtering
names = reduced_centralities_df['name'].tolist()

In [None]:
# Empty dictionary; NOTE(review): not referenced by any other cell visible in this notebook
characters_interactions = dict()

In [None]:
def get_characters_in_text(text, characters_list):
    """Return the speakers found in `text` that belong to `characters_list`.

    A speaker is whatever precedes the first whitespace-plus-colon (e.g. ``"Name :"``)
    at the start of a line.

    Parameters
    ----------
    text : str
        Text of one scene, with one spoken line per text line.
    characters_list : iterable of str
        Names of the characters to keep.

    Returns
    -------
    list of str
        The matching speaker names in order of appearance. A character is listed
        once for every line it speaks, so duplicates are expected.
    """
    # '^' with re.MULTILINE also matches the very first line of the text, which has
    # no preceding newline. The non-greedy '.+?' stops at the first ' :' so that a
    # later ' :' inside the spoken line is not swallowed into the speaker name.
    pattern = r'^(.+?)\s:'
    speakers = (nm.strip() for nm in re.findall(pattern, text, flags=re.MULTILINE))

    # A set gives constant-time membership tests on long character lists
    allowed = set(characters_list)

    return [speaker for speaker in speakers if speaker in allowed]

In [None]:
def create_relationship_dict(char_list):
    """Pair every entry of `char_list` with each different entry that follows it.

    Parameters
    ----------
    char_list : list
        Character names in order of appearance; repeated names are allowed.

    Returns
    -------
    list of dict
        One ``{'source': earlier, 'target': later}`` dictionary per pair of
        positions holding different names. Pairs of equal names are left out.
    """
    return [
        {'source': first, 'target': second}
        for position, first in enumerate(char_list[:-1])
        for second in char_list[position + 1:]
        if not (second == first)
    ]

In [None]:
# Define a regex pattern. Compile it to be faster since there are many files.
# It captures the text that follows a "++" line up to the next "++".
# NOTE(review): the closing "++" is consumed by each match, so if scenes are separated
# by a single "++" line every second scene would be skipped - confirm the script format.
pattern = r"[+]{2}\n([^+]+)[+]{2}"
prog = re.compile(pattern)


def _aggregate_relationships(pairs):
    """Collapse a list of {'source', 'target'} dicts into weighted undirected edges.

    The two names of every pair are sorted so that (A, B) and (B, A) count as the
    same edge; the weight of an edge is the number of times its pair occurs.
    Returns a dataframe with the columns 'source', 'target' and 'weight'
    (empty when `pairs` is empty).
    """
    if not pairs:
        return pd.DataFrame(columns=['source', 'target', 'weight'])

    pairs_df = pd.DataFrame(pairs, columns=['source', 'target'])
    # Sort within each row so a given pair always has the same name in the same column
    pairs_df = pd.DataFrame(np.sort(pairs_df.values, axis=1), columns=pairs_df.columns)
    # Every occurrence counts once; summing per pair turns duplicates into weights
    pairs_df['weight'] = 1
    return pairs_df.groupby(['source', 'target'], sort=False, as_index=False).sum()


total_relationships_dict_list = []

# Make sure the output folder exists even when no script file is found
relationships_path.mkdir(parents=True, exist_ok=True)

# Sorting makes the processing order (and therefore the row order of the outputs)
# reproducible, and lets tqdm know the total number of files
script_files = sorted(scripts_dir.glob('**/*.txt'))

for file_ in tqdm(script_files):

    season_nr = file_.parent.name
    fname = file_.name.split('.')[0]

    # Create a folder for each season and save the csv of the relationships in there for each episode
    season_path = relationships_path / f"{season_nr}"
    season_path.mkdir(parents=True, exist_ok=True)

    with open(file_, 'r', encoding='utf-8') as f:
        test_txt = f.read()

    episode_relationship_dict_list = []

    # Use regex to find the text between the pluses
    for result in prog.findall(test_txt):
        # Get the list of characters in this scene
        chars_in_part = get_characters_in_text(result, names)
        # If there are more than 1 characters in the list, create a relationship between them and
        # append to the corresponding lists
        if len(chars_in_part) > 1:
            rel_lst = create_relationship_dict(chars_in_part)
            episode_relationship_dict_list += rel_lst
            total_relationships_dict_list += rel_lst

    # Episodes without any relationship get no csv; report them instead of
    # relying on an exception raised by the aggregation
    if not episode_relationship_dict_list:
        print(f"No relationships found in {season_nr}/{fname}; no csv written")
        continue

    # For this episode, aggregate the duplicates into edge weights and save them
    episode_rel_df = _aggregate_relationships(episode_relationship_dict_list)
    episode_rel_df.to_csv(season_path / f"{fname}_reduced.csv")


# Do the same for the total relationship
total_relationships_dict_list = _aggregate_relationships(total_relationships_dict_list)
total_relationships_dict_list.to_csv(relationships_path / 'total_relationships_reduced.csv')

### Build total scripts for each of the reduced characters

In [8]:
# Output folder for the per-character texts; created (with its parents) if it is missing
characters_reduced_texts_path = Path.cwd().joinpath('Character_Texts', 'Characters')
characters_reduced_texts_path.mkdir(parents=True, exist_ok=True)

In [9]:
# Folder holding the input text files, resolved against the current working directory
texts_path = Path.cwd().joinpath('Texts')

In [13]:
# Concatenate, per character, every text file found two folder levels below texts_path.
# The file stem is used as the character name.
# NOTE(review): every character found under texts_path is written, not only the
# reduced set in `names` - confirm whether a filter on the reduced characters is intended.
char_txt_parts = {}

# Sorted paths give a reproducible concatenation order and let tqdm show a total
for file in tqdm(sorted(texts_path.glob('*/*/*.txt'))):
    char_name = file.stem

    with open(file, 'r', encoding='utf-8') as f:
        doc = f.read()

    # Collect the pieces in a list and join once, instead of growing a string repeatedly
    char_txt_parts.setdefault(char_name, []).append(doc + '\n')

# Same mapping as before: character name -> all of its texts, each followed by a newline
char_txt_dict = {name: ''.join(parts) for name, parts in char_txt_parts.items()}

for character, character_text in tqdm(list(char_txt_dict.items())):
    with open(characters_reduced_texts_path / f"{character}.txt", 'w', encoding='utf-8') as f:
        f.write(character_text)
    

0it [00:00, ?it/s]

  0%|          | 0/3033 [00:00<?, ?it/s]