In [143]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import pickle

from tqdm.notebook import tqdm

The idea is to create an undirected network from the characters when they interact in the same scene. The scene is separated by 5 dashes in the scripts text files. Undirected because since they partake in the scene and conversation, there is no sense of direction. Like a relationship.

The idea is to crawl all folders in the Scripts dir. In each season, open each script txt file. Then read the file. Find the sections between the 5 dashes. Now the more difficult part: create a pandas dataframe that logs these interactions, preferrably without themselves. Can be done eg. by creating a list of identified characters.

In [26]:
# Load the characters data frame from wikipedia, which also includes the first names of the characters
try:
    with open('wiki_characters_df.pickle', 'rb') as f:
        characters_df = pickle.load(f)
except FileNotFoundError as e:
    print(e)
    print('Run notebook 1_wikipedia_south_park_characters.ipynb')

Create also a directory for the relationships to be saved in:

In [126]:
rel_path = Path.cwd() / 'Relationships'

if not rel_path.is_dir():
    Path.mkdir(rel_path)

We are only interested in the names of the characters here. So we can create a dictionary with keys the full name and values the first name. Or maybe we could create two lists:

In [27]:
# Go with the two lists approach
full_names = characters_df.index.to_list()
first_names = characters_df['first_name'].to_list()

In [28]:
# Create a dictionary to create a data frame from it afterwards
# It will be of the form:   'first_char': 'second_char'
characters_interactions = {} 

In [128]:
def make_dir(dir_path):
    if not dir_path.is_dir():
        os.mkdir(dir_path)
        print(f'Created: {dir_path.as_posix()}')
    else:
        print(f"{dir_path.as_posix()} exists")

In [70]:
def get_characters_in_text(text, characters_list):
    txt_list_tmp = [ el.strip() for el in text.split('\n') if el ]
    char_list = []
    for character in characters_list:
        if character in txt_list_tmp:
            char_list.append(character)
    return char_list

In [133]:
def create_relationship_dict(char_list):
    relationship_dict_list = []
    for i, el in enumerate(char_list[:-1]):
        for character in char_list[i+1:]:
            relationship_dict_list.append({ 'source':el, 'target':character })
    return relationship_dict_list

In [154]:
# Get the directory where the scripts are located
scripts_dir = Path.cwd() / 'Scripts'

# Define a regex pattern. Maybe compile it to be faster since there are many files
pattern = r"[+]{2}\n([^+]+)[+]{2}"
prog = re.compile(pattern)

total_relationships_dict_list = []


for file_ in tqdm(Path(scripts_dir).glob('*/*.txt')):
        
    episode_relationship_dict_list = []
    season_nr = file_.as_posix().split('/')[-2]
    fname = file_.as_posix().split('/')[-1]
    fname = fname.split('.')[0]
    
    # Create a folder for each season and save the csv of the relationships in there for each episode
    season_path = rel_path / f"{season_nr}"
    make_dir(season_path)
    
    with open(file_, 'r', encoding='utf-8') as f:
        test_txt = f.read()

    # Use regex to find the text between the pluses
    results = prog.findall(test_txt)
    for result in results:
        # Get the list of characters in this scene
        chars_in_part = get_characters_in_text(result, first_names)

        # If there are more than 1 characters in the list, create a relationship between them and
        # append to the corresponding lists

        if len(chars_in_part)>1:
            rel_lst = create_relationship_dict(chars_in_part)                
            episode_relationship_dict_list += rel_lst
            total_relationships_dict_list += rel_lst
    
    # For this episode, create now a dataframe from the episode relationships
    episode_rel_df = pd.DataFrame(episode_relationship_dict_list)
    
    # I have duplicates. I can add them as weights.
    # But first to have all of the same names on the same column
    # I want for a specific pair of source and target
    # the source to be always on the same column of the dataframe
    episode_rel_df = pd.DataFrame( np.sort(episode_rel_df.values, axis=1), columns=episode_rel_df.columns )
    
    # For the duplicates, we can add them up to form weights on the edges, representing
    # how strong the relationship is
    episode_rel_df['weight'] = 1 # initialize
    try:
        episode_rel_df = episode_rel_df.groupby(['source', 'target'], sort=False, as_index=False).sum()
        episode_rel_df.to_csv(season_path.as_posix()+'/'+fname+'.csv')
    except KeyError as e:
        print(e)
        print(season_nr, fname)
        print(episode_rel_df)
        
        
# Do the same for the total relationship
total_relationships_dict_list = pd.DataFrame(total_relationships_dict_list)
total_relationships_dict_list = pd.DataFrame( np.sort(total_relationships_dict_list.values, axis=1), columns=total_relationships_dict_list.columns )
total_relationships_dict_list['weight'] = 1
total_relationships_dict_list = total_relationships_dict_list.groupby(['source', 'target'], sort=False, as_index=False).sum()
total_relationships_dict_list.to_csv(season_path.as_posix()+'total_relationships'+'.csv')

0it [00:00, ?it/s]

Created: C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/01 exists
C:/Users/user/Document

C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/07 exists
C:/Users/user/Documents/

C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/13 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/13 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/13 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/13 exists
Created: C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/14
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/14 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/14 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/14 exists
C:/Users/user/Document

C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/19 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/19 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/19 exists
Created: C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/20
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/20 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/20 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/20 exists
C:/Users/user/Documents/DTU/Fall_22_23/Social graphs/g_panag_repo/projects_socialgraphs22/Project/Relationships/20 exists
C:/Users/user/Document

In [155]:
total_relationships_dict_list

Unnamed: 0,source,target,weight
0,Kyle,Stan,1980
1,Cartman,Stan,1659
2,Ike,Stan,23
3,Cartman,Kyle,1782
4,Ike,Kyle,92
...,...,...,...
379,Kyle,Mackey,2
380,Sheila,Shelly,1
381,Jason,Jimbo,1
382,Jason,Stephen,1
