In [25]:
import pandas as pd
import os
import json

In [6]:
# Character display name -> lower-case tokens that are looked up for that
# character in the subtitle words (given name first, then any other alias).
characters = {
    "Takumi": ["takumi"],
    "Bunta": ["bunta"],
    "Ryosuke": ["ryosuke"],
    "Natsuki": ["natsuki", "mogi"],
    "Iketani": ["iketani"],
    "Keisuke": ["keisuke"],
    "Mako": ["mako", "sato"],
    "Itsuki": ["itsuki"],
    "Takeshi": ["takeshi", "nakazato"],
    "Kenji": ["kenji"],
}

In [2]:
def clean_word(x: str):
    """Strip unwanted characters from .ass files

    The word is lower-cased, every token in the removal list is deleted (in
    list order), newline and carriage-return characters are turned into
    spaces, and surrounding whitespace is trimmed.

    Args:
        x (str): word from ass file

    Returns:
        str: cleaned word
    """
    x = x.lower()
    # Tokens are removed in list order. The position-tag entries are spelled
    # without commas (and two of them without parentheses), so they can only
    # match once the earlier "," / "(" / ")" removals have run -- presumably
    # on tags such as {\an5\pos(960,175)}; verify against the subtitle files.
    # Every entry is comma-separated: adjacent string literals would silently
    # merge into one token and neither part would be removed on its own.
    remove = [
        "{\\i1}", "{\\i0}", "=--", ".", "'s", "!", ",", "?", '"', "'", "=", ":",
        "{\\an5\\pos(720107)}", "(", ")", "{\\an5\\pos960175}", "{\\an5\\pos960107}", "--",
    ]
    for token in remove:
        x = x.replace(token, "")
    x = x.replace("\n", " ").replace("\r", " ")
    return x.strip()

In [4]:
# Collect the "subtitles" folder tree in a single walk and reuse the result
walk_entries = list(os.walk("subtitles"))

# Names of the folders directly inside "subtitles"
subtitles_dir = walk_entries[0][1]
# One (dirpath, dirnames, filenames) entry for every folder below the top level
subtitles = walk_entries[1:]

In [5]:
# Collect (file name, cleaned word) pairs from every subtitle file.
words = []

# Number of leading lines dropped from each file before parsing
# (presumably the .ass header section -- confirm against the files).
HEADER_LINES = 50

for folder, _subfolders, files in subtitles:
    for file_name in files:
        # os.path.join uses the separator of the host OS, so the notebook is
        # not tied to Windows-style "\\" paths.
        # NOTE(review): open() relies on the platform default encoding;
        # confirm that it matches the subtitle files.
        with open(os.path.join(folder, file_name)) as f:
            lines = f.readlines()[HEADER_LINES:]

        for line in lines:
            fields = line.split(",,")
            # Lines without a ",," separator (e.g. blank lines) have no
            # second field to read; skip them instead of raising IndexError.
            if len(fields) < 2:
                continue
            # The words are taken from the second ",,"-separated field.
            for word in fields[1].split(" "):
                words.append((file_name, clean_word(word)))

In [23]:
# Dataframe for bag of words: one row per distinct (episode, word) pair
whitespace_patterns = [r"\\t|\\n|\\r", "\t|\n|\r"]

bag_of_words = (
    pd.DataFrame(words)
    # Escaped (backslash + letter) and real tab/newline/CR become spaces
    .replace(to_replace=whitespace_patterns, value=[" ", " "], regex=True)
    .set_axis(["episode", "word"], axis=1)
    # Entries that still contain spaces are split into separate words
    .assign(word=lambda frame: frame["word"].str.split(" "), count=1)
    .explode("word")
    # "count" is 1 on every row, so min() just collapses duplicate pairs
    .groupby(["episode", "word"])
    .min()
)

# List containing all words (one entry per episode in which the word occurs)
all_words = bag_of_words.index.get_level_values("word").tolist()

In [24]:
# Move the (episode, word) index into ordinary columns. The guard makes the
# cell safe to re-run: an unconditional reset_index() would add an extra
# "index" column on the second run and raise ValueError on the one after.
if "word" not in bag_of_words.columns:
    bag_of_words = bag_of_words.reset_index()

# The 100 most frequent entries of the "word" column as text/value columns.
# The preceding cell leaves one row per (episode, word) pair, so "value" is
# the number of episodes in which the word occurs.
word_counts = (
    bag_of_words["word"]
    .value_counts()
    .head(100)
    .rename_axis("text")
    .reset_index(name="value")
)

# Display as a list of {"text": ..., "value": ...} records
word_counts.to_dict(orient="records")

[{'text': 'my', 'value': 82},
 {'text': 'me', 'value': 82},
 {'text': 'be', 'value': 82},
 {'text': 'was', 'value': 82},
 {'text': 'like', 'value': 82},
 {'text': 'he', 'value': 82},
 {'text': 'up', 'value': 82},
 {'text': 'do', 'value': 82},
 {'text': 'and', 'value': 82},
 {'text': 'no', 'value': 82},
 {'text': 'to', 'value': 82},
 {'text': 'you', 'value': 82},
 {'text': 'are', 'value': 82},
 {'text': 'im', 'value': 82},
 {'text': 'this', 'value': 82},
 {'text': 'what', 'value': 82},
 {'text': 'for', 'value': 82},
 {'text': 'dont', 'value': 82},
 {'text': 'just', 'value': 82},
 {'text': 'on', 'value': 82},
 {'text': 'have', 'value': 82},
 {'text': 'not', 'value': 82},
 {'text': 'ill', 'value': 82},
 {'text': 'how', 'value': 82},
 {'text': 'but', 'value': 82},
 {'text': 'an', 'value': 82},
 {'text': 'about', 'value': 82},
 {'text': 'i', 'value': 82},
 {'text': 'think', 'value': 82},
 {'text': 'of', 'value': 82},
 {'text': 'right', 'value': 82},
 {'text': 'that', 'value': 82},
 {'text':

In [11]:
# Character episode counts
#
# `all_words` holds one entry per (episode, word) pair, so counting an alias
# in it gives the number of episodes in which that alias occurs. The lookup
# has to go through that list: `word_counts` is a DataFrame, and
# DataFrame.count() interprets its argument as an axis, so passing an alias
# to it raises ValueError on a fresh kernel.
#
# NOTE(review): counts for several aliases of one character are summed, so an
# episode that contains both aliases is counted twice for that character.
episode_counts = {
    char: sum(all_words.count(alias) for alias in aliases)
    for char, aliases in characters.items()
}

episode_counts

{'Takumi': 72,
 'Bunta': 26,
 'Ryosuke': 64,
 'Natsuki': 31,
 'Iketani': 47,
 'Keisuke': 64,
 'Mako': 20,
 'Itsuki': 48,
 'Takeshi': 30,
 'Kenji': 29}