# Task 1: Preprocess the Data

In [1]:
import pandas as pd
import re

# Dataset1: Full Target Anime Dialogue 

In [9]:
def get_clean_dialogue(file_path, episode_id):
    """
    Data preprocess function for the anime subtitle file of one episode.
    Separate the subtitle blocks, clean the text, and return the dialogue
    as a pd.DataFrame with one row per subtitle block.

    The file is split into blocks on blank lines. In each block the first two
    lines are dropped (presumably the SRT index and timestamp lines) and the
    remaining lines are joined with a single space. The joined text is then
    stripped of:
        - full-width angle-bracket spans  ＜...＞
        - full-width parenthesised spans  （...）
        - song markers: optional ～, one or more ♪, and the run of
          ～ / < / > / word / whitespace characters that follows
        - left-to-right marks, both the raw U+200E character and the literal
          "&lrm;" entity

    Blocks with fewer than three lines, and blocks whose text is empty after
    cleaning, are skipped.

    Args:
        file_path: path of the UTF-8 encoded subtitle file
        episode_id: the episode id. Stored in the "episode" column of every
            row; will be used in the pairing function.

    Return:
        df_clean_text (pd.DataFrame): a dataframe of dialogue line by line,
            with columns "line" and "episode"
    """
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # separate each block of dialogue (blocks are delimited by blank lines)
    blocks = re.split(r"\n\n+", raw_text.strip())
    results = []

    # clean the dialogue
    for block in blocks:
        lines = block.strip().split("\n")

        # a block needs at least two header lines plus one line of text
        if len(lines) < 3:
            continue

        text = " ".join(lines[2:]).strip()

        text = re.sub(r"＜.*?＞", "", text)
        text = re.sub(r"（.*?）", "", text)
        text = re.sub(r"[～]*♪+[～<>\w\s]*", "", text)
        # Alternation, not a character class: "[\u200e&lrm;]" would delete
        # every single '&', 'l', 'r', 'm' and ';' from the dialogue.
        text = re.sub(r"\u200e|&lrm;", "", text)
        text = text.strip()

        # keep the line only if something is left after cleaning
        if text:
            results.append(text)

    return pd.DataFrame({"line": results, "episode": episode_id})

def readin_all_episodes(
    episodes=range(1, 25),
    path_template="data/full_dialogue/Blue.Lock.S01E{i}.WEBRip.Netflix.ja[cc].srt",
):
    """
    Data preprocess function for the whole season (multiple episodes).

    Each episode file is parsed with get_clean_dialogue() and the per-episode
    dataframes are concatenated once at the end. Concatenating once avoids
    re-copying the accumulated frame on every iteration, and keeps the
    "episode" column numeric because no empty object-dtype frame is mixed in.

    Args:
        episodes: iterable of episode ids to read. Defaults to 1..24.
        path_template (str): file path pattern; "{i}" is replaced by the
            episode id via str.format.

    Return:
        df_total (pd.DataFrame): a dataframe of dialogue line by line, with
            columns "line" and "episode" and a fresh 0..n-1 index
    """
    # parse every episode first, then combine in a single concat
    frames = [
        get_clean_dialogue(path_template.format(i=i), i)
        for i in episodes
    ]

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not frames:
        return pd.DataFrame(columns=["line", "episode"])

    return pd.concat(frames, ignore_index=True)

In [12]:
def pair_adjacent_lines(df, text_col, keep_cols=None, group_col=None):
    """
    Build (previous line, current line) pairs from a dataframe that holds one
    dialogue line per row.

    Rows whose text is missing are skipped entirely, so a line is paired with
    the closest earlier row that has text. When group_col is given, a pair is
    only produced if both rows carry the same value in that column
    (eg. same episode).

    Args:
        df (pd.DataFrame): a dataframe of dialogue line by line
        text_col (str): column name of the text
        keep_cols (list of str): name of extra columns to keep; their values
            are taken from the current (second) row of each pair
        group_col (str): column name of the grouping column. Will only pair
            the ones in the same group.

    Return:
        df_pair (pd.DataFrame): a dataframe with "prev_line", "current_line"
            and any keep_cols columns
    """
    records = []
    last_spoken = None

    for _, current in df.iterrows():
        # rows without text neither form a pair nor become the "previous" row
        if pd.isna(current[text_col]):
            continue

        if last_spoken is not None and pd.notna(last_spoken[text_col]):
            same_group = (
                group_col is None
                or current[group_col] == last_spoken[group_col]
            )
            if same_group:
                record = {
                    "prev_line": last_spoken[text_col],
                    "current_line": current[text_col],
                }
                # extra information comes from the current row
                if keep_cols:
                    record.update({col: current[col] for col in keep_cols})
                records.append(record)

        # this row is the candidate "previous line" for the next one
        last_spoken = current

    return pd.DataFrame(records)


In [14]:
# Read the cleaned dialogue of every Blue Lock episode into one dataframe
# (one row per subtitle line, with its episode id).
df_bluelock = readin_all_episodes()
print("# of Rows: ", len(df_bluelock))

# Pair each line with the line before it, two-by-two.
# eg) given 1, 2, 3 ---> (1, 2), (2, 3)
# group_col="episode" prevents pairing across an episode boundary.
df_bluelock_paired = pair_adjacent_lines(df_bluelock, "line", group_col="episode")
df_bluelock_paired.head()

# of Rows:  9449


Unnamed: 0,prev_line,current_line
0,勝ったら全国！,いけ 潔！
1,いけ 潔！,このプレー ラストチャンス 頼むぞ！
2,このプレー ラストチャンス 頼むぞ！,全国 全国 全国… 全国！
3,全国 全国 全国… 全国！,こっちじゃ ポンコツ！
4,こっちじゃ ポンコツ！,シュート決める… シュート！


In [15]:
# Save the paired Blue Lock dialogue (prev_line, current_line) as a csv file.
df_bluelock_paired.to_csv("data/bluelock_paired.csv")

# DataSet2: Data for Target Character 
I have manually arranged the subtitle file so that it includes every line spoken by my target character, Hyoma Chigiri, together with the line that precedes it (marked "<N/A>" if the target character isn't responding to anyone). Here, I read the file into a data frame, clean the text so that only plain dialogue remains, and then pair each previous line with Chigiri's line. The function is hard-coded because I arranged the original data by hand and know exactly what the result should look like.

In [2]:
def readin_and_pair_for_chigiri(file_path):
    """
    Read the hand-arranged subtitle file of the target character and pair the
    entries two by two: each odd-positioned entry is the previous line and the
    entry right after it is the character's line.

    Only full-width parenthesised spans （...） are removed from the text.
    Entries that become empty are still kept, so the alternating order of the
    file is not shifted. A trailing entry without a partner is dropped.

    Args:
        file_path (str): the file path

    Returns:
        df_pair (pd.DataFrame): a dataframe with columns "prev_line" and
            "chigiri_line"
    """
    # blocks are separated by one or more blank lines
    with open(file_path, "r", encoding="utf-8") as srt_file:
        subtitle_blocks = re.split(r"\n\n+", srt_file.read().strip())

    # keep the dialogue text only; the first two lines of a block are skipped
    # (presumably the SRT index and timestamp lines)
    dialogue = []
    for subtitle in subtitle_blocks:
        rows = subtitle.strip().split("\n")
        if len(rows) < 3:
            continue
        joined = " ".join(rows[2:]).strip()
        dialogue.append(re.sub(r"（.*?）", "", joined).strip())

    # entries 0, 2, 4, ... are previous lines; 1, 3, 5, ... are the replies
    return pd.DataFrame([
        {"prev_line": before, "chigiri_line": reply}
        for before, reply in zip(dialogue[0::2], dialogue[1::2])
    ])

In [4]:
# Path of the manually arranged subtitle file for the target character.
CHIGIRI_FILE_PATH = "data/chigiri_only.srt"
# Read the file and pair (previous line, Chigiri's line) two by two.
df_pair_new = readin_and_pair_for_chigiri(CHIGIRI_FILE_PATH)
df_pair_new

Unnamed: 0,prev_line,chigiri_line
0,一次セレクションはお前らのいる伍号棟 55名全５チームによる、総当たりリーグ戦 上位２チーム...,じゃあ ここにいるチームＺ 11人が１つのチームってこと？ 全員フォワードなのに？
1,勝ちゃあいいんだろ 勝ちゃあ,勝ちゃあいい… ねえ
2,つうか やっぱチーム全員が フォワードって意味分かんねえ,俺は　あいつの言ってることが全部正しいとは思ってないよ。 だって 絵心はワールドカップ優勝す...
3,何してんの？　千切,髪のケア
4,ラスト 千切は？,言いたくない
...,...,...
212,<N/A>,そういや…いつもあいつが… <name>…勝手にこのチームから出ていくんじゃねえよ、クソ<n...
213,<N/A>,１００万年ぶり
214,あれ？潔何？その“セーフ”みたいな顔 もしかして来ないかもとか思ってた？ てめえ… 上から目...,すっかり脇役扱いだな 俺ら
215,違えよ！　なんつうか その…うれしいんだよ お前らとまた会えて、単純に…,とりま 因縁の再会っつうことで


In [38]:
# Save the (prev_line, chigiri_line) pairs as a csv file for future use.
df_pair_new.to_csv("data/chigiri_pairs.csv")

# Dataset3: Dialogue in Japanese for Tone Prediction

In [21]:
def readin_JCHAT(file_path):
    """
    Data preprocess function for the EaST_MELD dataset.

    The csv is read without a header row, the column names are assigned, and
    only the dialogue id, the Japanese text and the emotion label are kept.
    The row order of the file is left as it is.

    Args:
        file_path: the file path

    Return:
        df_clean_text (pd.DataFrame): a dataframe of dialogue line by line,
            with columns "dialogue_id" (cast to int), "Text(Ja)" and "Emotion"
    """
    # the downloaded csv has no header row;
    # these names are the headers given in the original github
    column_names = [
        "id", "dialogue_id", "utterance_id", "Emotion", "Sentiment",
        "Text(En)", "Text(Ja)", "Season", "Episode", "Speaker",
        "Starttime(En)", "Endtime(En)", "Starttime(Ja)", "Endtime(Ja)"
    ]
    wanted = ["dialogue_id", "Text(Ja)", "Emotion"]

    # read in the csv and name the columns
    raw = pd.read_csv(file_path, header=None)
    raw.columns = column_names

    # keep the necessary columns only and make the dialogue id an integer
    return raw[wanted].astype({"dialogue_id": int})

In [22]:
# Load the EaST_MELD csv, keeping the dialogue id, Japanese text and emotion.
df_jchat = readin_JCHAT("data/EaST_MELD.csv")
df_jchat

Unnamed: 0,dialogue_id,Text(Ja),Emotion
0,0,今の会社ではシステムをKL5からGR6に変えました,neutral
1,0,大変だったろう でもやりました,neutral
2,0,,neutral
3,0,シューティーでは、職務の話ならしい,neutral
4,0,うんちデューティー? どうぞ,surprise
...,...,...,...
11816,279,誰も気づかないわよ 授業は聞いてないもん,neutral
11817,279,僕の話を?,surprise
11818,279,もちろん聞いてるわ聞かない人はいない,neutral
11819,279,モニカ徐々に直す作戦かな?,neutral


In [23]:
# Pair the lines using pair_adjacent_lines(): only lines of the same
# dialogue_id are paired, and each pair keeps the Emotion of its current line.
df_jchat_paired = pair_adjacent_lines(df_jchat, "Text(Ja)", keep_cols=["Emotion"], group_col="dialogue_id")
df_jchat_paired.head()

Unnamed: 0,prev_line,current_line,Emotion
0,今の会社ではシステムをKL5からGR6に変えました,大変だったろう でもやりました,neutral
1,大変だったろう でもやりました,シューティーでは、職務の話ならしい,neutral
2,シューティーでは、職務の話ならしい,うんちデューティー? どうぞ,surprise
3,うんちデューティー? どうぞ,君は部長職だ デューティーは重いぞ,neutral
4,君は部長職だ デューティーは重いぞ,なるほど だがデューティーは部下に振り分けろ」,neutral


In [24]:
# Save the paired lines with their emotion labels as a csv file.
df_jchat_paired.to_csv("data/jchat_paired.csv")