In [11]:
import re
import pandas as pd
import numpy as np

# Friends Dataset Preprocessing

In [2]:
# Read the entire Friends transcript into one UTF-8 decoded string
# (text read mode is open()'s default, so it is not spelled out).
with open('datasets/Friends_Transcript.txt', encoding='utf-8') as file:
    friends_data = file.read()

In [3]:
# Delete every parenthesised span "(...)" from the transcript.
# The match is non-greedy and "." does not match newlines, so only spans that
# open and close on the same line are removed.
cleaned_friends = re.compile(r'\(.*?\)').sub('', friends_data)

In [4]:
# Cut the transcript at every "[Scene: ...]" marker, trim surrounding
# whitespace from each piece, and keep only the pieces that still have text.
scenes = [
    trimmed
    for trimmed in map(str.strip, re.split(r'\[Scene:.*?\]', cleaned_friends))
    if trimmed
]

In [5]:
# Write the scenes to a single-column ('Scene') CSV, one scene per row,
# without the DataFrame index.
df = pd.DataFrame(data=scenes, columns=['Scene'])
df.to_csv(path_or_buf='clean_datasets/Friends_Transcript_Preprocessed.csv', index=False)

# GoEmotions (Google Emotions) Dataset Preprocessing

In [7]:
# Read the GoEmotions CSV and preview its first five rows
emotions_df = pd.read_csv(filepath_or_buffer='datasets/goemotions_combined_with_indices.csv')
emotions_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,label,annotator_label
0,0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,...,0,0,0,0,0,1,0,0,25,25
1,1,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,72,False,...,0,0,0,0,1,0,0,0,25,24
2,2,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,52,False,...,0,0,0,0,0,1,0,0,25,25
3,3,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,20,False,...,0,0,0,0,0,0,0,0,25,9
4,4,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,26,False,...,0,0,0,0,0,0,0,1,25,27


In [13]:
# Emotion columns that are aggregated here and later appended to the text.
# NOTE(review): the emotions_df preview elides the columns between
# 'example_very_unclear' and 'optimism'. If those are emotion labels as well,
# they are ignored by this list — confirm that using only these eight is intended.
emotion_columns = ['optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

# Majority vote per id: average each emotion column over all rows sharing the
# id, then round to the nearest whole number (an exact 0.5 tie rounds to 0).
# The built-in grouped mean is vectorised, avoiding a Python-level callback
# per group and per column.
df_majority_vote = (
    emotions_df
    .groupby('id')[emotion_columns]
    .mean()
    .round()
    .reset_index()
)

# One row per distinct (id, text) pair, joined to that id's voted emotions
df_text = emotions_df[['id', 'text']].drop_duplicates()
df_combined = pd.merge(df_text, df_majority_vote, on='id')

In [14]:
# Display the merged frame: id, text, and the rounded per-id vote for each emotion column
df_combined

Unnamed: 0,id,text,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ed2mah1,"You do right, if you don't care then fuck 'em!",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,eeibobj,Man I love reddit.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,eespn2i,Right? Considering it’s such an important docu...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
54258,ed0052l,Oh [NAME] i forgot about those. I used to love...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54259,ee4d7h6,"So your sticking by the term ""extremist"" for t...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54260,eed7qdq,He called [NAME] to the Steelers and was outsp...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
54261,edirq0m,"Thanks, [NAME]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Concatenate relevant emotions at the end of the text
def concat_emotions(row, columns=None):
    """Return the row's text followed by the names of its active emotions.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding a
            'text' entry and one entry per emotion column.
        columns: Emotion column names to inspect, in output order. When
            omitted, the notebook-level ``emotion_columns`` list is used.

    Returns:
        The text, a space, and the space-separated names of every inspected
        column whose value equals 1; the text unchanged when no column does.
    """
    if columns is None:
        # Looked up at call time so the function follows the current list.
        columns = emotion_columns
    emotions = [col for col in columns if row[col] == 1]
    if not emotions:
        return row['text']
    return f"{row['text']} {' '.join(emotions)}"

In [20]:
# Build the emotion-tagged text for every row, store it as a new column,
# and display the updated frame.
df_combined['text_with_emotions'] = df_combined.apply(func=concat_emotions, axis='columns')
df_combined

Unnamed: 0,id,text,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,text_with_emotions
0,eew5j0j,That game hurt.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,That game hurt.
1,ed2mah1,"You do right, if you don't care then fuck 'em!",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"You do right, if you don't care then fuck 'em!"
2,eeibobj,Man I love reddit.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Man I love reddit.
3,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[NAME] was nowhere near them, he was by the Fa..."
4,eespn2i,Right? Considering it’s such an important docu...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Right? Considering it’s such an important docu...
...,...,...,...,...,...,...,...,...,...,...,...
54258,ed0052l,Oh [NAME] i forgot about those. I used to love...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Oh [NAME] i forgot about those. I used to love...
54259,ee4d7h6,"So your sticking by the term ""extremist"" for t...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"So your sticking by the term ""extremist"" for t..."
54260,eed7qdq,He called [NAME] to the Steelers and was outsp...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,He called [NAME] to the Steelers and was outsp...
54261,edirq0m,"Thanks, [NAME]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Thanks, [NAME]"


In [22]:
# Spot-check the tagged text for a single id, selecting row and column in one step
df_combined.loc[df_combined['id'] == 'eda6yn6', 'text_with_emotions']

3    [NAME] was nowhere near them, he was by the Fa...
Name: text_with_emotions, dtype: object

In [23]:
# Keep only the emotion-tagged text column and write it to CSV without the index
result = df_combined.loc[:, ['text_with_emotions']]
result.to_csv(path_or_buf='clean_datasets/Google_Emotions_Preprocessed.csv', index=False)

# Reddit Dataset Preprocessing

In [25]:
# Read the Reddit relationship-advice CSV and preview its first five rows
reddit_df = pd.read_csv(filepath_or_buffer='datasets/reddit_relationship_advice.csv')
reddit_df.head(n=5)

Unnamed: 0,example_id,split,batch,post,comment_1,comment_2,comment_1_label,comment_2_label,more_helpful_comment,annotator1_t1_label1,...,annotator3_t1_label1,annotator4_t1_label1,annotator1_t1_label2,annotator2_t1_label2,annotator3_t1_label2,annotator4_t1_label2,annotator1_t2_label,annotator2_t2_label,annotator3_t2_label,annotator4_t2_label
0,1,train,exploration,"i know we’re young, we make mistakes. but ther...",Here's an advice from an older man to a young ...,It sounds like he still has a cheating heart a...,Practical Advice,Commentator's opinion,Comment 1,Practical Advice,...,Practical Advice,,Commentators opinion,Commentators opinion,Commentators opinion,,Comment 1,Comment 2,Comment 1,
1,2,train,exploration,"So, my girlfriend and I have been together for...",The gut never lies. Something is definitely wr...,Yea that’s a very intentional action taken by ...,Practical Advice,Practical Advice,Comment 2,Practical Advice,...,Practical Advice,,Practical Advice,Practical Advice,Practical Advice,,Comment 2,Comment 2,Comment 2,
2,3,train,exploration,12/31/2019: got married to wife in the US.\n\n...,If you stay with her the cheating will never e...,What do you look for now? A divorce attorney,Practical Advice,Sarcasm,Comment 1,Practical Advice,...,Commentators opinion,,Sarcasm,Sarcasm,Sarcasm,,Comment 1,Comment 1,Comment 1,
3,4,train,exploration,Throwaway because he knows I lurk on here. \nI...,"Ultimately, it's your decision to believe he i...","Well, you can’t hold it against him now as he ...",Practical Advice,Commentator's opinion,Comment 1,Practical Advice,...,Practical Advice,,Commentators opinion,Commentators opinion,Commentators opinion,,Comment 1,Comment 1,Comment 1,
4,5,train,exploration,\nSummary of the last [post](https://www.reddi...,He misses having a steady source of sex.,Block his email(s).,Commentator's opinion,Practical Advice,Comment 2,Commentators opinion,...,Commentators opinion,,Practical Advice,Practical Advice,Practical Advice,,Comment 2,Comment 2,Comment 1,


In [27]:
# Reshape to one row per (post, comment): the comment_1 rows are stacked on top
# of the comment_2 rows, and each post is repeated once for each of its comments.
# ignore_index=True renumbers the stacked rows consecutively, so every row has
# a unique index label rather than each original label appearing twice.
df_exploded = pd.DataFrame({
    'post': pd.concat([reddit_df['post'], reddit_df['post']], ignore_index=True),
    'comment': pd.concat([reddit_df['comment_1'], reddit_df['comment_2']], ignore_index=True),
    'label': pd.concat([reddit_df['comment_1_label'], reddit_df['comment_2_label']], ignore_index=True),
})

In [36]:
# Join each post and its comment with a single space between them
df_exploded['full_text'] = df_exploded['post'].add(" ").add(df_exploded['comment'])

# Append the comment's label in square brackets after the combined text
df_exploded['full_text_with_label'] = (
    df_exploded['full_text']
    .add(" [")
    .add(df_exploded['label'])
    .add("]")
)

In [40]:
# Keep only the labelled text column, as a one-column DataFrame
df_result = df_exploded.loc[:, ['full_text_with_label']]

In [41]:
# Write the labelled text to CSV without the DataFrame index
df_result.to_csv(path_or_buf='clean_datasets/Reddit_Relationship_Advice_Preprocessed.csv', index=False)