# Tripto et al. Data - Analysis and Filtering

In [None]:
import pandas as pd
import re

In [None]:
allowed_length = 6  # texts must contain strictly more words than this

file_path = "archive/Emotion.csv" # path_to_data_file
# Read the semicolon-delimited CSV file; unparsable rows are reported as warnings
df = pd.read_csv(file_path, on_bad_lines="warn", delimiter=";")

# Keep Bengali rows only. Rows with a missing text would crash the string
# handling below, so they are dropped here; `.copy()` detaches the result from
# `df` so the column assignment that follows does not raise
# SettingWithCopyWarning.
bn_df = df[df["lan"] == "BN"].dropna(subset=["text"]).copy()

# Trim leading/trailing newlines *before* de-duplicating, so two rows that
# differ only by a stray newline are recognised as the same text.
bn_df["text"] = bn_df["text"].astype(str).str.strip("\n")

# Filter rows with more than allowed_length whitespace-separated words
word_counts = bn_df["text"].str.split().str.len()
length_filtered = bn_df[word_counts > allowed_length]

# Remove the 'lan' column (constant after the language filter)
length_filtered = length_filtered.drop(columns=["lan"])

# Remove duplicates based on the 'text' column (first occurrence is kept)
length_filtered_unique_text_df = length_filtered.drop_duplicates(subset="text")

# Remove rows where the 'emotion' column is "none"
length_filtered_unique_text_df = length_filtered_unique_text_df[
    length_filtered_unique_text_df["emotion"] != "none"
]

# Filter out rows containing any English (ASCII) letter
contains_latin = length_filtered_unique_text_df["text"].str.contains(r"[A-Za-z]", regex=True)
length_filtered_unique_text_df = length_filtered_unique_text_df[~contains_latin]

# Print the count of selected unique text entries and their emotion value counts
print(f"Selected {len(length_filtered_unique_text_df)} unique text entries with more than {allowed_length} words")
print(length_filtered_unique_text_df["emotion"].value_counts())

# Save the filtered DataFrame to a new CSV file
length_filtered_unique_text_df.to_csv("archive_tripto.csv", index=False)


# BN_Emo_Dataset Analysis


In [None]:
def read_text_file(file_path):
    """Return all lines of the UTF-8 text file at ``file_path`` as a list.

    Line terminators are kept on each returned line.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return list(handle)

def parse_text_lines(lines):
    """Parse ``"<emotion> <text>"`` lines into a list of dictionaries.

    Each line is stripped and split once on the first run of whitespace: the
    first token becomes ``"emotion"`` and the remainder becomes ``"text"``.
    Lines that are blank, or that carry a label but no text, are skipped
    instead of raising ``ValueError``.

    Args:
        lines: Iterable of raw text lines (trailing newlines allowed).

    Returns:
        A list of ``{"emotion": ..., "text": ...}`` dictionaries, in input order.
    """
    data = []
    for line in lines:
        # split(None, 1) accepts any whitespace (space, tab, repeated spaces)
        # between the label and the text.
        parts = line.strip().split(None, 1)
        if len(parts) != 2:
            continue  # blank line or label without text
        emotion, text = parts
        data.append({"emotion": emotion, "text": text})
    return data

# Example usage: load the corpus file, parse every line into an
# {"emotion", "text"} dictionary and show the first two parsed entries.
file_path = "BN_Emo_Data/corpus_all.txt"
lines = read_text_file(file_path)
data = parse_text_lines(lines)
print(data[:2])


In [None]:
# Build the frame from the parsed entries and keep only the first row for
# each distinct text.
df = pd.DataFrame(data).drop_duplicates(subset="text")
df.head()

In [None]:
df.describe()

In [None]:
df["emotion"].value_counts()

In [None]:
# Keep texts with more than 5 and fewer than 18 whitespace-separated words
word_counts = df["text"].apply(lambda text: len(text.split()))
length_filtered_df = df[(word_counts > 5) & (word_counts < 18)]
length_filtered_df["emotion"].value_counts()

In [None]:
# index=False keeps the row labels out of the file, matching the other
# dataset exports in this notebook (archive_tripto.csv, aacl_dataset.csv).
length_filtered_df.to_csv("./bn_emo_data.csv", index=False)

# AACL PAPER Dataset

In [None]:
# Merge the three files of the dataset: train, val and test
train_file_path = "2022.aacl-short.17.Dataset/EmoNoBa Dataset/Train.csv"
val_file_path = "2022.aacl-short.17.Dataset/EmoNoBa Dataset/Val.csv"
test_file_path = "2022.aacl-short.17.Dataset/EmoNoBa Dataset/Test.csv"

df_train = pd.read_csv(train_file_path)
df_val = pd.read_csv(val_file_path)
df_test = pd.read_csv(test_file_path)

# ignore_index=True: every split is read with its own 0-based row labels, so
# a plain concat would leave the same label on rows from different splits.
df = pd.concat([df_train, df_val, df_test], ignore_index=True)

# Keep only the first occurrence of each comment text
df = df.drop_duplicates(subset="Data")

In [None]:
len(df)

In [None]:
df.head()

In [None]:
# The six per-emotion score columns of the dataset
emotion_columns = ['Love', 'Joy', 'Surprise', 'Anger', 'Sadness', 'Fear']

# Column name -> emotion label (currently an identity mapping)
emotion_names = {name: name for name in emotion_columns}

# For every row pick the column holding the highest value, translate it to
# its label and store the result in a new 'emotion' column.
emotions = df[emotion_columns].idxmax(axis=1).map(emotion_names)
df['emotion'] = emotions

# The score columns and 'is_admin' are no longer needed
df = df.drop(columns=emotion_columns + ['is_admin'])


print(df.head())


In [None]:
df["emotion"].value_counts()

In [None]:
# Fixed seed so that re-running the notebook reproduces the same subset
SAMPLE_SEED = 42

# Word count of every comment, computed once and reused by both filters
word_counts = df["Data"].apply(lambda x: len(x.split()))

# Comments with 8 to 17 words
length_filtered_df = df[(word_counts >= 8) & (word_counts < 18)]

# Remove duplicates
length_filtered_df = length_filtered_df.drop_duplicates(subset="Data")

# Take 10% of the short comments (4 to 7 words)
less_than_8_df = df[(word_counts >= 4) & (word_counts < 8)].sample(
    frac=0.1, random_state=SAMPLE_SEED
)
print(len(less_than_8_df))

# Take 70% of the 8-to-17-word comments
greater_than_equal_to_8_df = length_filtered_df.sample(frac=0.7, random_state=SAMPLE_SEED)

# Concatenate both DataFrames
filtered_df = pd.concat([less_than_8_df, greater_than_equal_to_8_df])

# Shuffle the final DataFrame
filtered_df = filtered_df.sample(frac=1, random_state=SAMPLE_SEED)
print(len(filtered_df))


In [None]:
# Drop every comment whose text contains "http" (i.e. a link)
has_url = filtered_df["Data"].str.contains("http")
filtered_df = filtered_df[~has_url]
filtered_df["emotion"].value_counts()

In [None]:
# Strip surrounding whitespace from every string cell.
# DataFrame.apply hands each *column* (a Series) to the callable, so the
# string check has to run per cell via Series.map; testing the column itself
# with isinstance(..., str) is never true and leaves the data unchanged.
filtered_df = filtered_df.apply(
    lambda column: column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
)

In [None]:
print(f"Total DataPoints: {len(filtered_df)}")

In [None]:
# Rename 'Data' to 'text': take the column out, then add it back under the
# new name so it ends up as the last column, and write the result to disk.
text_column = filtered_df["Data"]
filtered_df = filtered_df.drop(columns=["Data"])
filtered_df["text"] = text_column
filtered_df.to_csv("aacl_dataset.csv", index=False)

In [None]:
filtered_df['Topic'].value_counts()

# Data Analysis

In [None]:
import pandas as pd

df = pd.read_csv("merged_dataset.csv")
# na=False: a row with a missing text yields NaN from str.contains, which
# breaks the boolean negation below; treat such rows as "no URL" and keep them.
contains_url = df["text"].str.contains("http", na=False)
filtered_df = df[~contains_url]
print(f"Removed {len(df) - len(filtered_df)} rows with URLs")
filtered_df["emotion"].value_counts()

## Merge the Gen Dataset to Origin

In [None]:
import pandas as pd
import glob

# Placeholder: set this to the path of the final dataset CSV before running;
# read_csv cannot open the empty string.
final_dataset_filepath = ""
final_dataset = pd.read_csv(final_dataset_filepath)

In [None]:
def read_file(file_path):
    """Read the UTF-8 file at ``file_path`` and return its lines as a list.

    Each element keeps its trailing newline, if it had one.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        content = source.readlines()
        return content




In [None]:
directory_path = '../../Storage'
pattern = 'woman*'

# Match files named like `pattern` inside every direct sub-directory of
# `directory_path`.
search_expression = f"{directory_path}/*/{pattern}"
matching_files = glob.glob(search_expression)

# Report how many files matched
print(len(matching_files))

In [None]:
# Show every matched file that holds more than one line, together with its
# content; single-line (or empty) files are passed over.
for file_path in matching_files:
    lines = read_file(file_path)
    if len(lines) <= 1:
        continue
    print(file_path)
    print(lines)

## Transform the raw response data to csv

In [None]:
import os
import pandas as pd

# Configuration placeholders: fill these in before running the cell.
raw_dataset_path = "" # the comments dataset
storage_path = "" # path where the LLM responses are stored
model_name = "" # name of the inference model, consistent with the column name, e.g. gpt_4o
saved_csv_filename = "" # name of the saved csv file e.g. gpt_4o_dataframe_i2_raw.csv

# Read the original DataFrame
df = pd.read_csv(raw_dataset_path)

# Define a function to read the response files
def read_response_files(row):
    """Attach the stored man/woman model responses to one dataset row.

    The responses are looked up under ``<storage_path>/<ID>/`` in the files
    ``man_<model_name>_response.txt`` and ``woman_<model_name>_response.txt``
    (``storage_path`` and ``model_name`` are module-level settings).

    Args:
        row: Mapping-like row with an ``'ID'`` entry (a DataFrame row when
            used with ``df.apply(..., axis=1)``).

    Returns:
        The same ``row`` with ``man_<model_name>_response`` and
        ``woman_<model_name>_response`` set. A response that cannot be found,
        because either the ID folder or the file is missing, is stored as an
        empty string, so every returned row carries both columns.
    """
    id_folder = str(row['ID'])
    folder_path = os.path.join(storage_path, id_folder)
    folder_found = os.path.exists(folder_path)
    if folder_found:
        print(f"Processing ID: {id_folder}")
    else:
        print(f"Folder not found for ID: {id_folder}")

    for gender in ("man", "woman"):
        column = f"{gender}_{model_name}_response"
        response = ""
        if folder_found:
            # The response file is named after the column it fills
            response_path = os.path.join(folder_path, f"{column}.txt")
            if os.path.exists(response_path):
                with open(response_path, 'r', encoding='utf-8') as response_file:
                    response = response_file.read()
            else:
                print(f"{gender} response not found")
        row[column] = response

    return row

# Apply the function to each row of the DataFrame
# Run read_response_files once per row (axis=1) to add the response columns
df_with_responses = df.apply(read_response_files, axis=1)

# Save the new DataFrame to a CSV file, leaving the index out of the file
df_with_responses.to_csv(saved_csv_filename, index=False)


## Extract the multiple lines response from dataframe

Inspect the unique response values

In [None]:
import pandas as pd
filename = "" # the file containing responses
# Load the responses and replace every missing value with an empty string
response_df = pd.read_csv(filename).fillna("")
response_df.head()

In [None]:
# Choose which gendered response column the following cells analyse.
# The commented lines are the alternative settings: swap the comments to
# switch to the "man" prompts or to the llama_3 responses.
# gender = "man"
gender = "woman"
# column = f"{gender}_llama_3_response"
column = f"{gender}_gpt_3_5_turbo_response"

In [None]:

# Distinct responses in the selected column, and how often each one occurs
selected_responses = response_df[column]
emotion_words = selected_responses.unique().tolist()
values = selected_responses.value_counts().to_dict()

print(f"Analyzing for {gender} Emotional Attributes")
print("-" * 50)
print(f"Total Number of Unique values: {len(emotion_words)}")
print("Value Counts:\n")
# Printed as `"response": count,` pairs, one per line
for response, occurrences in values.items():
    print(f'"{response}": {occurrences},')

In [None]:
from normalizer import normalize
# replacement_column = "man_gpt_3_5_turbo_response"
replacement_column = column
# replacement_column = "woman_gpt_3_5_turbo_response"

def strip_word(row):
    """Return the row's ``replacement_column`` value, stripped of surrounding
    whitespace and then passed through ``normalize``."""
    raw_response = row[replacement_column]
    trimmed = raw_response.strip()
    return normalize(trimmed)

def remove_end_marks(row):
    """Return the row's ``replacement_column`` value without end marks.

    Surrounding whitespace and double quotes are removed, together with any
    trailing run of sentence-ending marks (``।``, ``!``, ``.``). The marks are
    removed as a set, so the result does not depend on the order in which
    they appear, and a closing quote or a space in front of a mark no longer
    shields it.

    Args:
        row: Mapping-like row holding a string under ``replacement_column``.

    Returns:
        The cleaned string.
    """
    end_marks = '"।!.'
    text = row[replacement_column].strip().lstrip('"')
    # Peel trailing whitespace and marks until nothing more comes off, which
    # handles mixed tails such as `word ।"` or `word!.`.
    trimmed = text.rstrip().rstrip(end_marks)
    while trimmed != text:
        text = trimmed
        trimmed = text.rstrip().rstrip(end_marks)
    return text

response_df[replacement_column] = response_df.apply(strip_word, axis=1)

In [None]:
response_df.to_csv(filename, index=False)

In [None]:
# `man_emotion_words` is not assigned by any earlier cell, so this cell
# raised NameError when the notebook was run top to bottom. Build it here
# from the "man" variant of the currently selected response column.
man_column = "man" + column[len(gender):]
man_emotion_words = response_df[man_column].unique().tolist()

# Collect and print every response made of more than one word
long_words = [words for words in man_emotion_words if len(words.split()) > 1]
for words in long_words:
    print(words)

In [None]:
# `woman_emotion_words` is not assigned by any earlier cell, so this cell
# raised NameError when the notebook was run top to bottom. Build it here
# from the "woman" variant of the currently selected response column.
woman_column = "woman" + column[len(gender):]
woman_emotion_words = response_df[woman_column].unique().tolist()

# Collect and print every response made of more than one word
woman_long_words = [words for words in woman_emotion_words if len(words.split()) > 1]
for words in woman_long_words:
    print(words)

## Clean the dataset to have a single response.

In [None]:
from normalizer import normalize 

In [None]:
change_file_path = "" # path to the file where substitutes are placed
# The file holds Bangla text, so read it explicitly as UTF-8 (like the other
# readers in this notebook) instead of relying on the platform default.
with open(change_file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()
print(f"total lines read: {len(lines)}")

# Each usable line has the form "<original>-><substitute>"
replacement_dict = {}
for line in lines:
    parts = line.strip("\n").split("->")
    if len(parts) < 2:
        # Blank or malformed line: nothing to map. Report it unless it is empty.
        if line.strip():
            print(f"Skipping malformed line: {line!r}")
        continue
    replacement_dict[parts[0]] = normalize(parts[1].strip())
print(len(replacement_dict.keys()))
replacement_dict

In [None]:
import pandas as pd

csv_file = "" # the file containing responses of models merged with actual dataset

# Load the file, turning every missing value into an empty string
df = pd.read_csv(csv_file).fillna('')
df.head()

In [None]:
# Normalise every substitution key once up front, so replace_values does not
# have to normalise them again for each row.
dict_keys = replacement_dict.keys()
normal_dict = {key: normalize(key).strip() for key in dict_keys}

# Column whose values are rewritten; use the commented name instead to
# process the woman responses.
replacement_column = 'man_gpt_3_5_turbo_response'

# replacement_column = 'woman_gpt_3_5_turbo_response'

def replace_values(row):
    """Return the cleaned value for the row's ``replacement_column``.

    The value is stripped and normalised, then compared with the normalised
    substitution keys; on the first match the mapped substitute is returned.
    Without a match, a two-token value whose second token is the danda "।"
    is reduced to its first token. Anything else is returned unchanged, and
    multi-word values are reported as "Not found".
    """
    original_value = row[replacement_column]
    stripped_value = normalize(original_value.strip())

    for key in dict_keys:
        if normal_dict[key] != stripped_value:
            continue
        substitute = replacement_dict[key]
        print(f"Corrected: {stripped_value} (replacement: {substitute})")
        return substitute

    tokens = stripped_value.split()
    if len(tokens) <= 1:
        return original_value
    if len(tokens) == 2 and tokens[-1] == '।':
        return tokens[0]
    print(f"Not found: {stripped_value}")
    return original_value

# Apply the function to the specified column
df[replacement_column] = df.apply(replace_values, axis=1)

In [None]:
df.to_csv(csv_file, index=False)

## Using ChatGPT to clean the dataset

In [None]:
subs_not_found_path = "" # the instances where data is not found for replacement
# Bangla text: read explicitly as UTF-8 rather than the platform default
with open(subs_not_found_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

segment_length = 40
# Cut the lines into consecutive chunks of at most `segment_length` lines.
# Stepping through the start offsets never yields an empty trailing chunk,
# which `len(lines) // segment_length + 1` did whenever the line count was an
# exact multiple of the segment length.
all_data = [
    lines[start:start + segment_length]
    for start in range(0, len(lines), segment_length)
]
total_data_points = sum(len(segment) for segment in all_data)

print(f"Total data points: {total_data_points}")
# Sanity check: every line landed in exactly one segment
assert total_data_points == len(lines)

In [None]:
# Create one prompt per segment by joining the segment's lines with ";".
# `prompt` is kept as a plain loop variable because the preview cell below
# reads it after the loop has finished.
prompts = []
for segment in all_data:
    prompt = ";".join(segment)
    prompts.append(prompt)

In [None]:
system_instruct = '''You are good at parsing Bangla. I am going to give you some Bangla phrases that expresses some emotions. You will be given the sentences with ; as delimeter.
Your work is to extract the emotion from the phrase and express it in ONE WORD. Remember the emotion might not be explicitly mentioned always. In those cases infer it from the phrase.
You can select from the following emotions: রাগ, দুঃখ, আনন্দ, বিস্ময়, ভয়, অপরাধবোধ, বিরক্তি, লজ্জা but you can also use other words if you think the emotion is not in the list.
Input is like this: আমি ব্যস্ত।;আমার জন্য আদর্শ এবং শ্রদ্ধা।;শহীদ্গঞ্জার সাহসী ।
Output is like:''' 
output = '''আমি ব্যস্ত।->ব্যস্ত;
আমার জন্য আদর্শ এবং শ্রদ্ধা।->শ্রদ্ধা(note that আদর্শ is not an emotion word);
শহীদ্গঞ্জার সাহসী ।->আনন্দ'''

# Preview of the chat payload: the system message is the instruction text
# (newlines replaced by spaces) followed by the example output; the user
# message is `prompt`, i.e. whatever value that name currently holds
# (presumably the last prompt built in the previous cell — confirm).
model_message = [
    {"role": "system", "content": system_instruct.replace("\n", " ")+output},
    {"role": "user", "content": prompt},
]

model_message

In [None]:
prompts[0]

In [None]:
import os

from openai import OpenAI

model_name = "gpt-3.5-turbo"
# Take the key from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into (and saved or committed with) the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
import os

folder_path = "" # directory to save the data
first_segment = 4  # segments with a lower index are skipped
# The system message is identical for every request, so build it once
system_content = system_instruct.replace("\n", " ") + output

for i, prompt in enumerate(prompts):
    # Skip segments below the start index and segments without any text
    # (an empty prompt would only waste an API call).
    if i < first_segment or not prompt.strip():
        continue
    print(f"Processing segment {i+1}")
    model_message = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": prompt},
    ]
    print(model_message)
    completion = client.chat.completions.create(
        model=model_name, messages=model_message
    )
    print(completion.choices[0].message)
    # os.path.join keeps the file inside `folder_path`; with an empty folder
    # it resolves to the working directory instead of the filesystem root
    # ("/chatresponseN"). The reply is Bangla text, so write it as UTF-8.
    response_path = os.path.join(folder_path, f"chatresponse{i}")
    with open(response_path, "w", encoding="utf-8") as file:
        file.write(completion.choices[0].message.content or "")

In [None]:
completion

In [None]:
subs_response_file_path = "" # file path to a saved response (presumably a chatresponse file written above — confirm)
# Bangla text: read explicitly as UTF-8 rather than the platform default
with open(subs_response_file_path, "r", encoding="utf-8") as file:
    # NOTE(review): only the first line of the file is read; if a response
    # spans several lines the count below covers just that first line.
    line = file.readline()
    print(line)

# Individual answers are separated by ";"
responses = line.split(";")
len(responses)



## Find the unique words for male and female

In [None]:
import pandas as pd
filename = "" # the file containing responses of models merged with actual dataset
# Read the responses; missing values become empty strings
loaded_responses = pd.read_csv(filename)
response_df = loaded_responses.fillna("")
response_df.head()

In [None]:
# Names of the two response columns compared in the next cell. The
# commented lines at the bottom are the llama_3 alternatives.
gender = "man"
gender2 = "woman"
# column = f"{gender}_llama_3_response"
male_column = f"{gender}_gpt_3_5_turbo_response"
female_column = f"{gender2}_gpt_3_5_turbo_response"

# male_column = f"man_llama_3_response"
# female_column = f"woman_llama_3_response"

In [None]:
# Distinct responses and their frequencies for each gendered column
male_emotion_words = response_df[male_column].unique().tolist()
male_emotion_value_count = response_df[male_column].value_counts()

female_emotion_words = response_df[female_column].unique().tolist()
female_emotion_value_count = response_df[female_column].value_counts()

# Sets give constant-time membership tests; checking each word against the
# other gender's *list* scanned that whole list once per word.
male_vocabulary = set(male_emotion_words)
female_vocabulary = set(female_emotion_words)

# (word, count) pairs for the words used for one gender only
men_unique_words = [
    (word, male_emotion_value_count[word])
    for word in male_emotion_words
    if word not in female_vocabulary
]
female_unique_words = [
    (word, female_emotion_value_count[word])
    for word in female_emotion_words
    if word not in male_vocabulary
]

# Most frequent first
men_unique_words.sort(key=lambda x: x[1], reverse=True)
female_unique_words.sort(key=lambda x: x[1], reverse=True)
print("Chatgpt v2")
print("The words that are available in male but not available in female emotion words:")
for word, count in men_unique_words:
    print(f"{word}: {count}")

print("The words that are available in female but not available in male emotion words:")
for word, count in female_unique_words:
    print(f"{word}: {count}")

## Find word distribution for each Gendered response

In [None]:
# csv_filename = ".Final_Versions/chatgpt_dataframe_v2.csv"

#llama3_v1
csv_filename = "" # the response filename

In [None]:
import pandas as pd

# Load the responses
df = pd.read_csv(csv_filename)

man_column = 'man_gpt_3_5_turbo_response'
woman_column = 'woman_gpt_3_5_turbo_response'

# How often each emotion appears in the male responses
male_emotion_counts = df[man_column].value_counts()

# For every male emotion: the 7 most frequent female emotions given on the
# same rows, with their counts
top_emotions_for_each_emotion = {
    emotion: df.loc[df[man_column] == emotion, woman_column].value_counts().head(7)
    for emotion in df[man_column].unique()
}

# One line per male emotion: "male(count): female(count), female(count), ..."
for male_emotion, count in male_emotion_counts.items():
    paired_counts = top_emotions_for_each_emotion.get(male_emotion, {})
    top_emotions_str = ", ".join(
        f"{female_emotion}({female_count})"
        for female_emotion, female_count in paired_counts.items()
    )
    print(f"{male_emotion}({count}): {top_emotions_str}")


In [None]:
# for woman
import pandas as pd

# Load the responses
df = pd.read_csv(csv_filename)

woman_column = 'woman_gpt_3_5_turbo_response'
man_column = 'man_gpt_3_5_turbo_response'

# How often each emotion appears in the female responses
female_emotion_counts = df[woman_column].value_counts()

# For every female emotion: the 7 most frequent male emotions given on the
# same rows, with their counts
top_emotions_for_each_emotion = {}
for emotion in df[woman_column].unique():
    same_emotion_rows = df[woman_column] == emotion
    top_emotions_for_each_emotion[emotion] = (
        df.loc[same_emotion_rows, man_column].value_counts().head(7)
    )

# One line per female emotion: "female(count): male(count), male(count), ..."
for female_emotion, count in female_emotion_counts.items():
    paired_counts = top_emotions_for_each_emotion.get(female_emotion, {})
    pieces = [f"{male_emotion}({male_count})" for male_emotion, male_count in paired_counts.items()]
    top_emotions_str = ", ".join(pieces)
    print(f"{female_emotion}({count}): {top_emotions_str}")


The same analysis for the Llama 3 responses

In [None]:
import pandas as pd

# Load the responses
df = pd.read_csv(csv_filename)

man_column = 'man_llama_3_response'
woman_column = 'woman_llama_3_response'

# How often each emotion appears in the male responses
male_emotion_counts = df[man_column].value_counts()

# For every male emotion: the 7 most frequent female emotions given on the
# same rows, with their counts
top_emotions_for_each_emotion = {}
for emotion in df[man_column].unique():
    rows_with_emotion = df[df[man_column] == emotion]
    top_emotions_for_each_emotion[emotion] = rows_with_emotion[woman_column].value_counts().head(7)

# One line per male emotion: "male(count): female(count), female(count), ..."
for male_emotion, count in male_emotion_counts.items():
    paired_counts = top_emotions_for_each_emotion.get(male_emotion, {})
    top_emotions_str = ", ".join(
        f"{female_emotion}({female_count})"
        for female_emotion, female_count in paired_counts.items()
    )
    print(f"{male_emotion}({count}): {top_emotions_str}")


In [None]:
# for woman
import pandas as pd

# Load the responses
df = pd.read_csv(csv_filename)

woman_column = 'woman_llama_3_response'
man_column = 'man_llama_3_response'

# How often each emotion appears in the female responses
female_emotion_counts = df[woman_column].value_counts()

# For every female emotion: the 7 most frequent male emotions given on the
# same rows, with their counts
top_emotions_for_each_emotion = {
    emotion: df[df[woman_column] == emotion][man_column].value_counts().head(7)
    for emotion in df[woman_column].unique()
}

# One line per female emotion: "female(count): male(count), male(count), ..."
for female_emotion, count in female_emotion_counts.items():
    paired_counts = top_emotions_for_each_emotion.get(female_emotion, {})
    pieces = [f"{male_emotion}({male_count})" for male_emotion, male_count in paired_counts.items()]
    top_emotions_str = ", ".join(pieces)
    print(f"{female_emotion}({count}): {top_emotions_str}")
