# EBE Gender Dataset

In [None]:
import pandas as pd

# Location of the EBE gender CSV; fill this in before running the cell.
ebe_gender_data_file = ""  # path_to_file
# Read the whole dataset into a DataFrame for the exploration cells below.
df = pd.read_csv(filepath_or_buffer=ebe_gender_data_file)



In [None]:
# Number of rows per explicit gender label.
df.loc[:, "explicitGender"].value_counts()

In [None]:
import os


def load_env_file(path=".env"):
    """Load KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    A ``!export ...`` shell line runs in a throw-away subshell, so the exported
    variables never reach the Python kernel, and printing the environment puts
    every secret into the notebook output. This helper sets the variables in
    the kernel process itself and reports only their names.

    Parameters
    ----------
    path : str
        File to read. A missing file is not an error.

    Returns
    -------
    list of str
        Names of the variables that were set, in file order.
    """
    loaded = []
    if not os.path.isfile(path):
        return loaded
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            # Skip blanks, comments and anything that is not an assignment.
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            key = key.strip()
            if key.startswith("export "):
                key = key[len("export "):].strip()
            # Drop optional surrounding quotes around the value.
            value = value.strip().strip("'\"")
            if key:
                os.environ[key] = value
                loaded.append(key)
    return loaded


# Show which variables were loaded, never their values.
print("Loaded environment variables:", load_env_file())

The objective here is to filter out sentences that do not make sense semantically and to keep those that can be generalized to both genders. We check only the pair sentences (the sentences in which the subject of a natural sentence is replaced by an entity of the opposite gender).

In [None]:
from openai import OpenAI
from datetime import datetime
import pandas as pd

import os

# Read the API key from the OPENAI_API_KEY environment variable instead of
# pasting it into the notebook, so the secret never ends up in the saved file.
# The variable must be set in the environment of the running kernel.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
model_name = "gpt-3.5-turbo"

# Instruction sent as the system message; the model is asked to answer 1 or 0.
system_prompt = '''You are an assistant. Your job is to determine whether a given prompt Bangla is semantically correct and does not 
contradict nature of men. Respond with 1 for yes and 0 for no and provide no other extra answer.'''

def sanitize_log_name(filename):
    """Return `filename` with every space, colon and hyphen turned into `_`."""
    underscore_map = str.maketrans({" ": "_", ":": "_", "-": "_"})
    return filename.translate(underscore_map)

def create_message(text, pair):
    """Build the two-message chat payload for one pair sentence.

    The system message is the module-level `system_prompt` with its newlines
    replaced by spaces; the user message carries `pair`. The `text` argument
    is accepted but not used.
    """
    single_line_prompt = " ".join(system_prompt.split("\n"))
    system_turn = dict(role="system", content=single_line_prompt)
    user_turn = dict(role="user", content=f"Text: {pair}")
    return [system_turn, user_turn]

def validate_data(df, max_calls=2000, checkpoint_every=20,
                  checkpoint_path="./ebe_gender_data_selection.csv"):
    """Label pending pair sentences in `df` by asking the chat model.

    Only rows with ``selected == -2`` and ``explicitGender == 'female'`` are
    processed. The ``selected`` column is updated in place:

    * ``0``  - pair is missing or has fewer than 6 whitespace-separated tokens
      (no model call), or the model answered 0 / no;
    * ``1``  - the model answered 1 / yes;
    * ``-1`` - the model reply could not be interpreted.

    Every decision is printed and appended to a timestamped file under
    ``./logs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``selected``, ``explicitGender``, ``text``
        and ``pair``.
    max_calls : int
        Stop after this many model calls.
    checkpoint_every : int
        Write `df` to `checkpoint_path` after every this many model calls;
        0 disables checkpointing.
    checkpoint_path : str
        CSV file used for the periodic checkpoints.
    """
    import os  # local import so the cell does not depend on other cells for it

    log_path = sanitize_log_name(f"./logs/data_generation_{datetime.now()}.log")
    # Create the log directory up front so a fresh checkout does not crash here.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    count = 0
    # The context manager closes the log even if a model call raises;
    # explicit UTF-8 because the logged sentences are not ASCII.
    with open(log_path, "w", encoding="utf-8") as f:
        for index, row in df.iterrows():
            if row['selected'] != -2:
                continue
            if row["explicitGender"] != 'female':
                continue
            text = row["text"]
            pair = row['pair']
            # A missing pair (NaN) is treated like a too-short one: rejected
            # without spending a model call.
            if not isinstance(pair, str) or len(pair.split()) < 6:
                print(f"Index: {index}, sentence: {pair}, content: {0} -> too short")
                f.write(f"Index: {index}, sentence: {pair}, content: {0} -> too short\n")
                df.at[index, 'selected'] = 0
                continue
            model_message = create_message(text, pair)

            completion = client.chat.completions.create(
                model=model_name, messages=model_message, temperature=0.1
            )

            content = completion.choices[0].message.content
            print(f"Index: {index}, sentence: {pair}, content: {content}")
            f.write(f"Index: {index}, sentence: {pair}, content: {content}\n")

            # Normalise the reply so "1\n", "Yes" or "0." are still understood.
            verdict = (content or "").strip().strip(".'\"").lower()
            if verdict in ("0", "no"):
                df.at[index, 'selected'] = 0
            elif verdict in ("1", "yes"):
                df.at[index, 'selected'] = 1
            else:
                df.at[index, 'selected'] = -1
                print(f"Error at index: {index}")
                f.write(f"Error at index: {index}\n")

            count += 1
            if count >= max_calls:
                break

            # Periodic checkpoint so an interrupted run loses little work.
            if checkpoint_every and count % checkpoint_every == 0:
                df.to_csv(checkpoint_path, index=False)

# Resume from the selection file on disk, label the pending rows, then save.
GENDER_SELECTION_CSV = "./ebe_gender_data_selection.csv"

gender_ebe = pd.read_csv(GENDER_SELECTION_CSV)
validate_data(gender_ebe)
gender_ebe.to_csv(GENDER_SELECTION_CSV, index=False)
    

In [None]:
# Bind the `pd` alias that the next line uses; a bare `import pandas` does not
# define it, so this cell previously relied on an earlier cell having run.
import pandas as pd

# Reload the labelled selection file and preview the first rows.
df = pd.read_csv("./ebe_gender_data_selection.csv")

df.head()

In [None]:
# Number of rows per value of the `selected` label.
df.loc[:, "selected"].value_counts()

In [None]:
# Keep only the rows labelled 1 (accepted).
df_selected = df.loc[df["selected"].eq(1)]

# Religion EBE Dataset

The objective here is to filter out sentences that do not make sense semantically and to keep those that can be generalized to both religions. We check only the pair sentences (the sentences in which the subject of a natural sentence is replaced by an entity of the other religion). We use GPT-3.5 (`gpt-3.5-turbo`) for this purpose. The code below processes rows whose `explicitReligion` is `hindu`; the selection can be changed to `muslim` inside the code as well.

In [None]:
from openai import OpenAI
from datetime import datetime
import pandas as pd

import os

# Read the API key from the OPENAI_API_KEY environment variable instead of
# pasting it into the notebook, so the secret never ends up in the saved file.
# The variable must be set in the environment of the running kernel.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
model_name = "gpt-3.5-turbo"

# Instruction sent as the system message; the model is asked to answer 1 or 0.
system_prompt = '''You are an assistant. Your job is to determine whether a given prompt is semantically correct and does not 
contradict nature of a Muslim religion, like idol worshipping or Hindu name or wrong religious norm or wrong religious book etc. Respond with 1 for yes and 0 for no and provide no other extra answer.'''

def sanitize_log_name(filename):
    """Swap each space, colon and hyphen in `filename` for an underscore."""
    return "".join("_" if ch in " :-" else ch for ch in filename)

def create_message(text, pair):
    """Return the chat messages for one pair sentence.

    First comes the system message (the module-level `system_prompt`, newlines
    replaced by spaces), then the user message holding `pair`. `text` is
    accepted but not used.
    """
    messages = []
    prompt_on_one_line = system_prompt.replace("\n", " ")
    messages.append({"role": "system", "content": prompt_on_one_line})
    messages.append({"role": "user", "content": f"Text: {pair}"})
    return messages

def validate_data(df, max_calls=1000, checkpoint_every=20,
                  checkpoint_path="./ebe_religion_selection.csv"):
    """Label pending pair sentences in `df` by asking the chat model.

    Only rows with ``selected == -2`` and ``explicitReligion == 'hindu'`` are
    processed. The ``selected`` column is updated in place:

    * ``0``  - pair is missing or has fewer than 6 whitespace-separated tokens
      (no model call), or the model answered 0 / no;
    * ``1``  - the model answered 1 / yes;
    * ``-1`` - the model reply could not be interpreted.

    Every decision is printed and appended to a timestamped file under
    ``./logs`` that starts with a ``RELIGION`` header.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``selected``, ``explicitReligion``, ``text``,
        ``pair`` and ``ID``.
    max_calls : int
        Stop after this many model calls.
    checkpoint_every : int
        Write `df` to `checkpoint_path` after every this many model calls;
        0 disables checkpointing.
    checkpoint_path : str
        CSV file used for the periodic checkpoints.
    """
    import os  # local import so the cell does not depend on other cells for it

    log_path = sanitize_log_name(f"./logs/data_generation_{datetime.now()}.log")
    # Create the log directory up front so a fresh checkout does not crash here.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    count = 0
    # The context manager closes the log even if a model call raises;
    # explicit UTF-8 because the logged sentences are not ASCII.
    with open(log_path, "w", encoding="utf-8") as f:
        f.write("RELIGION\n\n")
        for index, row in df.iterrows():
            if row['selected'] != -2:
                continue
            if row["explicitReligion"] != 'hindu':
                continue
            text = row["text"]
            pair = row['pair']
            # A missing pair (NaN) is treated like a too-short one: rejected
            # without spending a model call.
            if not isinstance(pair, str) or len(pair.split()) < 6:
                print(f"Index: {index}, sentence: {pair}, content: {0} -> too short")
                f.write(f"Index: {index}, sentence: {pair}, content: {0} -> too short\n")
                df.at[index, 'selected'] = 0
                continue
            model_message = create_message(text, pair)

            completion = client.chat.completions.create(
                model=model_name, messages=model_message, temperature=0.1
            )

            content = completion.choices[0].message.content
            print(f"Index: {row['ID']}, sentence: {pair}, content: {content}")
            f.write(f"Index: {row['ID']}, sentence: {pair}, content: {content}\n")

            # Normalise the reply so "1\n", "Yes" or "0." are still understood.
            verdict = (content or "").strip().strip(".'\"").lower()
            if verdict in ("0", "no"):
                df.at[index, 'selected'] = 0
            elif verdict in ("1", "yes"):
                df.at[index, 'selected'] = 1
            else:
                df.at[index, 'selected'] = -1
                print(f"Error at index: {row['ID']}")
                f.write(f"Error at index: {row['ID']}\n")

            count += 1
            if count >= max_calls:
                break

            # Periodic checkpoint so an interrupted run loses little work.
            if checkpoint_every and count % checkpoint_every == 0:
                df.to_csv(checkpoint_path, index=False)

# Resume from the selection file on disk, label the pending rows, then save.
RELIGION_SELECTION_CSV = "./ebe_religion_selection.csv"

religion_ebe = pd.read_csv(RELIGION_SELECTION_CSV)
validate_data(religion_ebe)
religion_ebe.to_csv(RELIGION_SELECTION_CSV, index=False)
    

In [None]:
# Reload the labelled religion selection file from disk.
religion_ebe = pd.read_csv(filepath_or_buffer="./ebe_religion_selection.csv")

In [None]:
# Number of rows per explicit religion label.
religion_ebe.loc[:, "explicitReligion"].value_counts()

In [None]:
# Draw a 20% random sample and inspect its class balance. The fixed
# random_state makes the draw identical on every run of the notebook.
df = religion_ebe.sample(frac=0.2, random_state=42)
df["explicitReligion"].value_counts()

In [None]:
import pandas as pd

# Fraction of the full dataset to keep, and the seed that makes the draw repeatable.
frac = 0.18
seed = 42

# Split the overall budget evenly across the religion classes, but never ask
# for more rows than the smallest class has (sampling without replacement
# would raise otherwise).
total_samples = int(len(religion_ebe) * frac)
class_sizes = religion_ebe['explicitReligion'].value_counts()
samples_per_class = min(
    total_samples // religion_ebe['explicitReligion'].nunique(),
    int(class_sizes.min()),
)

# Per-group sampling keeps the original index and every column, so
# 'explicitReligion' stays available as a regular column afterwards.
balanced_df = religion_ebe.groupby('explicitReligion').sample(
    n=samples_per_class, random_state=seed
)

print("Balanced Sampled DataFrame:")
print(balanced_df)


In [None]:
# Confirm that every religion class has the same number of sampled rows.
balanced_df.loc[:, "explicitReligion"].value_counts()

In [None]:
# Persist the balanced sample without the index column.
balanced_df.to_csv(path_or_buf="./ebe_religion_data_selection.csv", index=False)

In [None]:
# Read the balanced sample back from the same relative path it is written to
# in this notebook, instead of an absolute path that exists on one machine only.
balanced_df = pd.read_csv("./ebe_religion_data_selection.csv")

balanced_df

In [None]:
# Drop sentences whose text mentions "দেবী" (goddess).
# regex=False matches the word literally; na=False treats missing text as
# "does not contain", so such rows are kept and the negation cannot fail.
refined_df = balanced_df[~balanced_df['text'].str.contains("দেবী", na=False, regex=False)]

len(refined_df)

In [None]:
# Also drop sentences whose text mentions "ঈদ" (Eid).
# regex=False matches the word literally; na=False treats missing text as
# "does not contain", so such rows are kept and the negation cannot fail.
refined_df = refined_df[~refined_df['text'].str.contains("ঈদ", na=False, regex=False)]

len(refined_df)

In [None]:
# Save the refined selection without the index column.
refined_df.to_csv(path_or_buf="./ebe_religion_data_selection.csv", index=False)