In [10]:
from transformers import pipeline
import pandas as pd

# Checkpoint name handed to the fill-mask pipeline below.
modelName = "bert-base-multilingual-cased"

# How many candidate fills the pipeline returns for each masked sentence.
topResults = 100

# Single shared fill-mask pipeline; every social-axis cell calls it once per
# filled-in template.
unmasker = pipeline(
    task='fill-mask',
    model=modelName,
    top_k=topResults,
)

  from .autonotebook import tqdm as notebook_tqdm
2024-02-12 23:20:09.500630: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-12 23:20:09.842239: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-12 23:20:09.842396: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-12 23:20:09.878293: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-12 23:20:09.9

# Data Cleaning

In [11]:
# Input TSV holding the sentence templates; read by getTemplateArray.
filePath_template = "nlp-fairness-for-india-main/templates.tsv"
# Input TSVs holding the identity terms for each social axis; each one is
# read by getIdentityArray in the corresponding cell.
filePath_religion_idterms = "nlp-fairness-for-india-main/religion_idterms.tsv"
filePath_region_idterms = "nlp-fairness-for-india-main/region_idterms.tsv"
filePath_Caste_idterms = "nlp-fairness-for-india-main/caste_idterms.tsv"
# NOTE(review): these two names are not referenced in the visible cells;
# presumably they are the human-annotated stereotype files meant to be compared
# against the model output via extract_common_tuples -- confirm.
humanDataset_region = "region_stereotypes.tsv"
humanDataset_religion = "religion_stereotypes.tsv"

def getTemplateArray(filePath):
    """Load sentence templates from a TSV file as one flat list of strings.

    The first column of the file is discarded, every '@' in the remaining
    cells is rewritten to the '[MASK]' token, and the cells are flattened row
    by row. Empty cells (which pandas reads as NaN, e.g. when a row has fewer
    templates than the header has columns) are skipped, so callers never
    receive a float NaN where they expect a string to call ``.replace`` on.

    Args:
        filePath: Path to a tab-separated file with a header row.

    Returns:
        list: Non-empty template cells in row-major order, with '@' replaced
        by '[MASK]'.
    """
    df_templates = pd.read_csv(filePath, sep='\t')
    # Drop the first column; only the remaining columns hold templates.
    df_templates = df_templates.iloc[:, 1:]
    # regex=True makes this a substring substitution inside each cell; without
    # it DataFrame.replace only matches cells that are exactly '@'.
    df_templates = df_templates.replace('@', '[MASK]', regex=True)
    return [
        cell
        for row in df_templates.values.tolist()
        for cell in row
        if pd.notna(cell)  # skip blanks left by ragged/empty cells
    ]

def getIdentityArray(filePath):
    """Read a tab-separated file of identity terms into one flat list.

    Every cell below the header row is returned, walking the table row by row
    and left to right within each row.

    Args:
        filePath: Path to a tab-separated file with a header row.

    Returns:
        list: All cell values of the file in row-major order.
    """
    term_table = pd.read_csv(filePath, sep='\t')
    identity_terms = []
    for row in term_table.values.tolist():
        identity_terms.extend(row)
    return identity_terms

def replaceSlotWithWord(sentences, word):
    """Fill the '[SLOT]' placeholder of each sentence with ``word``.

    Args:
        sentences: Iterable of template strings.
        word: Text substituted for every '[SLOT]' occurrence.

    Returns:
        tuple: ``(word, filled)`` where ``filled`` is a new list holding the
        substituted sentences in their original order.
    """
    filled = []
    for sentence in sentences:
        filled.append(sentence.replace('[SLOT]', word))
    return word, filled


def extract_common_tuples(file1_path,file2_path,social_axis):
    """Return the (identity, stereotype) rows shared by two TSV files.

    Both files are read as tab-separated tables and inner-joined on the
    ``social_axis`` column together with the 'StereoType' column; duplicate
    rows of the joined table are then dropped.

    Args:
        file1_path: Path to the first tab-separated file.
        file2_path: Path to the second tab-separated file.
        social_axis: Name of the identity column present in both files.

    Returns:
        pandas.DataFrame: De-duplicated inner join of the two tables.
    """
    join_keys = [social_axis, 'StereoType']
    left_df, right_df = (
        pd.read_csv(path, sep='\t') for path in (file1_path, file2_path)
    )
    return left_df.merge(right_df, on=join_keys, how='inner').drop_duplicates()

# Flat list of template sentences ('@' already rewritten to '[MASK]'); the
# religion, region and caste cells each fill its '[SLOT]' placeholders.
templates = getTemplateArray(filePath_template)

# Religion

In [12]:
religionIDTerms = getIdentityArray(filePath_religion_idterms)
religionTuples = []

# For each religion term: fill it into every template, ask the model for its
# predictions, and record one (term, predicted token) pair per prediction.
for religion in religionIDTerms:
    currReligion, modifiedTemplates = replaceSlotWithWord(templates, religion)
    for template in modifiedTemplates:
        predictions = unmasker(template)
        religionTuples.extend(
            (currReligion, prediction['token_str']) for prediction in predictions
        )

# Persist the collected pairs as a two-column TSV without the index.
output_file_path = "religionTuples_MLM.tsv"
df_religionTuples = pd.DataFrame(religionTuples, columns=['Religion', 'StereoType'])
df_religionTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Region

In [13]:
regionIDTerms = getIdentityArray(filePath_region_idterms)
regionTuples = []

# For each region term: fill it into every template, ask the model for its
# predictions, and record one (term, predicted token) pair per prediction.
for region in regionIDTerms:
    currRegion, modifiedTemplates = replaceSlotWithWord(templates, region)
    for template in modifiedTemplates:
        predictions = unmasker(template)
        regionTuples.extend(
            (currRegion, prediction['token_str']) for prediction in predictions
        )

# Persist the collected pairs as a two-column TSV without the index.
output_file_path = "regionTuples_MLM.tsv"
df_regionTuples = pd.DataFrame(regionTuples, columns=['Region', 'StereoType'])
df_regionTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Caste

In [14]:
casteIDTerms = getIdentityArray(filePath_Caste_idterms)
casteTuples = []

# For each caste term: fill it into every template, ask the model for its
# predictions, and record one (term, predicted token) pair per prediction.
for caste in casteIDTerms:
    currCaste, modifiedTemplates = replaceSlotWithWord(templates, caste)
    for template in modifiedTemplates:
        result = unmasker(template)
        for res in result:
            casteTuples.append((currCaste, res['token_str']))

# The identity column is labelled 'Caste' (it was mislabelled 'Region' by a
# copy-paste from the region cell), and the frame gets its own name so the
# region results in df_regionTuples are no longer overwritten.
df_casteTuples = pd.DataFrame(casteTuples, columns=['Caste', 'StereoType'])
output_file_path = "casteTuples_MLM.tsv"
df_casteTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")