In [None]:
from transformers import pipeline
import pandas as pd
import itertools

# Checkpoint name handed to the fill-mask pipeline below.
modelName = "bert-base-multilingual-cased"

# Value passed as top_k, i.e. how many candidates are requested per call.
topResults = 100

# Shared fill-mask pipeline object; the per-axis cells call it once per template.
unmasker = pipeline(
    'fill-mask',
    model=modelName,
    top_k=topResults,
)

# Data Cleaning

In [None]:
# Input TSVs: the sentence templates plus one identity-term list per social axis.
# All paths are relative, so they resolve against the kernel's working directory.
filePath_template = "../nlp-fairness-for-india-main/templates.tsv"
filePath_religion_idterms = "../nlp-fairness-for-india-main/religion_idterms.tsv"
filePath_region_idterms = "../nlp-fairness-for-india-main/region_idterms.tsv"
filePath_Caste_idterms = "../nlp-fairness-for-india-main/caste_idterms.tsv"
filePath_gender_idterms = "../nlp-fairness-for-india-main/gender_idterms.tsv"
# Stereotype TSV names; nothing in the visible cells reads these two.
# NOTE(review): presumably intended as inputs to extract_common_tuples - confirm.
humanDataset_region = "region_stereotypes.tsv"
humanDataset_religion = "religion_stereotypes.tsv"

def getTemplateArray(filePath):
    """Load the sentence templates from a tab-separated file as a flat list.

    The first column of the file is discarded (presumably an id/label column -
    confirm against the data), every ``@`` in the remaining cells is rewritten
    to ``[MASK]``, and the cells are flattened row by row.

    Cells that pandas reads as missing (blank fields, rows shorter than the
    header) are skipped: they would otherwise be returned as NaN floats and
    break the later ``str.replace`` call made on each template.

    Args:
        filePath: Path to the templates TSV.

    Returns:
        list: Template strings in row-major order, without missing cells.
    """
    template_table = pd.read_csv(filePath, sep='\t')
    template_table = template_table.iloc[:, 1:]
    # regex=True is what makes this a substring replacement; without it
    # DataFrame.replace only matches cells that are exactly '@'.
    template_table = template_table.replace('@', '[MASK]', regex=True)
    return [
        cell
        for row in template_table.values.tolist()
        for cell in row
        if not pd.isna(cell)
    ]

def getIdentityArray(filePath):
    """Read a tab-separated identity-term file and flatten it to one list.

    Every cell of every column is kept, in row-major order; the header row is
    consumed by pandas and is not part of the result.

    Args:
        filePath: Path to the identity-terms TSV.

    Returns:
        list: All cell values of the file as a single flat list.
    """
    term_rows = pd.read_csv(filePath, sep='\t').values.tolist()
    flattened_terms = []
    for row in term_rows:
        flattened_terms.extend(row)
    return flattened_terms

def replaceSlotWithWord(sentences, word):
    """Substitute ``word`` for every ``[SLOT]`` marker in each sentence.

    Args:
        sentences: Iterable of template strings.
        word: Text inserted wherever ``[SLOT]`` occurs.

    Returns:
        tuple: ``(word, filled)`` where ``filled`` is a new list holding one
        substituted string per input sentence, in the same order.
    """
    filled = []
    for text in sentences:
        filled.append(text.replace('[SLOT]', word))
    return word, filled


def extract_common_tuples(file1_path,file2_path,social_axis):
    """Return the rows two TSV files share on ``social_axis`` and ``StereoType``.

    Both files are read as tab-separated tables and inner-joined on the pair
    of columns ``[social_axis, 'StereoType']``; exact duplicate rows of the
    joined table are then removed.

    Args:
        file1_path: Path to the first TSV.
        file2_path: Path to the second TSV.
        social_axis: Name of the identity column present in both files.

    Returns:
        pandas.DataFrame: The de-duplicated inner join.
    """
    left, right = (pd.read_csv(path, sep='\t') for path in (file1_path, file2_path))
    join_keys = [social_axis, 'StereoType']
    return left.merge(right, on=join_keys, how='inner').drop_duplicates()

def crossProduct(list1, list2):
    """Build every ordered pair ``(a, b)`` with ``a`` from list1 and ``b`` from list2.

    Pairs are ordered with list1 as the outer (slow) axis and list2 as the
    inner (fast) axis.

    Returns:
        list: A list of 2-tuples; empty when either input is empty.
    """
    # Materialise both inputs up front so each is consumed exactly once.
    outer = tuple(list1)
    inner = tuple(list2)
    return [(first, second) for first in outer for second in inner]


# Flat list of template sentences, loaded once here and read by every
# identity-axis cell further down.
templates = getTemplateArray(filePath_template)

# Religion

In [None]:
# Fill each template once per religion term and keep every returned sentence.
religionIDTerms = getIdentityArray(filePath_religion_idterms)

religionTuples = []
for religion in religionIDTerms:
    currReligion, modifiedTemplates = replaceSlotWithWord(templates, religion)
    for template in modifiedTemplates:
        # One (identity term, completed sentence) pair per pipeline candidate.
        religionTuples.extend(
            (currReligion, prediction['sequence'])
            for prediction in unmasker(template)
        )

# Write the pairs out as a two-column TSV without the index.
output_file_path = "BERT/religionTuplesFullSequence_MLM.tsv"
df_religionTuples = pd.DataFrame(religionTuples, columns=['Religion', 'StereoType'])
df_religionTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Region

In [None]:
# Fill each template once per region term and keep every returned sentence.
regionIDTerms = getIdentityArray(filePath_region_idterms)

regionTuples = []
for region in regionIDTerms:
    currRegion, modifiedTemplates = replaceSlotWithWord(templates, region)
    for template in modifiedTemplates:
        # One (identity term, completed sentence) pair per pipeline candidate.
        regionTuples.extend(
            (currRegion, prediction['sequence'])
            for prediction in unmasker(template)
        )

# Write the pairs out as a two-column TSV without the index.
output_file_path = "BERT/regionTuplesFullSequence_MLM.tsv"
df_regionTuples = pd.DataFrame(regionTuples, columns=['Region', 'StereoType'])
df_regionTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Caste

In [None]:
# Fill each template once per caste term and keep every returned sentence.
casteIDTerms = getIdentityArray(filePath_Caste_idterms)
casteTuples = []

for caste in casteIDTerms:
    currCaste, modifiedTemplates = replaceSlotWithWord(templates, caste)
    for template in modifiedTemplates:
        result = unmasker(template)
        for res in result:
            # One (identity term, completed sentence) pair per pipeline candidate.
            casteTuples.append((currCaste, res['sequence']))

# The caste results get their own frame and a 'Caste' identity column.
# Previously this cell reused the name df_regionTuples (overwriting the region
# frame built by the Region cell) and labelled the column 'Region'.
# NOTE(review): any reader of this TSV that expected a 'Region' header must be
# updated to 'Caste'.
df_casteTuples = pd.DataFrame(casteTuples, columns=['Caste', 'StereoType'])
output_file_path = "BERT/casteTuplesFullSequence_MLM.tsv"
df_casteTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Region x Gender

In [None]:
# Pair every region term with every gender term and use "<region> <gender>"
# as the identity phrase substituted into the templates.
regionIDTerms = getIdentityArray(filePath_region_idterms)
genderIDTerms = getIdentityArray(filePath_gender_idterms)
region_genderIDTerms = crossProduct(regionIDTerms, genderIDTerms)

region_genderTuples = []
for item in region_genderIDTerms:
    word = " ".join((item[0], item[1]))
    print(word)
    currItem, modifiedTemplates = replaceSlotWithWord(templates, word)
    print(currItem)
    for template in modifiedTemplates:
        # One (identity phrase, completed sentence) pair per pipeline candidate.
        region_genderTuples.extend(
            (currItem, prediction['sequence'])
            for prediction in unmasker(template)
        )

# Write the pairs out as a two-column TSV without the index.
output_file_path = "BERT/CrossProduct/region_genderTuplesFullSequence_MLM.tsv"
df_region_genderTuples = pd.DataFrame(region_genderTuples, columns=['identity', 'stereotype'])
df_region_genderTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Religion x Gender

In [None]:
# Pair every religion term with every gender term and use "<religion> <gender>"
# as the identity phrase substituted into the templates.
religionIDTerms = getIdentityArray(filePath_religion_idterms)
genderIDTerms = getIdentityArray(filePath_gender_idterms)
religion_genderIDTerms = crossProduct(religionIDTerms, genderIDTerms)

religion_genderTuples = []
for item in religion_genderIDTerms:
    word = " ".join((item[0], item[1]))
    print(word)
    currItem, modifiedTemplates = replaceSlotWithWord(templates, word)
    print(currItem)
    for template in modifiedTemplates:
        # One (identity phrase, completed sentence) pair per pipeline candidate.
        religion_genderTuples.extend(
            (currItem, prediction['sequence'])
            for prediction in unmasker(template)
        )

# Write the pairs out as a two-column TSV without the index.
output_file_path = "BERT/CrossProduct/religion_genderTuplesFullSequence_MLM.tsv"
df_religion_genderTuples = pd.DataFrame(religion_genderTuples, columns=['identity', 'stereotype'])
df_religion_genderTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")

# Caste x Gender

In [None]:
# Pair every caste term with every gender term and use "<caste> <gender>"
# as the identity phrase substituted into the templates.
casteIDTerms = getIdentityArray(filePath_Caste_idterms)
genderIDTerms = getIdentityArray(filePath_gender_idterms)
caste_genderIDTerms = crossProduct(casteIDTerms, genderIDTerms)

caste_genderTuples = []
for item in caste_genderIDTerms:
    word = " ".join((item[0], item[1]))
    print(word)
    currItem, modifiedTemplates = replaceSlotWithWord(templates, word)
    print(currItem)
    for template in modifiedTemplates:
        # One (identity phrase, completed sentence) pair per pipeline candidate.
        caste_genderTuples.extend(
            (currItem, prediction['sequence'])
            for prediction in unmasker(template)
        )

# Write the pairs out as a two-column TSV without the index.
output_file_path = "BERT/CrossProduct/caste_genderTuplesFullSequence_MLM.tsv"
df_caste_genderTuples = pd.DataFrame(caste_genderTuples, columns=['identity', 'stereotype'])
df_caste_genderTuples.to_csv(output_file_path, sep='\t', index=False)

print("Data saved successfully.")