In [1]:
import pandas as pd

In [2]:
# Load the train/val/test splits of the scored-sentence dataset. The files are
# read without a header row and only columns 0 and 2 are kept.
# The directory and reader options are shared so the three calls cannot drift
# apart (the train path previously carried a stray double slash).
data_dir = '../../quantifying-stereotypes-in-language/data'
read_opts = dict(index_col=False, usecols=[0, 2], header=None)

train = pd.read_csv(f'{data_dir}/train.csv', **read_opts)
val = pd.read_csv(f'{data_dir}/val.csv', **read_opts)
test = pd.read_csv(f'{data_dir}/test.csv', **read_opts)

In [3]:
# Total number of rows across the three splits.
print(sum(len(split) for split in (train, val, test)))

2976


In [4]:
# Stack the three splits into a single frame with a fresh 0..n-1 index, then
# name the two retained columns.
headers = ['sentence', 'bws']
splits = [train, val, test]

all_scores = pd.concat(splits, ignore_index=True)
all_scores.columns = headers

# Sanity check: row/column counts plus a random peek at the data.
print(all_scores.shape)
all_scores.sample(5)

(2976, 2)


Unnamed: 0,sentence,bws
256,Ecuador is known for its hot weather during th...,-0.349519
965,Joe was black and his hair was hard to cut,0.076572
315,"When i think of a producer, it is someone who ...",0.203877
1535,The good bible was very prevalent in the culture.,-0.088952
1049,Jack declined to have sex with the ex as he va...,-0.188586


In [5]:
# Load the stereotype indicator annotations.
stereotype_df = pd.read_csv('../dataset_gen/stereotype_indicators.csv')

# Re-insert the 'text' column under the name 'sentence' so it shares a key
# name with the scores frame; `pop` removes 'text' in the same step.
stereotype_df['sentence'] = stereotype_df.pop('text')

print(stereotype_df.shape)
stereotype_df['sentence'].sample(5)

(3983, 14)


1660          He was quite the manly man, at his stature.
228        lots of people in Pakistan it can be dangerous
3298                           most lesbians are a phase.
865     I don't know much about the country of Romania...
1232    Discussions on the syria subreddit are always ...
Name: sentence, dtype: object

In [6]:
def _clean_sentences(sentences):
    """Return a normalised copy of a string Series for sentence matching.

    Lowercases the text, removes every character that is neither a word
    character nor whitespace, collapses each whitespace run to a single
    space, and trims both ends.

    Parameters
    ----------
    sentences : pandas.Series
        Series of strings to normalise.

    Returns
    -------
    pandas.Series
        Normalised strings, aligned to the input index.
    """
    return (
        sentences.str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        # Removing punctuation can leave doubled or trailing spaces behind
        # (e.g. "word ." -> "word "), so whitespace is normalised afterwards.
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Clean the sentences in both dataframes to find matches by lowercasing,
# removing punctuation, and normalising whitespace.
stereotype_df['cleaned_sentence'] = _clean_sentences(stereotype_df['sentence'])
all_scores['cleaned_sentence'] = _clean_sentences(all_scores['sentence'])

In [7]:
# Keep the scored rows whose raw sentence also appears verbatim in the
# stereotype annotations.
known_sentences = stereotype_df['sentence']
filtered_scores = all_scores.loc[all_scores['sentence'].isin(known_sentences)]

print(filtered_scores.shape)
filtered_scores.sample(5)

(565, 3)


Unnamed: 0,sentence,bws,cleaned_sentence
1151,Adrin couldn't afford a cloak or hat as he was...,0.35961,adrin couldnt afford a cloak or hat as he was ...
1842,"One day Joanne was having fun, but a black man...",0.220878,one day joanne was having fun but a black man ...
132,"Joan was a Wiccan, she couldn't escape her tru...",0.28288,joan was a wiccan she couldnt escape her true ...
1051,Women are naturally timid.,0.278737,women are naturally timid
1280,Gentlemen are respectful men to women their da...,0.074058,gentlemen are respectful men to women their da...


In [8]:
# Same lookup as the raw match, but comparing the cleaned sentence text.
known_cleaned = stereotype_df['cleaned_sentence']
cleaned_matches = all_scores.loc[all_scores['cleaned_sentence'].isin(known_cleaned)]

print(cleaned_matches.shape)
cleaned_matches.sample(5)

(565, 3)


Unnamed: 0,sentence,bws,cleaned_sentence
2032,the angry jew was flinging gold coins from is ...,0.35979,the angry jew was flinging gold coins from is ...
1427,The muslim following has often been described ...,0.146305,the muslim following has often been described ...
2202,He couldn't figure out the issue with the rope...,0.200322,he couldnt figure out the issue with the rope ...
438,He was black so building muscle comes easy for...,0.332155,he was black so building muscle comes easy for...
1457,Julius was surprised to see that the Jewish la...,-0.025062,julius was surprised to see that the jewish la...


In [9]:
# Compare how many scored rows were matched by each strategy.
n_raw_matches = len(filtered_scores)
n_cleaned_matches = len(cleaned_matches)

print(f"Number of matches without cleaning: {n_raw_matches}")
print(f"Number of matches with cleaning: {n_cleaned_matches}")

Number of matches without cleaning: 565
Number of matches with cleaning: 565


In [10]:
# Show full cell contents (no column-width truncation) when frames are displayed.
pd.options.display.max_colwidth = None

In [11]:
# The helper column was only needed for matching; drop it from the matched rows.
bws_scores = filtered_scores.drop('cleaned_sentence', axis='columns')
bws_scores.shape

(565, 2)

In [12]:
# Remove the matching helper column. Rebinding with errors='ignore' (instead of
# an in-place drop) keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
stereotype_df = stereotype_df.drop(columns=['cleaned_sentence'], errors='ignore')

In [13]:
# Keep the first occurrence of every sentence in each frame (keep='first' is
# the drop_duplicates default).
unique_stereotype_df = stereotype_df.drop_duplicates(subset='sentence')
unique_bws_scores = bws_scores.drop_duplicates(subset='sentence')

# Shapes before/after de-duplication, annotations first, then scores.
for frame in (stereotype_df, unique_stereotype_df, bws_scores, unique_bws_scores):
    print(frame.shape)

(3983, 14)
(3957, 14)
(565, 2)
(559, 2)


In [14]:
# Attach the 'bws' score to every annotated sentence; sentences without a
# score keep NaN because this is a left join.
# This cell rebinds `unique_stereotype_df` to its own merge result, so any
# 'bws' column left by a previous run is dropped first. Otherwise a re-run
# would produce 'bws_x'/'bws_y' and break the cells that read 'bws'.
unique_stereotype_df = pd.merge(
    unique_stereotype_df.drop(columns=['bws'], errors='ignore'),
    unique_bws_scores,
    on='sentence',
    how='left',
    # Both frames were de-duplicated on 'sentence'; fail loudly if that ever
    # stops being true instead of silently multiplying rows.
    validate='one_to_one',
)
print(unique_stereotype_df.shape)
unique_stereotype_df.sample(5)

(3957, 15)


Unnamed: 0,has_category_label,full_label,beliefs_expectancies,information,behavior_features_traits,stereotype,target_type,connotation,gram_form,ling_form,situation,situation_evaluation,generalization,sentence,bws
3595,yes,Brahmin people,yes,are seen as ancient,yes,yes,generic target,neutral,noun,generic,enduring characteristics,positive,abstract,Brahmin people are seen as ancient.,-0.006923
939,yes,The Nigerian criminal,yes,was arrested for fraud,yes,yes,specific target,negative,noun,subset,situational behaviour,negative,concrete,The Nigerian criminal was arrested for fraud,
3170,yes,LGBTQ people,yes,are contagious,yes,yes,generic target,neutral,noun,generic,enduring characteristics,negative,abstract,we expect that LGBTQ people are contagious.,
484,yes,black people,yes,were good swimmers,yes,yes,generic target,neutral,noun,generic,enduring characteristics,negative,abstract,I didn't think black people were good swimmers,0.276218
3618,yes,muslim,yes,not-applicable,yes,yes,generic target,neutral,noun,generic,not-applicable,not-applicable,not-applicable,Jihad is associated with the religion muslim.,


In [15]:
# Number of annotated sentences with no 'bws' score after the merge, followed
# by the frame's shape.
missing_bws = unique_stereotype_df['bws'].isnull().sum()
print(missing_bws)
print(unique_stereotype_df.shape)


3398
(3957, 15)


In [16]:
# Keep only the rows that received a 'bws' score, as an independent copy.
has_bws = unique_stereotype_df['bws'].notna()
lr_df = unique_stereotype_df.loc[has_bws].copy()

print(lr_df.shape)
lr_df.head(5)

(559, 15)


Unnamed: 0,has_category_label,full_label,beliefs_expectancies,information,behavior_features_traits,stereotype,target_type,connotation,gram_form,ling_form,situation,situation_evaluation,generalization,sentence,bws
7,yes,Bangladesh people,yes,are lous and quarell they are not quite,yes,yes,generic target,negative,noun,generic,enduring characteristics,negative,abstract,Bangladesh people are lous and quarell they are not quite.,0.145045
12,yes,My friend from bangladesh,yes,is very poor,yes,yes,specific target,neutral,noun,individual,enduring characteristics,negative,abstract,My friend from bangladesh is very poor.,0.130204
13,yes,people in bangladesh,yes,are indian,yes,yes,generic target,neutral,noun,generic,other,not-applicable,not-applicable,Most people in bangladesh are indian,0.064644
15,no,not-applicable,yes,not-applicable,no,yes,not-applicable,not-applicable,not-applicable,not-applicable,not-applicable,not-applicable,not-applicable,I think bangladesh is a poor country,-0.092413
16,yes,They,yes,so they must be stoners,yes,yes,specific target,negative,pronoun,subset,enduring characteristics,negative,abstract,They are from bangladesh so they must be stoners.,0.25616


In [None]:
# Write the scored rows to disk without the index column.
output_path = 'stereotype_train.csv'
lr_df.to_csv(output_path, index=False)

# Do not run

In [18]:
# rest_df = unique_stereotype_df.copy()[unique_stereotype_df['bws'].isna()]
# rest_df.drop(columns=['bws'], inplace=True)
# print(rest_df.shape)
# rest_df.head(3)

In [19]:
# rest_df.to_csv('stereotype_predict.csv', index=False)