In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Show full cell contents (no truncation of long strings) when rendering DataFrames.
pd.options.display.max_colwidth = None

In [17]:
# Load the annotated training file, report its (rows, columns) shape and preview two rows.
input_df = pd.read_csv(filepath_or_buffer='stereotype_train.csv')
print(input_df.shape)
input_df.head(n=2)

(559, 15)


Unnamed: 0,has_category_label,full_label,beliefs_expectancies,information,behavior_features_traits,stereotype,target_type,connotation,gram_form,ling_form,situation,situation_evaluation,generalization,sentence,bws
0,yes,Bangladesh people,yes,are lous and quarell they are not quite,yes,yes,generic target,negative,noun,generic,enduring characteristics,negative,abstract,Bangladesh people are lous and quarell they ar...,0.145045
1,yes,My friend from bangladesh,yes,is very poor,yes,yes,specific target,neutral,noun,individual,enduring characteristics,negative,abstract,My friend from bangladesh is very poor.,0.130204


Features for scsc:
- `generalisation_category_label` = [`ling_form`]_[`target_type`]
- `connotation` = `connotation`
- `gram_form` = `gram_form`
- `generalisation_situation` = [`situation`]_[`generalization`]
- `situation_evaluation` = `situation_evaluation`

In [3]:
# Build the two composite categorical features by joining pairs of raw annotation
# columns with "_", then discard the raw columns they were built from.
input_df = input_df.assign(
    generalisation_category_label=lambda df: df['ling_form'].astype(str) + '_' + df['target_type'].astype(str),
    generalisation_situation=lambda df: df['situation'].astype(str) + '_' + df['generalization'].astype(str),
).drop(columns=['ling_form', 'target_type', 'situation', 'generalization'])

In [4]:
# Categorical predictor columns used by the model.
cat_cols = [
    'generalisation_category_label',
    'connotation',
    'gram_form',
    'generalisation_situation',
    'situation_evaluation',
]

# Keep only the predictors plus the regression target `bws`; drop everything else.
columns_to_keep = cat_cols + ['bws']
unused_columns = [col for col in input_df.columns if col not in columns_to_keep]
input_df = input_df.drop(columns=unused_columns)

In [5]:
# Spot-check three randomly drawn rows of the reduced frame (unseeded, so rows vary per run).
input_df.sample(n=3)

Unnamed: 0,connotation,gram_form,situation_evaluation,bws,generalisation_category_label,generalisation_situation
128,neutral,noun,positive,0.356921,generic_generic target,enduring characteristics_abstract
151,neutral,noun,not-applicable,0.017413,subset_generic target,not-applicable_not-applicable
519,neutral,noun,positive,0.063068,subset_specific target,situational behaviour_concrete


In [6]:
# One-hot encode every column listed in `cat_cols`; any other column is passed through as-is.
# handle_unknown='ignore' makes a category that was not present when the encoder was fitted
# encode as all zeros instead of raising a ValueError at transform/predict time. That can
# happen for rare category combinations that end up only outside the training split.
encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

In [7]:
# Bucket the `bws` target into three equal-width intervals, returned as integer codes.
bins = pd.cut(input_df['bws'], 3, labels=False)

In [8]:
# Hold out 5% of the rows as a test split, stratified on the binned target; fixed seed.
train_df, test_df = train_test_split(
    input_df,
    test_size=0.05,
    stratify=bins,
    random_state=88,
)

In [9]:
# Compare the mean target value of the two splits as a quick sanity check on the split.
train_mean = train_df['bws'].mean()
test_mean = test_df['bws'].mean()
print(f"Train Mean: {train_mean:.2f}")
print(f"Test Mean: {test_mean:.2f}")

Train Mean: 0.12
Test Mean: 0.12


In [10]:
# Chain the column encoder and a linear regression into a single estimator.
pipeline_steps = [
    ('encoder', encoder),
    ('model', LinearRegression()),
]
model_pipeline = Pipeline(steps=pipeline_steps)

In [11]:
# Separate the `bws` target from the predictors and fit the pipeline on the training split.
y_train = train_df['bws']
X_train = train_df.drop(columns=['bws'])

model_pipeline.fit(X_train, y_train)

print("Model training complete!")

Model training complete!


In [12]:
from sklearn.metrics import mean_absolute_error

# Held-out split: target and predictors
y_test = test_df['bws']
X_test = test_df.drop(columns=['bws'])

# Predicted `bws` for the held-out rows
predictions = model_pipeline.predict(X_test)

# Report the mean absolute error, rounded to two decimals
test_mae = mean_absolute_error(y_test, predictions)
print(f"MAE: {test_mae:.2f}")

MAE: 0.09


In [13]:
import joblib
from pathlib import Path

# Make sure the output directory exists (no error if it is already there).
artifact_dir = Path('model')
artifact_dir.mkdir(exist_ok=True)

# Persist the fitted encoder and the fitted regressor as two separate joblib files.
fitted_steps = model_pipeline.named_steps
joblib.dump(fitted_steps['encoder'], 'model/lr_scsc_encoder.joblib')
joblib.dump(fitted_steps['model'], 'model/lr_scsc_model.joblib')

['model/lr_scsc_model.joblib']

In [15]:
# Score every row of the data set. `input_df` still carries the `bws` target alongside
# the predictors at this point, so select exactly the columns (and column order) the
# pipeline was fitted on instead of relying on the transformer ignoring extra columns.
predictions = model_pipeline.predict(input_df[X_train.columns])

In [24]:
# Reload the raw file and rebuild the composite features from scratch.
input_df = pd.read_csv('stereotype_train.csv')

# Each composite feature is "<left>_<right>" of two raw columns, which are dropped afterwards.
composite_features = {
    'generalisation_category_label': ('ling_form', 'target_type'),
    'generalisation_situation': ('situation', 'generalization'),
}
for feature_name, (left_col, right_col) in composite_features.items():
    input_df[feature_name] = input_df[left_col].astype(str) + '_' + input_df[right_col].astype(str)
    input_df = input_df.drop(columns=[left_col, right_col])

In [25]:
# Keep the predictors plus the sentence text, then write them to disk without the row index.
retained_columns = cat_cols + ['sentence']
input_df = input_df.drop(columns=[col for col in input_df.columns if col not in retained_columns])
input_df.to_csv('stereotype_train_scored.csv', index=False)

In [26]:
# Flatten the model output to one dimension so it can be attached as a column.
all_res = np.ravel(predictions)

# Read back the saved predictors and sentence text, ordered predictors first.
ordered_columns = cat_cols + ['sentence']
result_df = pd.read_csv(
    'stereotype_train_scored.csv',
    index_col=False,
    usecols=['sentence'] + cat_cols,
).reindex(columns=ordered_columns)

# Attach the predicted score (row order of the file and of `predictions` must match).
result_df['scsc_score'] = all_res
result_df.sample(10)

Unnamed: 0,generalisation_category_label,connotation,gram_form,generalisation_situation,situation_evaluation,sentence,scsc_score
486,generic_generic target,negative,noun,enduring characteristics_abstract,negative,Jewish people are conspiring against everyone ...,0.309197
517,generic_generic target,neutral,noun,enduring characteristics_abstract,neutral,Muslims believe in the importance of hijab.,0.098682
306,generic_generic target,neutral,noun,enduring characteristics_abstract,positive,A schoolboy should be young and have a good time.,-0.048075
222,subset_specific target,neutral,noun,enduring characteristics_abstract,positive,The oppressed women of somalia are the bedrock...,-0.1112
22,generic_generic target,neutral,other,enduring characteristics_abstract,negative,He couldn't understand how someone could be Ch...,0.394967
232,not-applicable_not-applicable,not-applicable,not-applicable,not-applicable_not-applicable,not-applicable,Discussions on the syria subreddit are always ...,0.022226
167,generic_generic target,neutral,noun,situational behaviour_abstract,negative,The police have always hated the blacks in thi...,0.345384
336,generic_generic target,neutral,noun,enduring characteristics_abstract,negative,women always complain when they are pregnant,0.272065
505,subset_generic target,neutral,other,other_not-applicable,not-applicable,"I went to live next to a mosque, and the place...",0.303722
551,generic_generic target,neutral,noun,enduring characteristics_abstract,negative,Brahmins are very arrogant people.,0.272065


In [27]:
# Summary statistics of the numeric columns (the predicted `scsc_score`) before normalisation.
result_df.describe()

Unnamed: 0,scsc_score
count,559.0
mean,0.123602
std,0.162026
min,-0.324679
25%,0.022226
50%,0.200853
75%,0.264118
max,0.41305


In [30]:
# Min-max normalise the predicted scores so they span [0, 1].
score_min = result_df['scsc_score'].min()
score_max = result_df['scsc_score'].max()
result_df['scsc_score'] = (result_df['scsc_score'] - score_min) / (score_max - score_min)

# Show the rows whose normalised score lies strictly between 0.60 and 0.65, lowest first.
# A boolean mask is used instead of `.where(cond).dropna()`, which would also discard
# in-range rows that happen to have a missing value in any other column.
in_band = (result_df['scsc_score'] > 0.60) & (result_df['scsc_score'] < 0.65)
result_df[in_band].sort_values('scsc_score', ascending=True)

Unnamed: 0,generalisation_category_label,connotation,gram_form,generalisation_situation,situation_evaluation,sentence,scsc_score
35,generic_generic target,neutral,pronoun,enduring characteristics_abstract,positive,He was black so building muscle comes easy for them.,0.601282
18,subset_specific target,neutral,noun,not-applicable_not-applicable,not-applicable,I thought this was too much coming from a Chinese game.,0.62174
322,subset_specific target,neutral,noun,not-applicable_not-applicable,not-applicable,I was so surprised that the new kid was a trans girl.,0.62174
337,individual_specific target,negative,noun,situational behaviour_concrete,negative,I called them a dumb female for getting us in to the mess.,0.637013
151,subset_generic target,neutral,noun,not-applicable_not-applicable,not-applicable,John wasn't available for the meeting in regards to black workers.,0.644919
114,subset_generic target,neutral,noun,not-applicable_not-applicable,not-applicable,Morocco is a black country,0.644919
335,subset_generic target,neutral,noun,not-applicable_not-applicable,not-applicable,Joe then spent the money on a bunch of prostitutes.,0.644919
154,individual_specific target,negative,noun,enduring characteristics_abstract,negative,"Ally was a black girl, slutty as hell, interested in anyone.",0.647786
430,individual_specific target,negative,noun,enduring characteristics_abstract,negative,Her husband claims she is an ignorant female.,0.647786
460,individual_specific target,negative,noun,enduring characteristics_abstract,negative,"Joan was a Wiccan, she couldn't escape her true nature and dark motives.",0.647786


In [31]:
# Cut-off on the normalised score at or above which a sentence is flagged as a stereotype.
# NOTE(review): 0.608 is close to the mean normalised score (0.607651 in the recorded
# output); presumably it was chosen for that reason. Confirm.
STEREOTYPE_THRESHOLD = 0.608

# Vectorised comparison produces the boolean flag directly (no per-row lambda needed).
result_df['stereotype'] = result_df['scsc_score'] >= STEREOTYPE_THRESHOLD
print(result_df.describe())
result_df.sort_values('scsc_score', ascending=False).head(10)

       scsc_score
count  559.000000
mean     0.607651
std      0.219628
min      0.000000
25%      0.470234
50%      0.712364
75%      0.798120
max      1.000000


Unnamed: 0,generalisation_category_label,connotation,gram_form,generalisation_situation,situation_evaluation,sentence,scsc_score,stereotype
4,subset_specific target,negative,pronoun,enduring characteristics_abstract,negative,They are from bangladesh so they must be stoners.,1.0,True
22,generic_generic target,neutral,other,enduring characteristics_abstract,negative,He couldn't understand how someone could be Chinese and yet dumb.,0.975488,True
499,subset_generic target,negative,other,enduring characteristics_abstract,negative,Anybody who reads the Quran is not a trustworthy person in my book.,0.963433,True
30,subset_specific target,negative,other,situational behaviour_concrete,negative,"I saw the look on her face, but couldn't interpret it due to her Asian squinty eyes.",0.929482,True
167,generic_generic target,neutral,noun,situational behaviour_abstract,negative,The police have always hated the blacks in this neighborhood.,0.908278,True
91,generic_generic target,neutral,noun,other_concrete,not-applicable,The black people are from crimean,0.881779,True
513,subset_specific target,neutral,other,situational behaviour_concrete,negative,"Clients seemed to hate the face time though, due to his Muslim attire.",0.879148,True
87,individual_specific target,negative,pronoun,enduring characteristics_concrete,negative,Kelsi likes chicken because she is black and has monkey lips,0.870602,True
62,individual_specific target,neutral,other,situational behaviour_abstract,negative,"If you were black, that choice would be made for you already.",0.863433,True
495,generic_generic target,negative,noun,enduring characteristics_abstract,negative,Catholics love quoting the selfcontradicting bible,0.859226,True


In [32]:
# Write the scored sentences to the parent of the working directory, without the row index.
result_df.to_csv(path_or_buf="../stereotype_scored.csv", index=False)