Here, I reproduce Jessica Lin's method (the best-performing one) with each of the knowledge graphs (KGs) considered in my experiments, in order to quantify the impact of each type of knowledge on classification performance.

In [1]:
pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

# Sentence-BERT encoder that turns each input string into an embedding.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# MLP classifier trained on top of the sentence embeddings:
# three hidden layers of 512 units each, Adam solver, fixed random_state.
mlp = MLPClassifier(
    hidden_layer_sizes=(512, 512, 512),
    solver='adam',
    alpha=0.0001,
    learning_rate="constant",
    max_iter=500,
    n_iter_no_change=100,
    random_state=0,
)

In [5]:
# Load the pre-processed HateXplain sheet together with its knowledge columns.
df = pd.read_excel('datasets/hatexplain_processed_kgs.xlsx')

# Missing knowledge entries are NaN; turn them into empty strings so the string
# concatenation below yields text instead of NaN.
# The result is assigned back to the frame on purpose: `df[col].fillna('', inplace=True)`
# operates on a column selection, which emits a FutureWarning on pandas 2.x and no
# longer modifies `df` once Copy-on-Write is active.
kg_columns = ['Wikipedia_description', 'Wikidata_description', 'ConceptNet_terms', 'KnowledJe_context']
for kg_column in kg_columns:
    df[kg_column] = df[kg_column].fillna('')

# Concatenate every knowledge source into a single context string per row.
# NOTE(review): the pieces are joined without any separator, so the last word of one
# source is glued to the first word of the next — confirm this is intended.
df['context'] = df['Wikipedia_description'] + df['Wikidata_description'] + df['ConceptNet_terms'] + df['KnowledJe_context']

In [6]:
# Preview the first five rows to check the filled knowledge columns and `context`.
df.head(n=5)

Unnamed: 0,ID,cleaned_text,split_set,class,target,explanation,Wikipedia_description,Wikidata_description,ConceptNet_terms,KnowledJe_context,context
0,23107796_gab,u really think i would not have been raped by ...,train,offensive,['Hindu'],"['raped', 'feral', 'hindu', 'muslim', 'neo', '...",'''Muslims''' are people who follow or practic...,adherents of the religion of Islam,uranium uracil remember consider intend cerebr...,,'''Muslims''' are people who follow or practic...
1,9995600_gab,the uk has threatened to return radioactive wa...,train,offensive,['Refugee'],"['send', 'back', 'all', 'the', 'paki', 'migran...",,,threaten endanger proceeds reelect return key ...,,threaten endanger proceeds reelect return key ...
2,1227920812235051008_twitter,if english is not imposition then hindi is als...,train,offensive,['Other'],"['chutiya', 'retards', 'stophindiimposition']",'''English''' is a Germanic language of the In...,West Germanic language,infliction hindu closed close unopen exclude r...,,'''English''' is a Germanic language of the In...
3,1204931715778543624_twitter,no liberal congratulated hindu refugees post c...,train,offensive,['Hindu'],"['hate', 'hindus']",,,bighearted loose progressive large-minded free...,,bighearted loose progressive large-minded free...
4,1179102559241244672_twitter,he said bro even your texts sound redneck what...,train,offensive,['Economic'],['redneck'],,,,,


In [11]:
# Classifier input: the cleaned text directly followed by its KnowledJe context.
# NOTE(review): there is no separator between the two parts, so the last token of the
# text is glued to the first token of the context — confirm this is intended.
combined_bk = (df['cleaned_text'] + df['KnowledJe_context']).tolist()

# Encode the string classes as integer ids.
# The replaced column is assigned back instead of using
# `df['class'].replace(..., inplace=True)`: the in-place call acts on a column
# selection and leaves `df` untouched under pandas Copy-on-Write, which would keep
# the labels as strings.
label_to_id = {'hate_speech':0, 'neutral':1, 'offensive':2}  # change for other dataset's labels
df['class'] = df['class'].replace(label_to_id)

# Going through a Python list makes numpy infer the dtype from the values themselves.
labels = np.array(df['class'].tolist())
print(labels.shape)

(19229,)


In [None]:
# Sentence BERT embeddings

# np.save does not create missing parent directories; create the output folder up
# front so the (slow) encoding step is not followed by a FileNotFoundError.
embeddings_path = Path('embeddings/hatexplain/sentenceBERT_KnowledJe.npy')
embeddings_path.parent.mkdir(parents=True, exist_ok=True)

# Encode every "text + KnowledJe context" string and persist the result for reuse.
combined_bk_emb = model.encode(combined_bk)
np.save(embeddings_path, combined_bk_emb)

In [None]:
# Row positions of each partition: everything that is not 'test' is used for training.
is_test = df['split_set'] == 'test'
train_indices = np.where(~is_test)[0]
test_indices = np.where(is_test)[0]

# Slice the embedding matrix with those positions.
train_embeddings = combined_bk_emb[train_indices]
test_embeddings = combined_bk_emb[test_indices]

# Sanity check: the training part must be exactly train + val, the rest exactly test.
split_counts = df.split_set.value_counts()
assert len(train_embeddings) == (split_counts['train'] + split_counts['val'])
assert len(test_embeddings) == split_counts['test']

# Labels aligned with the two embedding subsets.
train_labels = labels[train_indices]
test_labels = labels[test_indices]

In [None]:
# Train the MLP on the non-test embeddings.
mlp.fit(train_embeddings, train_labels)

# Evaluate on the held-out test embeddings: class predictions plus accuracy.
bin_preds = mlp.predict(test_embeddings)
score = mlp.score(test_embeddings, test_labels)  # acc

# Per-class metrics, reported with four decimals.
report = classification_report(test_labels, bin_preds, digits=4)
print(report)