In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
dataset_df = pd.read_csv('NLP_challenge.csv')
dataset_df['Answer'] = dataset_df['Answer'].apply(int)
dataset_df.head(12)

Unnamed: 0,Annotator ID,Text,Answer
0,A1MG8KNVSVZ365,@cz_binance Thanks for being upfront about thi...,1
1,A3OCJJMRKAIJZA,@cz_binance Thanks for being upfront about thi...,1
2,AQIP3DSYXEXX5,@cz_binance Thanks for being upfront about thi...,1
3,A1MG8KNVSVZ365,@cz_binance The speed with which Binance has r...,0
4,A3MV3PT4TOO69P,@cz_binance The speed with which Binance has r...,1
5,AQIP3DSYXEXX5,@cz_binance The speed with which Binance has r...,1
6,A1MG8KNVSVZ365,@ByzBox @cz_binance The big difference is that...,0
7,A3MV3PT4TOO69P,@ByzBox @cz_binance The big difference is that...,1
8,AGRYG65WJ1CKJ,@ByzBox @cz_binance The big difference is that...,0
9,A1MG8KNVSVZ365,@King_Tech__ You might have to read the terms ...,0


In [3]:
raw_text_ds = tf.data.Dataset.from_tensor_slices(
    (tf.cast(dataset_df['Text'].values, tf.string),
    tf.cast(dataset_df['Answer'].values, tf.int32))
)
raw_text_ds

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [4]:
def expand_dims_ds(text, label):
  text = tf.expand_dims(text, -1)
  label = tf.expand_dims(label, -1)
  return text, label

train_ds = raw_text_ds.map(expand_dims_ds)
train_ds

<MapDataset element_spec=(TensorSpec(shape=(1,), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.int32, name=None))>

In [5]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [6]:
max_features = 5000
sequence_length = 50

vectorize_layer = layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(raw_text_ds.map(lambda x, y: x))

In [7]:
model = tf.keras.Sequential([
  tf.keras.Input(shape=(1,), dtype="string"),
  vectorize_layer,
  layers.Embedding(max_features, 64, mask_zero=True),
  layers.Bidirectional(layers.LSTM(64)),
  layers.Dense(32),#, activation='relu'
  layers.Dropout(0.2),
  layers.Dense(1, activation='sigmoid')
])

In [8]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

In [9]:
epochs = 3 # higher number of epochs leads to overfitting
history = model.fit(train_ds, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
from cleanlab.filter import find_label_issues

probs = model.predict(train_ds.map(lambda x, y: x))
probs = np.column_stack([[1-num for num in probs], probs])

ordered_label_issues = find_label_issues(
    labels=dataset_df['Answer'],
    pred_probs=probs,
    return_indices_ranked_by='self_confidence',
)

ordered_label_issues



array([ 5917,  8464,  4379, ..., 34852, 34851, 12311])

In [13]:
probably_bots = pd.unique(dataset_df.loc[ordered_label_issues, 'Annotator ID'])
probably_bots

array(['A3OCJJMRKAIJZA', 'A3BCKNE5CWHODZ', 'A3BJX6UUSOIKFN',
       'A3MV3PT4TOO69P', 'A1MG8KNVSVZ365', 'ARW1TCHCLEK1W',
       'AQIP3DSYXEXX5', 'A3BISMR4GI02ZG', 'A1MJVTR0PCKBWW',
       'A3124SRR191UIL', 'A2LU259QPV1I4V', 'A2GM5BW75YCKKW',
       'AYTH0E5PUXWX8', 'A2WPHVMLLEV5ZB', 'A2A78DMGLC1S0Y',
       'A1R0689JPSQ3OF', 'A33B85TN97HQ33', 'A19X8IA9EKC3XH',
       'AG36U7IOG2LAP', 'AAX9LTAOIBECD', 'AKQAI78JTXXC9',
       'A1M5BJTQIXCM33', 'A16184N1RO5OJV', 'AOOF0H0EIEFFQ',
       'AMYURTQIMAC8T', 'A9HQ3E0F2AGVO', 'A2R25RH05K0B68',
       'AR9AU5FY1S3RO', 'A2CJFO19NY4T5R', 'A3HYCA7N5F6DL9',
       'A2KHLJ2F58BEZK', 'A33Y36Y252Z30U', 'A1YSYI926BBOHW',
       'A2HM35CWB7IIFM', 'AD1ILDUXZHASF', 'A198H320N5MSXX',
       'A292TFDMNVS0TP', 'AGRYG65WJ1CKJ', 'A1DP551UV06FN6',
       'AKSLU0C30G3JT', 'AC95JAUAM2L2Z', 'A2R28HXAEFKBPC',
       'AJQGWGESKQT4Y', 'AXY0D2AMLKE2A', 'A2VQBOJJ8HD6W9'], dtype=object)

In [34]:
bots_df = pd.DataFrame(data=probably_bots, columns=['botID'])
bots_df['number_of_answers'] = [dataset_df.loc[dataset_df['Annotator ID']==botID].shape[0] for botID in probably_bots]
bots_df['number_of_errors'] = [dataset_df.loc[ordered_label_issues].loc[dataset_df['Annotator ID']==botID].shape[0] for botID in probably_bots]
bots_df['% of errors'] = bots_df['number_of_errors'] / bots_df['number_of_answers'] * 100
bots_df.head(10)

Unnamed: 0,botID,number_of_answers,number_of_errors,% of errors
0,A3OCJJMRKAIJZA,5001,1911,38.212358
1,A3BCKNE5CWHODZ,1443,523,36.243936
2,A3BJX6UUSOIKFN,1472,713,48.4375
3,A3MV3PT4TOO69P,5126,1153,22.493172
4,A1MG8KNVSVZ365,3561,1323,37.152485
5,ARW1TCHCLEK1W,1042,193,18.522073
6,AQIP3DSYXEXX5,3469,1349,38.887287
7,A3BISMR4GI02ZG,3589,795,22.151017
8,A1MJVTR0PCKBWW,980,72,7.346939
9,A3124SRR191UIL,205,38,18.536585


In [40]:
bots_df.loc[(bots_df['% of errors'] > 30) & (bots_df['number_of_answers'] > 1000)]

Unnamed: 0,botID,number_of_answers,number_of_errors,% of errors
0,A3OCJJMRKAIJZA,5001,1911,38.212358
1,A3BCKNE5CWHODZ,1443,523,36.243936
2,A3BJX6UUSOIKFN,1472,713,48.4375
4,A1MG8KNVSVZ365,3561,1323,37.152485
6,AQIP3DSYXEXX5,3469,1349,38.887287
