In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
dataset_df = pd.read_csv('NLP_challenge.csv')
dataset_df['Answer'] = dataset_df['Answer'].apply(int)
dataset_df.head(12)

Unnamed: 0,Annotator ID,Text,Answer
0,A1MG8KNVSVZ365,@cz_binance Thanks for being upfront about thi...,1
1,A3OCJJMRKAIJZA,@cz_binance Thanks for being upfront about thi...,1
2,AQIP3DSYXEXX5,@cz_binance Thanks for being upfront about thi...,1
3,A1MG8KNVSVZ365,@cz_binance The speed with which Binance has r...,0
4,A3MV3PT4TOO69P,@cz_binance The speed with which Binance has r...,1
5,AQIP3DSYXEXX5,@cz_binance The speed with which Binance has r...,1
6,A1MG8KNVSVZ365,@ByzBox @cz_binance The big difference is that...,0
7,A3MV3PT4TOO69P,@ByzBox @cz_binance The big difference is that...,1
8,AGRYG65WJ1CKJ,@ByzBox @cz_binance The big difference is that...,0
9,A1MG8KNVSVZ365,@King_Tech__ You might have to read the terms ...,0


In [3]:
raw_text_ds = tf.data.Dataset.from_tensor_slices(
    (tf.cast(dataset_df['Text'].values, tf.string),
    tf.cast(dataset_df['Answer'].values, tf.int32))
)
raw_text_ds

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [4]:
def expand_dims_ds(text, label):
  text = tf.expand_dims(text, -1)
  label = tf.expand_dims(label, -1)
  return text, label

train_ds = raw_text_ds.map(expand_dims_ds)
train_ds

<MapDataset element_spec=(TensorSpec(shape=(1,), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.int32, name=None))>

In [5]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [6]:
max_features = 5000
sequence_length = 50

vectorize_layer = layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(raw_text_ds.map(lambda x, y: x))

In [7]:
model = tf.keras.Sequential([
  tf.keras.Input(shape=(1,), dtype="string"),
  vectorize_layer,
  layers.Embedding(max_features, 64, mask_zero=True),
  layers.Bidirectional(layers.LSTM(64)),
  layers.Dense(32),
  layers.Dropout(0.2),
  layers.Dense(1, activation='sigmoid')
])

In [8]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

In [9]:
epochs = 5
history = model.fit(train_ds, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
from cleanlab.filter import find_label_issues

probs = model.predict(train_ds.map(lambda x, y: x))
probs_2class = np.column_stack([[1-num for num in probs], probs])

ordered_label_issues = find_label_issues(
    labels=dataset_df['Answer'],
    pred_probs=probs_2class,
    return_indices_ranked_by='self_confidence',
)

ordered_label_issues



array([20746, 22825, 17323, ..., 23401, 22088, 22086])

In [11]:
probably_bots = pd.unique(dataset_df.loc[ordered_label_issues, 'Annotator ID'])
probably_bots

array(['A3BJX6UUSOIKFN', 'A3OCJJMRKAIJZA', 'A1MG8KNVSVZ365',
       'A3MV3PT4TOO69P', 'AQIP3DSYXEXX5', 'ARW1TCHCLEK1W',
       'AYTH0E5PUXWX8', 'A3BCKNE5CWHODZ', 'A3124SRR191UIL',
       'A2WPHVMLLEV5ZB', 'A2GM5BW75YCKKW', 'A1MJVTR0PCKBWW',
       'AKQAI78JTXXC9', 'AR9AU5FY1S3RO', 'A3BISMR4GI02ZG',
       'A2A78DMGLC1S0Y', 'A2LU259QPV1I4V', 'AG36U7IOG2LAP',
       'AOOF0H0EIEFFQ', 'A1M5BJTQIXCM33', 'A2R25RH05K0B68',
       'A1YSYI926BBOHW', 'AMYURTQIMAC8T', 'A2KHLJ2F58BEZK',
       'A9HQ3E0F2AGVO', 'A3HYCA7N5F6DL9', 'AC95JAUAM2L2Z',
       'A19X8IA9EKC3XH', 'AD1ILDUXZHASF', 'A2HM35CWB7IIFM',
       'AJQGWGESKQT4Y', 'A33Y36Y252Z30U', 'A1R0689JPSQ3OF',
       'AAX9LTAOIBECD', 'A16184N1RO5OJV', 'A2CJFO19NY4T5R',
       'A292TFDMNVS0TP', 'AGRYG65WJ1CKJ', 'A33B85TN97HQ33',
       'AXY0D2AMLKE2A', 'A1DP551UV06FN6', 'A198H320N5MSXX',
       'A2QTL039A5VV3I', 'A2JP9IKRHNLRPI'], dtype=object)

In [12]:
bots_df = pd.DataFrame(data=probably_bots, columns=['botID'])
bots_df['number_of_answers'] = [dataset_df.loc[dataset_df['Annotator ID']==botID].shape[0] for botID in probably_bots]
bots_df['number_of_errors'] = [dataset_df.loc[ordered_label_issues].loc[dataset_df['Annotator ID']==botID].shape[0] for botID in probably_bots]
bots_df['%_of_errors'] = bots_df['number_of_errors'] / bots_df['number_of_answers'] * 100
bots_df.head(10)

Unnamed: 0,botID,number_of_answers,number_of_errors,%_of_errors
0,A3BJX6UUSOIKFN,1472,603,40.964674
1,A3OCJJMRKAIJZA,5001,1746,34.913017
2,A1MG8KNVSVZ365,3561,1187,33.333333
3,A3MV3PT4TOO69P,5126,1039,20.269216
4,AQIP3DSYXEXX5,3469,1277,36.811761
5,ARW1TCHCLEK1W,1042,162,15.547025
6,AYTH0E5PUXWX8,864,124,14.351852
7,A3BCKNE5CWHODZ,1443,450,31.185031
8,A3124SRR191UIL,205,38,18.536585
9,A2WPHVMLLEV5ZB,334,49,14.670659


In [13]:
suspicious_df = pd.DataFrame.copy(bots_df.loc[(bots_df['%_of_errors'] > 30) & (bots_df['number_of_answers'] > 1000)])
suspicious_df['answer_count_True'] = [dataset_df.loc[dataset_df['Annotator ID']==botID, 'Answer'].value_counts()[1] for botID in suspicious_df['botID']]
suspicious_df['answer_count_False'] = [dataset_df.loc[dataset_df['Annotator ID']==botID, 'Answer'].value_counts()[0] for botID in suspicious_df['botID']]
suspicious_df

Unnamed: 0,botID,number_of_answers,number_of_errors,%_of_errors,answer_count_True,answer_count_False
0,A3BJX6UUSOIKFN,1472,603,40.964674,1457,15
1,A3OCJJMRKAIJZA,5001,1746,34.913017,3588,1413
2,A1MG8KNVSVZ365,3561,1187,33.333333,1933,1628
4,AQIP3DSYXEXX5,3469,1277,36.811761,1877,1592
7,A3BCKNE5CWHODZ,1443,450,31.185031,1128,315
