In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
# Report how many GPUs TensorFlow can see before any training is started.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [2]:
# Load the annotated texts; the file provides 'Annotator ID', 'Text' and 'Answer' columns.
dataset_df = pd.read_csv('NLP_challenge.csv')
# Convert every answer to an integer so it can serve as a class label.
dataset_df['Answer'] = dataset_df['Answer'].map(int)
dataset_df.head(12)

Unnamed: 0,Annotator ID,Text,Answer
0,A1MG8KNVSVZ365,@cz_binance Thanks for being upfront about thi...,1
1,A3OCJJMRKAIJZA,@cz_binance Thanks for being upfront about thi...,1
2,AQIP3DSYXEXX5,@cz_binance Thanks for being upfront about thi...,1
3,A1MG8KNVSVZ365,@cz_binance The speed with which Binance has r...,0
4,A3MV3PT4TOO69P,@cz_binance The speed with which Binance has r...,1
5,AQIP3DSYXEXX5,@cz_binance The speed with which Binance has r...,1
6,A1MG8KNVSVZ365,@ByzBox @cz_binance The big difference is that...,0
7,A3MV3PT4TOO69P,@ByzBox @cz_binance The big difference is that...,1
8,AGRYG65WJ1CKJ,@ByzBox @cz_binance The big difference is that...,0
9,A1MG8KNVSVZ365,@King_Tech__ You might have to read the terms ...,0


In [3]:
# Build a tf.data pipeline of (text, label) pairs straight from the DataFrame columns.
raw_text_ds = tf.data.Dataset.from_tensor_slices(
    tensors=(
        tf.cast(dataset_df['Text'].values, dtype=tf.string),
        tf.cast(dataset_df['Answer'].values, dtype=tf.int32),
    )
)
raw_text_ds

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [4]:
def expand_dims_ds(text, label):
  """Append a trailing axis of length 1 to both the text and the label tensor.

  Args:
    text: tensor holding the raw text of one dataset element.
    label: tensor holding the label of the same element.

  Returns:
    A `(text, label)` tuple in which each tensor has one extra last dimension.
  """
  expanded_text = tf.expand_dims(text, axis=-1)
  expanded_label = tf.expand_dims(label, axis=-1)
  return expanded_text, expanded_label

# Give every scalar (text, label) pair a trailing axis of size 1.
train_ds = raw_text_ds.map(map_func=expand_dims_ds)
train_ds

<MapDataset element_spec=(TensorSpec(shape=(1,), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.int32, name=None))>

In [5]:
AUTOTUNE = tf.data.AUTOTUNE
# Cache the mapped elements, then let tf.data prefetch with an auto-tuned
# buffer so input preparation can overlap with model execution.
# NOTE(review): no .batch() is applied to this dataset, so each element of
# shape (1,) appears to be consumed as its own batch of one by fit/predict —
# confirm this is intended.
train_ds = train_ds.cache()
train_ds = train_ds.prefetch(AUTOTUNE)

In [None]:
max_features = 5000     # upper bound on the vocabulary size (max_tokens)
sequence_length = 50    # every text is padded/truncated to this many token ids

# Lower-cases and strips punctuation, then maps each text to a fixed-length
# sequence of integer token ids.
vectorize_layer = layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Learn the vocabulary from the texts only (labels are dropped). The texts are
# batched so adapt() processes many strings per step instead of a single
# scalar string per step; the resulting vocabulary is the same, the pass over
# the data is just much cheaper.
vectorize_layer.adapt(raw_text_ds.map(lambda text, label: text).batch(1024))

In [None]:
# Text classifier: raw string -> token ids -> embeddings -> BiLSTM -> dense head.
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype="string"))
model.add(vectorize_layer)
# mask_zero=True marks token id 0 as padding to be masked out.
model.add(layers.Embedding(max_features, 64, mask_zero=True))
model.add(layers.Bidirectional(layers.LSTM(64)))
# Linear projection: no activation is set on this layer.
model.add(layers.Dense(32))
model.add(layers.Dropout(0.2))
# Single sigmoid unit, so the output is one value in [0, 1] per input text.
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked during training.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
epochs = 5  # number of full passes over train_ds
history = model.fit(x=train_ds, epochs=epochs)

In [None]:
from cleanlab.filter import find_label_issues

# NOTE(review): cleanlab's documentation asks for *out-of-sample* predicted
# probabilities (e.g. obtained via cross-validation). These predictions come
# from the model that was fitted on the very same rows, so the flagged issues
# may be skewed — consider switching to held-out predictions.
probs = model.predict(train_ds.map(lambda text, label: text))

# Column 0 = P(label 0), column 1 = P(label 1). The sigmoid output is flattened
# to 1-D first so the stacked matrix is always (n_samples, 2), and vectorised
# arithmetic replaces a Python-level loop over every single prediction.
positive_probs = np.asarray(probs).reshape(-1)
probs_2class = np.column_stack([1 - positive_probs, positive_probs])

# Indices of the rows whose given label looks wrong, ranked by self-confidence.
ordered_label_issues = find_label_issues(
    labels=dataset_df['Answer'].to_numpy(),
    pred_probs=probs_2class,
    return_indices_ranked_by='self_confidence',
)

ordered_label_issues

In [None]:
# Distinct annotators behind the flagged rows, in order of first appearance
# within the ranked label issues.
probably_bots = dataset_df.loc[ordered_label_issues, 'Annotator ID'].unique()
probably_bots

In [None]:
# Tally answers once per annotator instead of re-scanning the whole frame for
# every suspected bot.
answers_per_annotator = dataset_df['Annotator ID'].value_counts()
# Same tally, restricted to the rows flagged as likely label errors.
errors_per_annotator = dataset_df.loc[ordered_label_issues, 'Annotator ID'].value_counts()

bots_df = pd.DataFrame(data=probably_bots, columns=['botID'])
# .map looks each botID up in the tallies; an ID missing from a tally counts as 0.
bots_df['number_of_answers'] = bots_df['botID'].map(answers_per_annotator).fillna(0).astype(int)
bots_df['number_of_errors'] = bots_df['botID'].map(errors_per_annotator).fillna(0).astype(int)
# Share of an annotator's answers that were flagged, in percent.
bots_df['%_of_errors'] = bots_df['number_of_errors'] / bots_df['number_of_answers'] * 100
bots_df.head(10)

In [None]:
# Thresholds for treating an annotator as suspicious: more than this share of
# flagged answers (in percent) AND more than this many answers overall.
MIN_ERROR_PERCENT = 30
MIN_ANSWER_COUNT = 1000

is_suspicious = (
    (bots_df['%_of_errors'] > MIN_ERROR_PERCENT)
    & (bots_df['number_of_answers'] > MIN_ANSWER_COUNT)
)
# Explicit copy so the new columns below do not write into a view of bots_df.
suspicious_df = bots_df.loc[is_suspicious].copy()

# Per-annotator counts of answer == 1 and answer == 0. An annotator who never
# gave one of the two answers is simply absent from that tally, so it is mapped
# to 0 instead of raising KeyError (as `value_counts()[label]` would).
true_counts = dataset_df.loc[dataset_df['Answer'] == 1, 'Annotator ID'].value_counts()
false_counts = dataset_df.loc[dataset_df['Answer'] == 0, 'Annotator ID'].value_counts()
suspicious_df['answer_count_True'] = suspicious_df['botID'].map(true_counts).fillna(0).astype(int)
suspicious_df['answer_count_False'] = suspicious_df['botID'].map(false_counts).fillna(0).astype(int)
suspicious_df