In [2]:
pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
#imports
import tensorflow as tf 
tf.__version__
import transformers
import os
import shutil
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification,TFBertForSequenceClassification
from transformers import AutoTokenizer
from transformers import BertTokenizerFast
from transformers import BertTokenizer


In [4]:
#importing and checking the data
def import_and_check_data(path='/kaggle/input/14-data/1.4.csv', random_state=None):
    """Load the tweet/label CSV, drop incomplete rows, shuffle, and print the label counts.

    Parameters
    ----------
    path : str or path-like, optional
        Header-less CSV whose column 0 holds the tweet text and column 1 the
        label. Defaults to the Kaggle location of file 1.4 (Waseem's dataset).
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for the shuffle. ``None`` (the
        default) keeps the original behaviour of a different order on every call;
        pass an integer to make the notebook reproducible.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with columns ``tweet`` and ``label`` and a fresh
        0..n-1 index.
    """
    raw = pd.read_csv(path, header=None)
    df = (
        raw.dropna()
        .rename(columns={0: 'tweet', 1: 'label'})
        .sample(frac=1, random_state=random_state)  # shuffle
        .reset_index(drop=True)
    )
    print(df['label'].value_counts())
    return df


In [21]:
def train_test_split(text,label):
    """Split texts and labels into train and test parts and print both sizes.

    Delegates to scikit-learn with ``test_size=0.2`` and ``random_state=32``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    # Imported under an alias so the sklearn helper is not confused with this
    # wrapper, which has the same name.
    from sklearn.model_selection import train_test_split as _sklearn_split

    parts = _sklearn_split(text, label, test_size=0.2, random_state=32)
    x_train, x_test, y_train, y_test = parts
    print('len of x_train ', len(x_train))
    print('len of x_test ', len(x_test))
    return x_train, x_test, y_train, y_test


In [6]:
def train_and_test_ds(x_train,x_test,y_train,y_test, batch_size=1):
    """Tokenize the train and test texts and wrap each split as a batched tf.data.Dataset.

    Parameters
    ----------
    x_train, x_test : iterable of str
        Texts of the two splits.
    y_train, y_test : iterable
        Labels aligned with ``x_train`` / ``x_test``.
    batch_size : int, optional
        Batch size applied to both datasets. Defaults to 1, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(train_ds, test_ds)``; each element yields
        ``(dict of tokenizer outputs, label)`` batches.
    """
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

    def _to_dataset(texts, labels):
        # Materialise as plain lists: the tokenizer expects a list of strings.
        texts, labels = list(texts), list(labels)
        encodings = tokenizer(texts, padding=True, truncation=True)
        dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
        return dataset.batch(batch_size)

    train_ds = _to_dataset(x_train, y_train)
    # The test split must be built from x_test / y_test. It was previously
    # rebuilt from the training data, so "test" metrics were train metrics.
    test_ds = _to_dataset(x_test, y_test)

    return train_ds, test_ds

In [7]:
def create_model():
    """Build and compile a two-class BERT sequence classifier.

    Loads ``bert-base-cased`` with a 2-label head (0 -> 'general', 1 -> 'toxic'),
    compiles it with Adam on an exponentially decaying learning rate
    (2e-5 initial, x0.9 every 10000 steps), sparse categorical cross-entropy on
    logits, and sparse categorical accuracy, then prints the Keras summary.

    Returns
    -------
    TFBertForSequenceClassification
        The compiled model.
    """
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-cased",
        num_labels=2,
        id2label={0: 'general', 1: 'toxic'},
    )
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=2e-5,
        decay_steps=10000,
        decay_rate=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[tf.metrics.SparseCategoricalAccuracy()]
                  )
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

In [8]:
def fit_model(model, epochs, ds):
    """Train ``model`` on ``ds`` for ``epochs`` epochs.

    Parameters
    ----------
    model : object with a Keras-style ``fit(x=..., epochs=...)`` method
    epochs : int
        Number of passes over ``ds``.
    ds : dataset accepted by ``model.fit``

    Returns
    -------
    Whatever ``model.fit`` returns (for Keras, the ``History`` object), so the
    per-epoch loss and metrics are no longer thrown away.
    """
    return model.fit(x=ds, epochs=epochs)

In [9]:
def get_predictions(model,ds, train_test_vals=None):
    """Predict a class index for every example in ``ds``.

    Parameters
    ----------
    model : object whose ``predict(ds)`` result can be indexed with ``'logits'``
    ds : dataset to run the model on (the argument is now actually used; the
        notebook-level ``train_ds`` was previously read instead)
    train_test_vals : sequence of true labels, optional
        When given, a scikit-learn classification report of these labels
        against the predictions is printed. Optional so the two-argument calls
        elsewhere in the notebook keep working.

    Returns
    -------
    list of int
        Index of the largest logit for each example.
    """
    logits = np.asarray(model.predict(ds)['logits'])
    # Take the argmax of the raw logits. Rounding a sigmoid first maps both
    # classes to the same value whenever the logits share a sign, and argmax
    # then always answers class 0.
    preds = np.argmax(logits, axis=-1).tolist()
    if train_test_vals is not None:
        # Imported lazily so plain prediction does not require scikit-learn.
        from sklearn.metrics import classification_report
        print(classification_report(list(train_test_vals), preds))
    return preds
    

In [10]:
def loss_accuracy(model, ds):
    """Evaluate ``model`` on ``ds``, print the loss and accuracy, and return both.

    ``model.evaluate(ds)`` must yield exactly two values, which are taken to be
    (loss, accuracy) in that order.
    """
    metrics = model.evaluate(ds)
    loss, accuracy = metrics
    for caption, value in (('loss = ', loss), ('accuracy = ', accuracy)):
        print(caption, value)
    return loss, accuracy

In [11]:
#import pickle
#filename = '1.4_model.sav'
#pickle.dump(model, open(filename, 'wb'))

In [12]:
def get_gradients(text, model, tokenizer):
    """Score each token of ``text`` by the gradient of the predicted-class logit.

    Token ids are expanded to a one-hot matrix so the logit can be
    differentiated with respect to the input (integer ids are not
    differentiable). The per-token L2 norm of that gradient, divided by the
    largest norm, is the token's score.

    Parameters
    ----------
    text : str
        Text to explain.
    model : classifier exposing ``model.bert.embeddings.weights[0]`` and
        accepting ``inputs_embeds`` / ``attention_mask``; its output must have
        a ``.logits`` attribute.
    tokenizer : callable tokenizer supporting ``return_tensors="tf"`` and
        ``convert_ids_to_tokens``.

    Returns
    -------
    tuple
        ``(gradients, token_words, prediction_label)``: the list of normalised
        scores, the matching token strings, and ``"toxic"`` when the predicted
        class is 1, otherwise ``"general"``.
    """
    def class_selector(class_index, num_classes):
        # 1 x num_classes float32 row with a single 1 at class_index.
        selector = np.zeros((1, num_classes))
        selector[0, class_index] = 1
        return tf.constant(selector, dtype='float32')

    # NOTE(review): weights[0] is assumed to be the word-embedding table; its
    # first dimension is used as the vocabulary size below.
    embedding_matrix = model.bert.embeddings.weights[0]
    vocab_size = embedding_matrix.get_shape()[0]

    encoded_tokens = tokenizer(text, return_tensors="tf")
    token_ids = list(encoded_tokens["input_ids"].numpy()[0])

    one_hot_ids = tf.one_hot(tf.constant([token_ids], dtype='int32'), vocab_size)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        # Only the one-hot input is tracked; model variables are not watched.
        tape.watch(one_hot_ids)
        # One-hot times the embedding table yields the embeddings while keeping
        # the path back to the one-hot input differentiable.
        inputs_embeds = tf.matmul(one_hot_ids, embedding_matrix)

        pred_scores = model({
            "inputs_embeds": inputs_embeds,
            "attention_mask": encoded_tokens["attention_mask"],
        }).logits
        print(pred_scores)
        max_class = tf.argmax(pred_scores, axis=1).numpy()[0]

        # Keep only the predicted class's logit as the scalar to differentiate.
        selected_logit = tf.reduce_sum(
            pred_scores * class_selector(max_class, pred_scores.shape[1]))

    # L2 norm over the vocabulary axis gives one value per token.
    token_norms = tf.norm(tape.gradient(selected_logit, one_hot_ids), axis=2)
    # Scale so the most influential token scores 1.
    scaled = token_norms / tf.reduce_max(token_norms)

    gradients = scaled[0].numpy().tolist()
    token_words = tokenizer.convert_ids_to_tokens(token_ids)
    prediction_label = "general" if max_class != 1 else "toxic"
    return gradients, token_words, prediction_label

In [13]:
def plot_gradients(tokens,gradients, title):
    """Draw a bar chart of per-token gradient scores.

    Parameters
    ----------
    tokens : sequence of str
        Token strings, used as the x tick labels.
    gradients : sequence of float
        One score per token; used both as the bar height and as the bar's
        alpha, so each value must be a valid alpha for matplotlib.
    title : str
        Figure title.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(21, 3))
    # Each bar is keyed by token + position; presumably so that repeated tokens
    # do not collapse into one category.
    bar_keys = [token + str(position) for position, token in enumerate(tokens)]
    # Blue bars whose opacity is the score.
    bar_colors = [(0, 0, 1, score) for score in gradients]
    plt.tick_params(axis='both', which='minor', labelsize=29)
    plt.bar(bar_keys, gradients, color=bar_colors, linewidth=1)
    plt.title(title)
    # Relabel the ticks with the bare tokens (without the position suffix).
    plt.xticks(ticks=list(range(len(tokens))), labels=tokens, fontsize=12, rotation=90)

In [14]:
def neg_pred_neg(y_test, y_pred, texts=None):
    """Return the texts whose true label and predicted label are both 1.

    Parameters
    ----------
    y_test : iterable
        True labels.
    y_pred : iterable
        Predicted labels, aligned with ``y_test``. (This argument is now used;
        an undefined global ``ans_test`` was previously read instead.)
    texts : iterable, optional
        Texts aligned with ``y_test``. When omitted, the notebook-level
        ``x_test`` is used, which keeps the two-argument call working.

    Returns
    -------
    list
        The matching texts, in input order.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    if texts is None:
        texts = x_test  # notebook-level test texts
    # Convert to lists so items are paired by position. Indexing a shuffled
    # pandas Series with 0..n-1 performs a label lookup, not a positional one.
    true_labels, predicted, samples = list(y_test), list(y_pred), list(texts)
    if not (len(true_labels) == len(predicted) == len(samples)):
        raise ValueError(
            "y_test, y_pred and texts must have the same length, got "
            f"{len(true_labels)}, {len(predicted)} and {len(samples)}"
        )
    return [
        sample
        for sample, truth, guess in zip(samples, true_labels, predicted)
        if truth == 1 and guess == 1
    ]

In [15]:
def _tokens_to_change(words, gradients):
    """Pick tokens scoring at least the mean, plus the word pieces that continue them."""
    if not gradients:
        return []
    avg = sum(gradients) / len(gradients)

    chosen = []
    inside_chosen_word = False
    for word, score in zip(words, gradients):
        if score >= avg:
            chosen.append(word)
            inside_chosen_word = True
        elif inside_chosen_word and word.startswith('##'):
            # A below-average WordPiece continuation ("##...") of a token that
            # was just selected is kept so the whole word is reported. '##' is
            # the continuation marker; a bare '#' token is a literal hash sign.
            chosen.append(word)
        else:
            inside_chosen_word = False
    return chosen


def whole_fun(s):
    """Explain the classifier's prediction for ``s`` and list its influential tokens.

    Uses the notebook-level ``model`` and ``tokenizer``. Prints the
    element-wise sigmoid of the logits, plots the per-token gradient scores,
    and returns the tokens whose score is at least the mean score (together
    with any ``##`` continuation pieces that directly follow them).

    Parameters
    ----------
    s : str
        Text to analyse.

    Returns
    -------
    list of str
        Selected tokens, in sentence order.
    """
    encoded_tokens = tokenizer(s, return_tensors="tf")
    # Feeding the token ids directly gives the same logits as multiplying a
    # one-hot matrix by the embedding table, without the vocabulary-sized tensor.
    pred_scores = model({
        "input_ids": encoded_tokens["input_ids"],
        "attention_mask": encoded_tokens["attention_mask"],
    }).logits
    # `expit` was never imported in this notebook; tf.nn.sigmoid is the same
    # function and needs no extra dependency.
    print(tf.nn.sigmoid(pred_scores).numpy())

    gradients, words, label = get_gradients(s, model, tokenizer)
    plot_gradients(words, gradients,  f"Prediction: {label} | {s} ")

    return _tokens_to_change(words, gradients)
    
        
    
#print(whole_fun(s))

In [16]:
# Load the dataset (the helper also prints the per-label counts); the bare `df`
# on the last line makes the notebook render the frame.
df = import_and_check_data()
df

0    7366
1    2694
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,News flash If a writer creates a SFC to answer...,0
1,Judgement time mkr hungrycampers,0
2,USER USER Baphomet sent me and my family dea...,0
3,USER Some woman was proper staring at me whi...,1
4,USER The reality is that the caliphate is mo...,1
...,...,...
10055,USER I need to be less uncomfortable talking a...,0
10056,am is early lol mkr,0
10057,Fml LINK,0
10058,Then this happened LINK,0


In [17]:
# Split tweets and labels into train and test parts (the helper prints both sizes).
x_train,x_test,y_train,y_test = train_test_split(df['tweet'], df['label'])


len of x_train  8048
len of x_test  2012


In [18]:
# Tokenize the splits and wrap them as batched tf.data datasets.
train_ds, test_ds = train_and_test_ds(x_train,x_test,y_train,y_test)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
model = create_model()
# whole_fun() reads a notebook-level `tokenizer`, which no other cell defines
# (train_and_test_ds keeps its tokenizer local), so create it next to the model.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading (…)"tf_model.h5";:   0%|          | 0.00/527M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# Fine-tune the model on train_ds for 3 epochs.
# NOTE: in the saved run this cell was interrupted during epoch 2 (see the
# KeyboardInterrupt in its output).
fit_model(model,3,train_ds)

Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [None]:
# Predictions on the training set. get_predictions also needs the true labels
# for its classification report, so pass them explicitly.
train_preds = get_predictions(model, train_ds, list(y_train))

# Loss and accuracy on the training set.
train_loss,train_accuracy = loss_accuracy(model,train_ds)


In [None]:
# Predictions on the test set, with the true labels for the classification report.
test_preds = get_predictions(model, test_ds, list(y_test))

test_loss, test_accuracy = loss_accuracy(model,test_ds)

In [None]:
# Intended to collect the test texts whose true label and predicted label are both 1.
lst = neg_pred_neg(y_test,test_preds)

In [None]:
print(whole_fun('LOL USER call me sexist But i really hate girl gamer teams really really hate lol'))
print()
print(whole_fun('LOL USER call me a misogynist but,i really hate female gamer teams really really hate lol'))

In [None]:
print(whole_fun('USER im not sexist but women just cant be comedians nor can they be rappers'))
print()
print(whole_fun('USER im not misogynistic but ladies really cant be entertainers nor can they be musicians'))

In [None]:
print(whole_fun('USER I swear Im not sexist but get these women outta here when it comes to football  '))
print()
print(whole_fun('USER I swear I dont discriminate against women but get these women out of here when it comes to football  '))
print()
print(whole_fun('USER I promise Im not misogynistic nevertheless get these ladies outta here when it comes to soccer  '))
print()
print(whole_fun('USER Im not a misogynistic at all, but can we please get these girls off the football pitch?'))

In [None]:
# Run the original tweet followed by four hand-written rewrites of it.
print(whole_fun('  USER Im not sexist but women shouldnt be allowed to drive'))
print()
print(whole_fun(" USER Im not misogynistic but ladies oughtn't be permitted to use motorvehicles"))
print()
print(whole_fun(" USER I don't discriminate against the womankind but ladies shouldn't be permitted to use motorvehicles"))
print()
print(whole_fun(" USER I don't discriminate against the womankind but women shouldn't be permitted to use motorvehicles"))
print()
print(whole_fun(" USER I don't discriminate against the womankind but they shouldn't be permitted to use motorvehicles"))


# Replaced a word with its meaning: "sexist" --> "discriminate against women".
# Replaced the repeated "women" with "they" (i.e. reduced the number of times a trigger word is repeated).

In [None]:
print(whole_fun('USER USER Might need some lube  The tip is wider than her hips'))
print()
print(whole_fun('USER USER Might need some lubricant  The end is more widespread than her middle'))
print()
print(whole_fun('USER USER Lubricant may be required The tip exceeds her hips in width.')) #rephrased one
print()
print(whole_fun('USER USER M1ght need some lub3  The t1p is wid3r than h3r h1ps')) #used leet speak on all words that have gradient above average
#leetspeak used on initial sentence
print()
print(whole_fun('USER USER Might need some lube  The tip is wider than her h1ps')) #used leet speak: LEVEL 1 (changed only highest word)
print()
print(whole_fun('USER USER Might need some lube  The tip is w1d3r than h3r h1ps')) #used leet speak: LEVEL 2 (changed second highest words)
print()
print(whole_fun('USER USER M1ght need some lubr1c4nt  The end is more w1d35pr3ad than h3r middle')) #leetspeak on sentence with min prob

In [None]:
print(whole_fun('USER Im not sexist but I do believe that men should provide for the family and the women should stay home and take care of …'))
print() #replace problem words with synonyms (picked synonyms that aren't in the whole dataset, probability reduced obviously)
print(whole_fun('USER Im not misogynistic but I think that menfolk should finance the family and the women should stay home and take care of …',))
print() #paraphrase the original sentence
print(whole_fun('USER Im not a sexist person, but I do think its the mans responsibility to provide for his family, while the woman should stay at home and take care of...'))
print() #rephrase in a slightly complicated way
print(whole_fun('USER While I am not a sexist person, I do feel that males should be the ones who provide for their families while women should remain at home and take care of...'))
print() #leetspeak on the original sentence
print(whole_fun('USER Im not sex1s7 but I do bel13v3 7ha7 m3n sh0uld pr0v1d3 for the fam1ly and the women should st4y home and take care of …'))


In [None]:
print(whole_fun('USER Im not sexist but women working construction just isnt the same'))
print() #replace with synonyms
print(whole_fun('USER Im not misogynistic but ladies working construction just isnt the same'))
print() #paraphrase
print(whole_fun('USER Im not trying to be sexist, but its just not the same to have women working in construction.'))
print() #paraphrase a bit complicated
print(whole_fun('USER I dont mean to be offensive, but theres something missing when female workers build.'))
print() #leetspeak on high gradient words
print(whole_fun(''))