In [1]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
%%capture
!python3 -m venv venv
!source venv/bin/activate
!pip install tensorflow transformers

In [3]:
def process_data(str):
  #'/content/drive/MyDrive/~MTSU/research/Corpus/GEN-sarc-notsarc.csv'
  dataframe = pd.read_csv(str,encoding='ISO-8859-1')
  del dataframe['id']
  dataframe['label'] = dataframe['class'].map({'notsarc':0,'sarc':1})
  dataframe['label'] = dataframe['label'].astype('category').cat.codes
  dataframe = dataframe.dropna(how='any',axis=0)
  dataframe = shuffle(dataframe)  
  return dataframe

In [4]:
def train_model(model, x_t, x_v, y_t,  y_v, m_t, m_v):
  history = model.fit(model.fit(x_t, y=y_t, batch_size=BERT_BATCH_SIZE, epochs=NB_EPOCHS), validation_data=(x_v,y_v))
  plt.plot(history.history['loss'])
  plt.plot(history.history['val_loss'])
  plt.legend(['Training Loss','Validation Loss'])
  plt.xlabel('Epoch')
  plt.ylabel('CCE Loss')
  plt.show()
  plt.plot(history.history['categorical_accuracy'])
  plt.plot(history.history['val_categorical_accuracy'])
  plt.legend(['Training','Validation'])
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy (P)')
  plt.show()

In [23]:
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pickle
import pandas as pd
import numpy as np
from numpy import std
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # suppress Tensorflow messages
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.model_selection import train_test_split,KFold , StratifiedKFold
import keras

#DISTILBERT TOKENIZATION




EXAMPLE of TOKENIZATION PROCESS

In [7]:
pd.set_option('display.max_colwidth', None)
%load_ext tensorboard


bert_log_dir='/content/drive/MyDrive/~MTSU/research/data/Distilbert/Distilbert_Model/'
bert_model_save_path='/content/drive/MyDrive/~MTSU/research/data/Distilbert_Model/Distilbert_model.h5'

In [8]:
BERT_EMB_DIM = 200
BERT_CNN_FILTERS = 100
BERT_DNN_UNITS = 256
BERT_OUTPUT_CLASSES = 2
BERT_BATCH_SIZE = 32
BERT_DROPOUT_RATE = 0.5
MAX_LEN = 200
NB_EPOCHS = 4
k = 2
BATCH_SIZE = 32
kf = KFold(n_splits=k, shuffle=False)
MODEL_NAME = 'distilbert-base-uncased'

BERT_callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=bert_model_save_path,save_weights_only=True,monitor='val_accuracy',mode='max',save_best_only=False,save_freq=NB_EPOCHS),keras.callbacks.TensorBoard(log_dir=bert_log_dir)]

In [9]:
sarc_df = process_data('/content/drive/MyDrive/~MTSU/research/Corpus/GEN-sarc-notsarc.csv')

In [10]:
sarc_df.head()

Unnamed: 0,class,text,label
748,notsarc,Then why does the NRA use that quote at this link? \r\n http://www.nraila.org/Issues/FactSheets/Read.aspx?ID=108,0
4152,sarc,You want to explain to me what the age of consent was in 1st century Palestine?\r\nAnd rejoicing really sounds like fear and intimidation to me. emoticonXRolleyes,1
4116,sarc,The Fact that you believe there is some sort of salvation in Jebus proves you are the dumb one.. You should stop rejecting reality ....,1
5253,notsarc,And yet there are plenty of websites written by scientists with the details:,0
5983,sarc,"Let me help you with that. emoticonXAngel She's a conservative!\r\nI'll bet her upcoming debate with Biden will get far more attention than McCain/Obama did! I'll sure be watching! Word has it that the moderator is an Obama lover. \r\nI'm just surprised that so many consider it so important how one performs in these quick witted, personality contests. Just because a person is slightly off the mark in wits doesn't mean he/she would not be a good leader - able to make sound political decisions. \r\n",1


In [11]:
X = sarc_df['text'].to_list()
Y = sarc_df['label'].to_list()

In [12]:
from sklearn.model_selection import train_test_split

# Split Train and Validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(X, Y, test_size=0.2, random_state=0)

# Keep some data for inference (testing)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=0.01, random_state=0)

In [13]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [14]:
MODEL_NAME = 'distilbert-base-uncased'
phrase = X[0]

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

inputs = tokenizer(phrase, max_length=MAX_LEN, truncation=True, padding=True)

print(f'Phrase: \'{phrase}\'')
print(f'input ids: {inputs["input_ids"]}')
print(f'attention mask: {inputs["attention_mask"]}')

Phrase: 'Then why does the NRA use that quote at this link? 
 http://www.nraila.org/Issues/FactSheets/Read.aspx?ID=108'
input ids: [101, 2059, 2339, 2515, 1996, 17212, 2050, 2224, 2008, 14686, 2012, 2023, 4957, 1029, 8299, 1024, 1013, 1013, 7479, 1012, 17212, 12502, 2050, 1012, 8917, 1013, 3314, 1013, 8866, 21030, 3215, 1013, 3191, 1012, 2004, 2361, 2595, 1029, 8909, 1027, 10715, 102]
attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [15]:
# We classify two labels in this example. In case of multiclass classification, adjust num_labels value
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                              num_labels=2)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [16]:
def construct_encodings(x, tkzr, max_len, trucation=True, padding=True):
    return tkzr(x, max_length=max_len, truncation=trucation, padding=padding)
    

In [17]:
encodings = construct_encodings(X, tokenizer, MAX_LEN)

In [18]:
input_ids, attention_masks = encodings['input_ids'], encodings['attention_mask']

# START OF k-FOLD 

In [19]:
train_encodings = tokenizer(train_texts,
                            truncation=True,
                            padding=True, 
                            return_tensors="tf")
val_encodings = tokenizer(val_texts,
                          truncation=True,
                          padding=True,
                          return_tensors="tf")

In [20]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [22]:
for train_index, val_index in kf.split(train_encodings, val_encodings):
      index = 0
      print("Training on fold " + str(index+1) + "/2..." )   # Generate batches from indices
      train_dataset = tf.data.Dataset.from_tensor_slices((
          dict(train_encodings),
          train_labels
      ))

      val_dataset = tf.data.Dataset.from_tensor_slices((
          dict(val_encodings),
          val_labels
      ))
     
      model = None
      model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

      optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
      model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
      optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
      model.compile(optimizer=optimizer, 
                    loss=SparseCategoricalCrossentropy(from_logits=True), 
                    metrics=['accuracy'])
      model.fit(train_dataset.shuffle(100).batch(32),
                epochs=3,
                batch_size=32,
                validation_data=val_dataset.shuffle(100).batch(32))
      test_sentence = "With their homes in ashes, residents share harrowing tales of survival after massive wildfires kill 15"
      test_sentence_sarcasm = "News anchor hits back at viewer who sent her snarky note about ‘showing too much cleavage’ during broadcast"

      # replace to test_sentence_sarcasm variable, if you want to test sarcasm
      predict_input = tokenizer.encode(test_sentence,
                                      truncation=True,
                                      padding=True,
                                      return_tensors="tf")

      tf_output = model.predict(predict_input)[0]
      print(tf_output)

      tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
      print(tf_prediction)

      # 9.9978644e-01 = 0.99978644
      # 2.1356659e-04 = 0.00021356659
      # => sentiment = 0
      predict_input

Training on fold 1/2...


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3
[[ 4.757387   0.4234966 -2.9918168 -3.1696095 -2.7723002]]
[9.8576051e-01 1.2929648e-02 4.2494724e-04 3.5573001e-04 5.2926078e-04]
Training on fold 1/2...


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3
[[ 4.9733663  0.839418  -3.564312  -3.6232696 -3.734241 ]]
[9.8370451e-01 1.5758457e-02 1.9275174e-04 1.8171598e-04 1.6262932e-04]


In [26]:
model.save_pretrained("/tmp/sentiment_custom_model")

In [None]:
#### Load saved model and run predict function

In [27]:
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("/tmp/sentiment_custom_model")

Some layers from the model checkpoint at /tmp/sentiment_custom_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /tmp/sentiment_custom_model and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
test_sentence = "With their homes in ashes, residents share harrowing tales of survival after massive wildfires kill 15"
test_sentence_sarcasm = "News anchor hits back at viewer who sent her snarky note about ‘showing too much cleavage’ during broadcast"

# replace to test_sentence_sarcasm variable, if you want to test sarcasm
predict_input = tokenizer.encode(test_sentence,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

tf_output = loaded_model.predict(predict_input)[0]
print(tf_output)

[[ 4.2609034  2.767987  -3.9525511 -4.2107472 -3.9016755]]


In [32]:
tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
print(tf_prediction)

# 9.9978644e-01 = 0.99978644
# 2.1356659e-04 = 0.00021356659
# => sentiment = 0

[8.1600565e-01 1.8336980e-01 2.2112370e-04 1.7080549e-04 2.3266459e-04]


In [30]:
predict_input

<tf.Tensor: shape=(1, 21), dtype=int32, numpy=
array([[  101,  2007,  2037,  5014,  1999, 11289,  1010,  3901,  3745,
        24560,  2075,  7122,  1997,  7691,  2044,  5294,  3748, 26332,
         3102,  2321,   102]], dtype=int32)>