# **Sarcasm Detector**

## Load Data & Install Necessary Packages

In [1]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0


In [2]:
!gdown --id 1ytPDo88FEC2ArOjdqErAiarAZBNJzEJz

Downloading...
From: https://drive.google.com/uc?id=1ytPDo88FEC2ArOjdqErAiarAZBNJzEJz
To: /content/SarcasmDetect.json
100% 6.06M/6.06M [00:00<00:00, 307MB/s]


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m118.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.26.0


In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tqdm
import transformers
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

%matplotlib inline

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
df = pd.read_json('./SarcasmDetect.json', lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


# **Logistic Regression Model**

# **Prepare the Data**

In [4]:
#Remove empty rows if any
df = df[df['headline'] != '']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 894.3+ KB


In [5]:
df.shape

(28619, 3)

In [6]:
df.is_sarcastic.value_counts(normalize = True)

0    0.523603
1    0.476397
Name: is_sarcastic, dtype: float64

In [7]:
# build train and test datasets
X = df['headline'].values
y = df['is_sarcastic'].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((20033,), (8586,))

In [9]:
# Check to make sure DV distributed evening between test and train
temp = pd.DataFrame(y_train)
temp.value_counts(normalize = True)

0    0.525633
1    0.474367
dtype: float64

## **Preprocess the data**

In [10]:
stop_words = nltk.corpus.stopwords.words('english')
#stop_words

In [11]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A) #re.sub(r'[^a-zA-Z\s]', '', doc), re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)

    # NOT doing this for now filter stopwords out of document
    #filtered_tokens = [token for token in tokens if token not in stop_words]

    # pos tag tokenized text
    tagged_tokens = nltk.pos_tag(tokens) #change to filtered_tokens if filtering stopwords

    # convert raw POS tags into wordnet tags
    tag_map = {'j': wordnet.ADJ, 'v': wordnet.VERB, 'n': wordnet.NOUN, 'r': wordnet.ADV}
    
    # treat unknown tags as nouns by default
    new_tagged_tokens = [(word, tag_map.get(tag[0].lower(),
                                            wordnet.NOUN))
                              for word, tag in tagged_tokens]

    #lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in new_tagged_tokens)

    # re-create document from filtered tokens
    #doc = ' '.join(filtered_tokens)
    #return doc
    
    doc = ' '.join(wnl.lemmatize(word, tag) for word, tag in new_tagged_tokens)
    return doc

## **Create a bag of words & vectorize**

In [12]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [31]:
X_train.info() #check to see what format the  data is in. Note that the column is called 0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20033 entries, 0 to 20032
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       20033 non-null  object
dtypes: object(1)
memory usage: 156.6+ KB


In [32]:
X_train = X_train[0].apply(normalize_document) #apply the function to the column
X_test = X_test[0].apply(normalize_document)

In [34]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(X_train)


# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(X_train)

CPU times: user 909 ms, sys: 42.4 ms, total: 951 ms
Wall time: 948 ms


In [35]:
X_train

0        american express to offer 5 month of paternity...
1        watch dolphin knock standup paddleboarder off ...
2                 man who enjoy thing informed he be wrong
3        jonathan lipnicki to star a young dark helmet ...
4        publicist worry kanye west support of trump wi...
                               ...                        
20028    turnout low than expect for gala central afric...
20029    retreat clinton campaign torch iowa town to sl...
20030    national weather service to give hurricane ful...
20031              christ return for some of his old thing
20032     loophole in curse let archaeologist off the hook
Name: 0, Length: 20033, dtype: object

In [36]:
cv_train_features

<20033x8427 sparse matrix of type '<class 'numpy.int64'>'
	with 209326 stored elements in Compressed Sparse Row format>

In [37]:
%%time

# transform test reviews into features
cv_test_features = cv.transform(X_test)
tv_test_features = tv.transform(X_test)

CPU times: user 256 ms, sys: 795 µs, total: 257 ms
Wall time: 256 ms


## **Analysis & Evaluation: Logistic Regression Model**

In [38]:
%%time

# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model
lr.fit(cv_train_features, y_train)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

CPU times: user 441 ms, sys: 1.77 ms, total: 442 ms
Wall time: 438 ms


In [39]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative', 'positive']
print(classification_report(y_test, lr_bow_predictions))
pd.DataFrame(confusion_matrix(y_test, lr_bow_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      4455
           1       0.83      0.82      0.82      4131

    accuracy                           0.83      8586
   macro avg       0.83      0.83      0.83      8586
weighted avg       0.83      0.83      0.83      8586



Unnamed: 0,negative,positive
negative,3741,714
positive,743,3388


In [40]:
%%time

# Logistic Regression model on TF-IDF features

# train model
lr.fit(tv_train_features, y_train)

# predict on test data
lr_tfidf_predictions = lr.predict(tv_test_features)

CPU times: user 202 ms, sys: 0 ns, total: 202 ms
Wall time: 201 ms


In [41]:
labels = ['negative', 'positive']
print(classification_report(y_test, lr_tfidf_predictions))
pd.DataFrame(confusion_matrix(y_test, lr_tfidf_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.84      0.83      0.83      4455
           1       0.82      0.83      0.82      4131

    accuracy                           0.83      8586
   macro avg       0.83      0.83      0.83      8586
weighted avg       0.83      0.83      0.83      8586



Unnamed: 0,negative,positive
negative,3694,761
positive,721,3410


#**NNLM Model with Universal Sentence Encoder**

## **Split the data (no need to preprocess)**

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((20033,), (8586,))

## **Download the embedding and create the model**

In [42]:
model = "https://tfhub.dev/google/universal-sentence-encoder/4"
hub_layer = hub.KerasLayer(model, output_shape=[512], input_shape=[], #passing in the embedding model
                           dtype=tf.string, trainable=True)

In [43]:
# create the model
model = tf.keras.models.Sequential()

model.add(hub_layer)

model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))

model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 256,995,201
Trainable params: 256,995,201


## **Analysis & Evaluation: NNLM Model**

In [47]:
# Fit the model
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=2,
                                      restore_best_weights=True,
                                      verbose=1)

model.fit(X_train, y_train, 
          epochs=10, 
          batch_size=128, 
          shuffle=True,
          callbacks=[es],
          verbose=1)

Epoch 1/10



Epoch 2/10



Epoch 3/10



Epoch 4/10



Epoch 5/10



Epoch 6/10



Epoch 7/10



Epoch 8/10



Epoch 9/10



Epoch 10/10





<keras.callbacks.History at 0x7ff7c01fc430>

In [48]:
predictions = model.predict(X_test, batch_size=512, verbose=0).ravel()
predictions = [1 if prob > 0.5 else 0 for prob in predictions]
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions)*100))
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

Accuracy: 87.00%
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      4455
           1       0.86      0.87      0.87      4131

    accuracy                           0.87      8586
   macro avg       0.87      0.87      0.87      8586
weighted avg       0.87      0.87      0.87      8586



Unnamed: 0,0,1
0,3875,580
1,536,3595


#**BERT(Bi-directional Encoder Representations from Transformers)**

## **Prepare the Data**

In [49]:
X_Train, X_test, y_Train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_Train, y_Train, test_size=0.2, random_state=42)

## **Preprocess the Data**
Headlines need to match BERT's needs

In [51]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [52]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    
    all_ids, all_masks, all_segments= [], [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        
        tokens = tokenizer.tokenize(doc)
        
        if len(tokens) > max_seq_length-2:
            tokens = tokens[0 : (max_seq_length-2)]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        ids = tokenizer.convert_tokens_to_ids(tokens)
        masks = [1] * len(ids) # [1,1,1.....] # < 500 ones
        
        # Zero-pad up to the sequence length.
        while len(ids) < max_seq_length:
            ids.append(0)
            masks.append(0)
            
        segments = [0] * max_seq_length # [0,0,0...] # 500 zeros This is because we are not trying to predict the next sentence. Therefore, we don't separate between different segments (with 1s and 0s).
        all_ids.append(ids)
        all_masks.append(masks)
        all_segments.append(segments)
        
    encoded = np.array([all_ids, all_masks, all_segments])
    
    return encoded

In [54]:
train_features_ids, train_features_masks, train_features_segments = create_bert_input_features(tokenizer, 
                                                                                               X_train, 
                                                                                               max_seq_length=MAX_SEQ_LENGTH)

val_features_ids, val_features_masks, val_features_segments = create_bert_input_features(tokenizer, 
                                                                                         X_val, 
                                                                                         max_seq_length=MAX_SEQ_LENGTH)
test_features = create_bert_input_features(tokenizer, X_test, max_seq_length=MAX_SEQ_LENGTH)
print('Train Features:', train_features_ids.shape, train_features_masks.shape, train_features_segments.shape)
print('Val Features:', val_features_ids.shape, val_features_masks.shape, val_features_segments.shape)

Converting docs to features: 100%|██████████| 18316/18316 [00:05<00:00, 3298.62it/s]
Converting docs to features: 100%|██████████| 4579/4579 [00:01<00:00, 3484.28it/s]
Converting docs to features: 100%|██████████| 5724/5724 [00:02<00:00, 2574.78it/s]


Train Features: (18316, 30) (18316, 30) (18316, 30)
Val Features: (4579, 30) (4579, 30) (4579, 30)


## **Create the model**

In [53]:
MAX_SEQ_LENGTH = 30

inp_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids")
inp_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks")
inp_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_segment_ids")
inputs = [inp_id, inp_mask, inp_segment]

hidden_state = transformers.TFBertModel.from_pretrained('bert-base-uncased')(inputs) 
pooled_output = hidden_state[1] #pooled embedding is 768 9sth like that)

dense1 = tf.keras.layers.Dense(256, activation='relu')(pooled_output) 
drop1 = tf.keras.layers.Dropout(0.25)(dense1)
dense2 = tf.keras.layers.Dense(256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(0.25)(dense2)

output = tf.keras.layers.Dense(1, activation='sigmoid')(drop2)

model = tf.keras.Model(inputs=inputs, outputs=output)

model.compile(optimizer=tf.optimizers.Adam(learning_rate=2e-5, 
                                           epsilon=1e-08), 
              loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 bert_input_ids (InputLayer)    [(None, 30)]         0           []                               
                                                                                                  
 bert_input_masks (InputLayer)  [(None, 30)]         0           []                               
                                                                                                  
 bert_segment_ids (InputLayer)  [(None, 30)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['bert_input_ids[0][0]',         
                                thPoolingAndCrossAt               'bert_input_masks[0][0]',   

## **Analysis & Evaluation: BERT Model**

In [55]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=1,
                                      restore_best_weights=True,
                                      verbose=1)
model.fit([train_features_ids,
           train_features_masks, 
           train_features_segments], y_train, 
          validation_data=([val_features_ids, 
                            val_features_masks, 
                            val_features_segments], y_val),
          epochs=3, 
          batch_size=13, #cannot go above 13 - this is because you'll run out of memory in Co-Lab.
          callbacks=[es],
          shuffle=True,
          verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 3: early stopping


<keras.callbacks.History at 0x7ff4e3d6e0a0>

In [56]:
model.save_weights('bert_ft_wts.h5')

In [57]:
test_features_ids, test_features_masks, test_features_segments = create_bert_input_features(tokenizer, 
                                                                                             X_test, #test_reviews, 
                                                                                             max_seq_length=MAX_SEQ_LENGTH)
print('Test Features:', test_features_ids.shape, test_features_masks.shape, test_features_segments.shape)

Converting docs to features: 100%|██████████| 5724/5724 [00:01<00:00, 3352.94it/s]


Test Features: (5724, 30) (5724, 30) (5724, 30)


In [58]:
predictions = [1 if pr > 0.5 else 0 
                   for pr in model.predict([test_features_ids, 
                                            test_features_masks, 
                                            test_features_segments], verbose=0).ravel()]

print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions)*100))
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

Accuracy: 92.87%
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      2995
           1       0.93      0.92      0.92      2729

    accuracy                           0.93      5724
   macro avg       0.93      0.93      0.93      5724
weighted avg       0.93      0.93      0.93      5724



Unnamed: 0,0,1
0,2806,189
1,219,2510


#**DistilBERT (Distilled BERT)**

## **Prepare the Data**

In [67]:
X_Train, X_test, y_Train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_Train, y_Train, test_size=0.2, random_state=42)

## **Preprocess the Data**

In [64]:
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [65]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    
    all_ids, all_masks = [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        
        tokens = tokenizer.tokenize(doc)
        
        if len(tokens) > max_seq_length-2:
            tokens = tokens[0 : (max_seq_length-2)]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        ids = tokenizer.convert_tokens_to_ids(tokens)
        masks = [1] * len(ids)
        
        # Zero-pad up to the sequence length.
        while len(ids) < max_seq_length:
            ids.append(0)
            masks.append(0)
            
        all_ids.append(ids)
        all_masks.append(masks)
        
    encoded = np.array([all_ids, all_masks])
    
    return encoded

In [68]:
train_features_ids, train_features_masks = create_bert_input_features(tokenizer, X_train, 
                                                                      max_seq_length=MAX_SEQ_LENGTH)
val_features_ids, val_features_masks = create_bert_input_features(tokenizer, X_val, 
                                                                  max_seq_length=MAX_SEQ_LENGTH)
#test_features = create_bert_input_features(tokenizer, test_reviews, max_seq_length=MAX_SEQ_LENGTH)
print('Train Features:', train_features_ids.shape, train_features_masks.shape)
print('Val Features:', val_features_ids.shape, val_features_masks.shape)

Converting docs to features: 100%|██████████| 18316/18316 [00:05<00:00, 3467.17it/s]
Converting docs to features: 100%|██████████| 4579/4579 [00:01<00:00, 3547.65it/s]

Train Features: (18316, 30) (18316, 30)
Val Features: (4579, 30) (4579, 30)





## **Create the model**

In [66]:
MAX_SEQ_LENGTH = 30

inp_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids")
inp_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks")
inputs = [inp_id, inp_mask]

hidden_state = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')(inputs)[0]
pooled_output = hidden_state[:, 0]    

dense1 = tf.keras.layers.Dense(256, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(0.25)(dense1)
dense2 = tf.keras.layers.Dense(256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(0.25)(dense2)

output = tf.keras.layers.Dense(1, activation='sigmoid')(drop2)


model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(optimizer=tf.optimizers.Adam(learning_rate=2e-5, 
                                           epsilon=1e-08), 
              loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 bert_input_ids (InputLayer)    [(None, 30)]         0           []                               
                                                                                                  
 bert_input_masks (InputLayer)  [(None, 30)]         0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['bert_input_ids[0][0]',         
 BertModel)                     ast_hidden_state=(N               'bert_input_masks[0][0]']       
                                one, 30, 768),                                                    
                                 hidden_states=None                                         

## **Analysis & Evaluation: DistilBERT Model**

In [69]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=1,
                                      restore_best_weights=True,
                                      verbose=1)
model.fit([train_features_ids, 
           train_features_masks], y_train, 
          validation_data=([val_features_ids, 
                            val_features_masks], y_val),
          epochs=3, 
          batch_size=20, 
          shuffle=True,
          callbacks=[es],
          verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 2: early stopping


<keras.callbacks.History at 0x7ff64a3c06a0>

In [70]:
model.save_weights('distillbert_ft_wts.h5')

In [71]:
test_features_ids, test_features_masks = create_bert_input_features(tokenizer, X_test, 
                                                                    max_seq_length=MAX_SEQ_LENGTH)
print('Test Features:', test_features_ids.shape, test_features_masks.shape)

Converting docs to features: 100%|██████████| 5724/5724 [00:01<00:00, 3456.47it/s]

Test Features: (5724, 30) (5724, 30)





In [72]:
predictions = [1 if pr > 0.5 else 0 
                   for pr in model.predict([test_features_ids, 
                                            test_features_masks], batch_size=200, verbose=0).ravel()]

print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions)*100))
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

Accuracy: 91.65%
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      2995
           1       0.91      0.92      0.91      2729

    accuracy                           0.92      5724
   macro avg       0.92      0.92      0.92      5724
weighted avg       0.92      0.92      0.92      5724



Unnamed: 0,0,1
0,2743,252
1,226,2503
