In [None]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
from google.colab import drive
import pandas as pd
import math
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model

In [None]:
# Mount Google Drive at /content/gdrive/ so the CSV files stored there can be
# read; force_remount=True asks for a fresh mount even if one already exists.
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Location of the CSV files on the mounted Drive, and their file names.
data_dir = '/content/gdrive/MyDrive'
filename = 'p2_train.csv'
test_filename = 'p2_test.csv'
# Copy the training CSV into the current working directory.
# NOTE(review): the read_csv calls in the following cells load straight from
# data_dir, so this local copy looks unused in the visible cells — confirm
# before removing.
!cp $data_dir/$filename .

In [None]:
# Load the training split from Drive and keep only the two columns used
# downstream: the text and its label.
train_csv_path = f'{data_dir}/{filename}'
df_train = pd.read_csv(train_csv_path).loc[:, ['text', 'label']]
df_train

Unnamed: 0,text,label
0,longlost pictur archiv celebr glamour seneg wr...,0
1,advanc understand natur world respons current ...,1
2,itali test western economi bear almost total s...,1
3,south africa box granni juke jab way healthier...,1
4,coronaviru pandem nation tragedi hundr thousan...,1
...,...,...
1700,bodi pile morgu iran feel strain coronaviru cn...,0
1701,bbc bank england consid introduct electron ban...,0
1702,need know coronaviru thursday march version st...,0
1703,transcript return transcript main page cnn tra...,0


In [None]:
# Load the test split from Drive and keep only the two columns used
# downstream: the text and its label.
test_csv_path = f'{data_dir}/{test_filename}'
df_test = pd.read_csv(test_csv_path).loc[:, ['text', 'label']]
df_test

Unnamed: 0,text,label
0,latest headlin cnn busi great shutdown newsroo...,0
1,china japan bud relationship time coronaviru t...,0
2,coronaviru could hurt boe max crisi new york c...,0
3,hong kong protest unrest crimin gener gener cr...,0
4,new art district aim brighten chennai commun d...,0
...,...,...
421,suggest polit affili play role afraid peopl co...,1
422,stephen sondheim american compos lyricist one ...,1
423,ahm janko survivor brutal boko haram attack no...,1
424,step invest contact trace expand test today us...,1


In [None]:
# Separate each frame into model inputs (text column) and targets (label column).
X_train, y_train = df_train['text'], df_train['label']
X_test, y_test = df_test['text'], df_test['label']

In [None]:
# Sanity check: inputs and labels of each split should have the same length.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((1705,), (426,), (1705,), (426,))

In [None]:
PAD_TOKEN = "<|pad|>"
EOS_TOKEN = "<|endoftext|>"

# Fixed sequence length (in tokens) handed to every tokenizer call below.
MAX_LENGTH = 100

# Download and initialise the pretrained GPT-2 tokenizer, registering the pad
# and end-of-text tokens defined above.
# max_length and is_split_into_words are arguments of the tokenizer *call*,
# not of from_pretrained, where they are silently ignored; the length limit is
# passed explicitly wherever the tokenizer is called.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    pad_token=PAD_TOKEN,
    eos_token=EOS_TOKEN,
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:

def _with_eos(text):
    """Return ``text`` as a string ending in EOS_TOKEN.

    The marker is appended only when the string does not already end with it.
    """
    text = str(text)
    return text if text.endswith(EOS_TOKEN) else text + EOS_TOKEN


# This cell rebinds X_train / X_test, so it must be safe to run more than once:
# without the guard in _with_eos a re-run would append a second EOS marker.
X_train = [_with_eos(ex) for ex in X_train]
X_test = [_with_eos(ex) for ex in X_test]

In [None]:
# Tokenizer arguments shared by both splits: truncate or pad every example to
# MAX_LENGTH tokens. padding='max_length' replaces the deprecated
# pad_to_max_length=True flag.
encode_kwargs = dict(
    return_tensors='tf',
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
)

# One id tensor per example, each carrying a leading singleton axis.
X_train_ = [tokenizer(str(x), **encode_kwargs)['input_ids'] for x in X_train]
X_test_ = [tokenizer(str(x), **encode_kwargs)['input_ids'] for x in X_test]

# Stack the per-example tensors and drop the singleton axis so that each row
# holds the token ids of one example.
X_train_in = tf.squeeze(tf.convert_to_tensor(X_train_), axis=1)
X_test_in = tf.squeeze(tf.convert_to_tensor(X_test_), axis=1)



In [None]:

# Inspect the token ids of the first training example.
X_train_in[0]

<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 6511, 33224,  2862,   333,  3934,   452,  4681, 27240,   454,
        3308,  1533,  3194,   267, 13034,   289,   692,   392,   269,
       20471, 29445,   636,    72, 33070,  1402,  1097, 13986,   489,
          72,  6576,  1450, 12705,  4676, 26617,  1790,  4190,  2415,
        1302,   410,   274,  8957,  3912,  6576, 23568, 31172,   686,
        1362,   288,   292,   346,  6862, 27240,   273,  4590,  3308,
        1533,   743,  2823,  2042,  2330,   435,   452,  3124,   649,
        2922,  2739,  8408,  3590,  3181,  1657,   890,  2626,  2469,
        1043,  1363,  7521, 21266,  4681,  1352,    72,  2862,   333,
        1204,  7421,  6580, 37189,   954,   380,  1182,  3812,  3485,
        7633,    72, 46754,  1097,  3952, 12198,   312,   686,  1362,
         288], dtype=int32)>

In [None]:
# Tokenizer arguments shared by both splits: truncate or pad every example to
# MAX_LENGTH tokens. padding='max_length' replaces the deprecated
# pad_to_max_length=True flag.
encode_kwargs = dict(
    return_tensors='tf',
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
)

# One attention mask per example, each carrying a leading singleton axis.
X_train_mask_ = [tokenizer(str(x), **encode_kwargs)["attention_mask"] for x in X_train]
X_test_mask_ = [tokenizer(str(x), **encode_kwargs)["attention_mask"] for x in X_test]

# Stack the per-example masks and drop the singleton axis so that each row
# holds the mask of one example.
X_train_mask = tf.squeeze(tf.convert_to_tensor(X_train_mask_), axis=1)
X_test_mask = tf.squeeze(tf.convert_to_tensor(X_test_mask_), axis=1)
     


In [None]:

# Load the pretrained GPT-2 model, passing the tokenizer's pad and end-of-text
# ids (and use_cache=False) through to from_pretrained.
model = TFGPT2Model.from_pretrained("gpt2", use_cache=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id)
# NOTE(review): this is a plain attribute assignment on the model object; it is
# not clear from this notebook that anything reads it — confirm it is needed.
model.training = True

Downloading tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [None]:

# Resize the model's token embeddings to the tokenizer's vocabulary size, which
# includes the special tokens added when the tokenizer was created.
model.resize_token_embeddings(len(tokenizer))

<transformers.modeling_tf_utils.TFSharedEmbeddings at 0x7f5644de5c00>

In [None]:

# Mark every top-level layer of the GPT-2 model as non-trainable.
# NOTE(review): the summary printed by the next cell still reports 38,598,144
# trainable parameters, so not every weight appears to be frozen by this loop —
# confirm whether that is intended.
for layer in model.layers:
    layer.trainable = False

In [None]:

# Print the layer and parameter overview of the GPT-2 model after freezing.
model.summary()

Model: "tfgpt2_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124440576 
 r)                                                              
                                                                 
Total params: 163,038,720
Trainable params: 38,598,144
Non-trainable params: 124,440,576
_________________________________________________________________


In [None]:

# Functional-API classification head on top of the GPT-2 model.
# NOTE: `input` shadows the Python builtin; the name is kept because the cell
# that builds the Keras model refers to it.
input = tf.keras.layers.Input(shape=(None,), dtype='int32')
mask = tf.keras.layers.Input(shape=(None,), dtype='int32')
x = model(input, attention_mask=mask)
hidden = x.last_hidden_state

# Masked mean pooling: average the hidden states of real tokens only. A plain
# reduce_mean over the sequence axis would also average in the positions that
# the attention mask marks as padding.
mask_f = tf.expand_dims(tf.cast(mask, hidden.dtype), axis=-1)
# Guard against division by zero for a (hypothetical) all-padding row.
token_count = tf.maximum(tf.reduce_sum(mask_f, axis=1), 1.0)
x = tf.reduce_sum(hidden * mask_f, axis=1) / token_count

x = tf.keras.layers.Dense(16, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
# NOTE(review): three output units, while the classification report at the end
# of the notebook lists only classes 0 and 1 — confirm the number of classes.
output = tf.keras.layers.Dense(3, activation='softmax')(x)

In [None]:

# Wire the two inputs (token ids, attention mask) to the softmax output.
clf = tf.keras.Model(inputs=[input, mask], outputs=output)

In [None]:
# Print the layer and parameter overview of the full classifier.
clf.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 tfgpt2_model (TFGPT2Model)     TFBaseModelOutputWi  124440576   ['input_1[0][0]',                
                                thPastAndCrossAtten               'input_2[0][0]']                
                                tions(last_hidden_s                                               
                                tate=(None, None, 7                                           

In [None]:
# Optimisation setup: Adam with a fixed learning rate, and sparse categorical
# cross-entropy so that integer class labels can be used with the softmax output.
base_learning_rate = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy()

clf.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)
     


In [None]:
# Early stopping: halt once validation accuracy has not improved for 3 epochs
# and restore the weights of the best epoch.
# The fit call uses validation_split, so "val_accuracy" is available; watching
# the training "accuracy" instead would select weights by how well they fit the
# training data rather than by held-out performance.
callbacks = tf.keras.callbacks.EarlyStopping(
        monitor="val_accuracy", verbose=1, patience=3, restore_best_weights=True)
     


In [None]:

# Convert the label columns of both splits into int32 tensors.
y_train_in = tf.constant(y_train, dtype=tf.int32)
y_test_in = tf.constant(y_test, dtype=tf.int32)

In [None]:

# Run tf.function-decorated code eagerly. This uses the non-experimental API
# name, as advised by the deprecation message the experimental alias emits.
tf.config.run_functions_eagerly(True)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [None]:
# Train the classifier on the tokenised training set, holding out 20% of it
# for validation and applying the early-stopping callback.
train_inputs = [X_train_in, X_train_mask]
history = clf.fit(
    train_inputs,
    y_train_in,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    callbacks=callbacks,
)


Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate on the test split; returns the loss followed by the compiled
# accuracy metric.
clf.evaluate([X_test_in, X_test_mask], y_test_in)



[0.632915198802948, 0.8333333134651184]

In [None]:

# NOTE(review): plain attribute assignment on the Keras model; it is not clear
# from this notebook that predict() reads it — confirm it is needed.
clf.training = False
# Softmax class probabilities for every test example.
y_pred = clf.predict([X_test_in, X_test_mask])



In [None]:

# Pick the index of the highest-probability class for each test example.
y_pred_out = tf.math.argmax(y_pred, axis=-1)
y_pred_out

<tf.Tensor: shape=(426,), dtype=int64, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
     

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
     

In [None]:
# Per-class precision / recall / F1 of the predictions against the true test
# labels; print() is used so the report's line breaks render as a table.
print(classification_report(y_test_in, y_pred_out))


              precision    recall  f1-score   support

           0       0.79      0.87      0.83       194
           1       0.88      0.80      0.84       232

    accuracy                           0.83       426
   macro avg       0.83      0.84      0.83       426
weighted avg       0.84      0.83      0.83       426

