In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
     ---------------------------------------- 5.8/5.8 MB 3.5 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-win_amd64.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 5.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
     -------------------------------------- 182.4/182.4 kB 3.7 MB/s eta 0:00:00
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers  import BertTokenizer

In [4]:
train_data = pd.read_csv('../Dataset/cleaned_train.csv', encoding='utf-8')
test_data = pd.read_csv('../Dataset/cleaned_dev.csv', encoding='utf-8')

In [39]:
train_data['stance'].value_counts()

 1    5538
 0    1012
-1     438
Name: stance, dtype: int64

In [6]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
token = tokenizer.encode_plus(train_data['text'].iloc[0], truncation=True, max_length=512, pad_to_max_length=True, add_special_tokens=True, return_tensors='tf')
token_test = tokenizer.encode_plus(test_data['text'].iloc[0], truncation=True, max_length=512, pad_to_max_length=True, add_special_tokens=True, return_tensors='tf')



In [10]:
token

{'input_ids': <tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[  101,  1271, 14498, 23673,  1289, 14498, 29817, 29824,  1300,
        29817, 23673, 29834, 29837,  1294, 29834, 25573, 29820,  1273,
        29826, 29836, 14498, 17149,  1270, 23673, 25573, 29816, 17149,
        19433,  1270, 23673, 29824, 14498, 17149, 15915, 29819, 19433,
         1270, 23673, 15394, 29836, 25573, 29815,  1294, 25573, 29816,
        29824,  1271, 29836, 23673, 29836,  1284, 14498, 29833, 14498,
         1288, 29823,  1270, 23673, 29825, 29817, 25573, 29815,  1300,
        29834, 29836, 23673,  1270, 15915,  1295, 29823, 25573, 14498,
        25573,  1288, 22192, 17149,  1270, 23673,  1288, 25573, 22192,
         1270, 15915, 14157,  1295, 29836, 14157, 23673,  1294, 23673,
        29820, 29826, 29836, 23673,  1270, 23673, 23673, 29834, 25573,
        29820,  1300, 29830, 15915, 29837,  1300, 29820, 29817, 25573,
        29819,  1270, 23673, 23673, 29834, 25573, 29820,  1288, 22192,
        17149, 

In [40]:
X_input_ids = np.zeros((len(train_data),512))
X_attention_mask = np.zeros((len(train_data),512))

X_input_ids_test = np.zeros((len(test_data),512))
X_attention_mask_test = np.zeros((len(test_data),512))

In [12]:
X_input_ids.shape

(6988, 512)

In [13]:
def generate_training_data(data,ids,masks,tokenizer):
    for i,text in tqdm(enumerate(data['text'])):
        token = tokenizer.encode_plus(text, truncation=True, max_length=512, pad_to_max_length=True, add_special_tokens=True, return_tensors='tf')
        ids[i,:] = token.input_ids
        masks[i,:]=  token.attention_mask
        
    return ids,masks

In [41]:
X_input_ids,X_attention_mask = generate_training_data(train_data,X_input_ids,X_attention_mask,tokenizer)

X_inputs_ids_test,X_attention_mask_test = generate_training_data(test_data,X_input_ids_test,X_attention_mask_test,tokenizer)

0it [00:00, ?it/s]



0it [00:00, ?it/s]

In [43]:
# adding the labels w.r.t how many classes i have
labels=np.zeros((len(train_data),3))
labels.shape

labels_test=np.zeros((len(test_data),3))
labels_test.shape

(1000, 3)

In [44]:
# [nuetral , pos , neg]
labels[np.arange(len(train_data)),train_data['stance'].values]=1
labels_test[np.arange(len(test_data)),test_data['stance'].values]=1

In [45]:
labels
labels_test

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [27]:
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attention_mask, labels))

In [28]:
def SentimentDatasetMapFunction(input_ids, attention_masks, labels):
    return {'input_ids': input_ids, 'attention_mask': attention_masks}, labels

In [29]:
dataset=dataset.map(SentimentDatasetMapFunction)

In [30]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.float64, name=None)}, TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [31]:
dataset = dataset.shuffle(1000).batch(32,drop_remainder=True)

In [32]:
from transformers import TFBertModel

In [33]:
#downloadin the bert model
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [34]:
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='input_ids')
attention_masks = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='attention_mask')

bert_embeds = bert_model.bert(input_ids, attention_mask=attention_masks)[1]
intermediate_layer = tf.keras.layers.Dense(512, activation='relu',name='intermediate_layer')(bert_embeds)
output_layer = tf.keras.layers.Dense(3, activation='softmax',name='output_layer')(intermediate_layer)

model=tf.keras.Model(inputs=[input_ids,attention_masks],outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [36]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5,decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [37]:
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
hist = model.fit(dataset,validation_data=, epochs=2)