In [1]:
# pip install transformers
# pip install protobuf==4.21
import tensorflow as tf
import transformers
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from matplotlib import pyplot

In [2]:
# We are going to be training pretty large models. In order not to face errors, we need
# to set tensorflow option to grow GPU memory allocation when required.

physical_devices = tf.config.list_physical_devices('GPU') 
if len(physical_devices)>0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [3]:
# Loading datasets

df_train=pd.read_csv('train_news.csv')
df_test=pd.read_csv('test_news.csv')

In [4]:
# Labeling data (changing labels 1..4 for 0..3 for a Dense layer to be within the requested range)

df_train[['Class Index']]=df_train[['Class Index']].replace(to_replace=[1,2,3,4],value=[0,1,2,3])
df_test[['Class Index']]=df_test[['Class Index']].replace(to_replace=[1,2,3,4],value=[0,1,2,3])
# df_train[['Class Index']]=df_train[['Class Index']].replace(to_replace=[0,1,2,3],value=['World', 'Sports', 'Business', 'Sci/Tech'])
# df_test[['Class Index']]=df_test[['Class Index']].replace(to_replace=[0,1,2,3],value=['World', 'Sports', 'Business', 'Sci/Tech'])

In [5]:
# Checking length

print(f"Length of train dataset = {len(df_train)}")
print(f"Length of test dataset = {len(df_test)}")

Length of train dataset = 120000
Length of test dataset = 7600


In [6]:
df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   Title        120000 non-null  object
 2   Description  120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class Index  7600 non-null   int64 
 1   Title        7600 non-null   object
 2   Description  7600 non-null   object
dtypes: int64(1), object(2)
memory usage: 178.2+ KB


(None, None)

In [7]:
df_train.head()

Unnamed: 0,Class Index,Title,Description
0,2,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,2,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,2,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [8]:
df_test.head()

Unnamed: 0,Class Index,Title,Description
0,2,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,3,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,3,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,3,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,3,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [9]:
# Creating a combined column of Titles and Descriptions

df_train['Combined']=df_train['Title']+' '+df_train['Description']
df_test['Combined']=df_test['Title']+' '+df_test['Description']

In [10]:
# To load the model from Internet repository using model name. 
# Use this if you are running from your own copy of the notebooks
bert_model = 'bert-base-uncased' 

# To load the model from the directory on disk. Use this for Microsoft Learn module, because we have
# prepared all required files for you.
#bert_model = './bert'

tokenizer = transformers.BertTokenizer.from_pretrained(bert_model)

MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

In [11]:
tokenizer.encode("Let's check how it works")

[101, 2292, 1005, 1055, 4638, 2129, 2009, 2573, 102]

In [12]:
tokenizer(['Hello, world'],return_tensors='tf')

{'input_ids': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[ 101, 7592, 1010, 2088,  102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[1, 1, 1, 1, 1]])>}

In [13]:
# Checking an output format - it's a dictionary with k:v pairs

tokenizer(df_test['Combined'][0],padding='max_length',max_length=MAX_SEQ_LEN,truncation=True)

{'input_ids': [101, 10069, 2005, 1056, 1050, 11550, 2044, 7566, 9209, 5052, 3667, 2012, 6769, 2047, 8095, 2360, 2027, 2024, 1005, 9364, 1005, 2044, 7566, 2007, 16654, 6687, 3813, 2976, 9587, 24848, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [14]:
# When training the model, we need to provide tokenized sequence as input for train and test data

df_train['Tokens']=[
    tokenizer(x,
    padding='max_length',max_length=MAX_SEQ_LEN,truncation=True)['input_ids']
    for x in df_train['Combined']
]
df_train['Tokens']

0         [101, 2813, 2358, 1012, 6468, 15020, 2067, 204...
1         [101, 18431, 2571, 3504, 2646, 3293, 13395, 10...
2         [101, 3514, 1998, 4610, 6112, 15768, 1005, 176...
3         [101, 5712, 9190, 2015, 3514, 14338, 2013, 236...
4         [101, 3514, 7597, 2061, 2906, 2000, 2035, 1011...
                                ...                        
119995    [101, 4501, 1005, 1055, 14163, 7377, 11335, 25...
119996    [101, 9278, 11610, 6608, 1037, 2327, 1011, 111...
119997    [101, 7842, 8193, 2025, 2183, 2000, 13600, 266...
119998    [101, 2651, 1005, 1055, 5088, 2399, 6278, 2012...
119999    [101, 16996, 2131, 5708, 2013, 9680, 6591, 950...
Name: Tokens, Length: 120000, dtype: object

In [15]:
# When training the model, we need to provide tokenized sequence as input for train and test data

df_test['Tokens']=[
    tokenizer(x,
    padding='max_length',max_length=MAX_SEQ_LEN,truncation=True)['input_ids']
    for x in df_test['Combined']
]
df_test['Tokens']

0       [101, 10069, 2005, 1056, 1050, 11550, 2044, 75...
1       [101, 1996, 2679, 2003, 2006, 1024, 2117, 2797...
2       [101, 18712, 1012, 2194, 5222, 3946, 2000, 281...
3       [101, 17547, 3131, 7126, 19939, 3748, 26332, 1...
4       [101, 10250, 10128, 1012, 8704, 2000, 5787, 38...
                              ...                        
7595    [101, 2105, 1996, 2088, 5969, 4883, 4018, 1348...
7596    [101, 11675, 2003, 3561, 2007, 12223, 2007, 19...
7597    [101, 10337, 3727, 8618, 2066, 5074, 27277, 21...
7598    [101, 1019, 1997, 27641, 5022, 1999, 5264, 220...
7599    [101, 1041, 15907, 4152, 2046, 12635, 2015, 10...
Name: Tokens, Length: 7600, dtype: object

In [16]:
# Converting Tokens to tensors

x_test=[tf.convert_to_tensor(x, dtype=tf.int32) for x in df_test['Tokens']]
x_train=[tf.convert_to_tensor(x, dtype=tf.int32) for x in df_train['Tokens']]

In [17]:
# The model contains almost 110 million parameters! 

model = transformers.TFBertForSequenceClassification.from_pretrained(bert_model,num_labels=4,output_attentions=False)

model.summary()

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
Total params: 109485316 (417.65 MB)
Trainable params: 109485316 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Since we want simple classification task on relatively small dataset, 
# we do not want to train the BERT base layer

model.layers[0].trainable = False
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
Total params: 109485316 (417.65 MB)
Trainable params: 3076 (12.02 KB)
Non-trainable params: 109482240 (417.64 MB)
_________________________________________________________________


In [19]:
# Initiating the model 

model.compile('adam','sparse_categorical_crossentropy',['acc'])
tf.get_logger().setLevel('ERROR')

history=model.fit(tf.cast(x_train, tf.int32), df_train['Class Index'].values,
          validation_data=[tf.cast(x_test, tf.int32), df_test['Class Index'].values],
          batch_size=32,steps_per_epoch=32,validation_steps=2)



In [20]:
# The model is very large and needs more training to improve the results 
# (which are likely to be the best comparing to RNN and self-made Transformers I used in the previous cases)

history.history

{'loss': [8.8484468460083],
 'acc': [0.2421875],
 'val_loss': [11.94973373413086],
 'val_acc': [0.234375]}