In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
df = pd.read_csv("../data/raw/news/newsapiorg_labelled.csv", index_col=0)
df.head()

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.summary()

In [38]:
import sklearn
from sklearn.model_selection import train_test_split
train_inputs, valid_inputs, train_labels, valid_labels = train_test_split(df.title, df.label, test_size = 0.2)
train = pd.concat([train_inputs, train_labels], axis = 1)
test = pd.concat([valid_inputs, valid_labels], axis = 1)
train.head()

Unnamed: 0_level_0,title,label
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1
1062,Indian rupee slumps 24 paise to 74.79 against ...,-1
1001,Card and Board Games Market 2022-2027 | Top Pl...,0
736,The People Lawmakers Hate to Regulate: Themsel...,-1
587,Welcome to the home for real-time coverage of ...,0
885,$89M flows into Bitcoin funds despite looming ...,-1


In [41]:
def convert_data_to_examples(train, test, title, label): 
  train_InputExamples = train.apply(
    lambda x: InputExample(guid=None, text_a = x[title], text_b = None, label = x[label]), axis = 1)

  validation_InputExamples = test.apply(
    lambda x: InputExample(guid=None, text_a = x[title], text_b = None, label = x[label]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, 'title', 'label')

In [56]:
def convert_examples_to_tf_dataset(examples, tokenizer):
    features = []

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a, 
            add_special_tokens=True,
            max_length=512, 
            return_token_type_ids=True, 
            return_attention_mask=True, 
            padding=True,
            truncation=True
        )
        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(
            InputFeatures(
                input_ids=input_ids, 
                attention_mask=attention_mask, 
                token_type_ids=token_type_ids, 
                label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [63]:
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(100).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(100)

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)