In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset

<br>
<br>
<br>

### Data Collection

In [3]:
# loading sample csv data
data = pd.read_csv("../data/data.csv").head(10)
data.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [4]:
# checking shape
data.shape

(10, 2)

<br>
<br>
<br>

### Data Preparation

In [6]:
# seperating reviews
X = data.review
X.head(3)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
Name: review, dtype: object

In [7]:
# seperating target
y = data.sentiment
y.head(3)

0    positive
1    positive
2    positive
Name: sentiment, dtype: object

In [8]:
# remove html tags from the text

cleaner_regex = re.compile('<.*?>')

def remove_tags(text):
    cleantext = re.sub(cleaner_regex, '', text)
    return cleantext

In [9]:
# removing tags 
x_cleaned = X.apply(lambda x: remove_tags(x))
x_cleaned.head(3)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
Name: review, dtype: object

In [10]:
# binary encoding target
y_cleaned = y.apply(lambda x: 1 if x == 'positive' else 0)
y_cleaned.head(3)

0    1
1    1
2    1
Name: sentiment, dtype: int64

In [11]:
# checking shape
print(x_cleaned.shape)
print(y_cleaned.shape)

(10,)
(10,)


<br>
<br>
<br>

### Fine Tunning

In [12]:
# hugging face model
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

In [13]:
# creating tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# loading model
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2, ignore_mismatched_sizes=True)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<br>

In [14]:
# creating huggingface dataset
dataset = Dataset.from_dict(
    {
        'review': list(x_cleaned),
        'labels': list(y_cleaned),
    }
)

dataset

Dataset({
    features: ['review', 'labels'],
    num_rows: 10
})

In [15]:
# return tokens from review
def tokenize_function(example):
    return tokenizer(example["review"], truncation=True, max_length=512)

In [16]:
# tokenizing reviews and creating new dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [17]:
# dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [18]:
# creating tensorflow dataset
tf_train_dataset = tokenized_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
# inspecting
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [20]:
# compiling the model
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 124055040 
 )                                                               
                                                                 
 classifier (TFRobertaClassi  multiple                 592130    
 ficationHead)                                                   
                                                                 
Total params: 124,647,170
Trainable params: 124,647,170
Non-trainable params: 0
_________________________________________________________________


In [21]:
# traning the model
model.fit(
    tf_train_dataset,
    epochs=1
)



<keras.callbacks.History at 0x1e24bc802e0>

In [22]:
# making prediction

text = "Awesome movie!"
inputs = tokenizer(text, padding = True, truncation = True, max_length=512, return_tensors='tf')
outputs = model(**inputs)
predictions = tf.math.softmax(outputs.logits, axis=-1)
predictions

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[6.8588997e-04, 9.9931407e-01]], dtype=float32)>

In [24]:
predictions.numpy().round(2)
# [negative, positive]

array([[0., 1.]], dtype=float32)

In [27]:
# saving the model
model.save("../models/custom_model")



INFO:tensorflow:Assets written to: custom_model\assets


INFO:tensorflow:Assets written to: custom_model\assets
