In [None]:
import pandas as pd

# A separator string that is not expected to occur in the files, so that read_csv
# (python engine, separator longer than one character) returns a single column
# holding each line in full.
WHOLE_LINE_SEP = 'delimiter'


def _read_lines_without_urls(path, column):
    """Read a text file as a one-column DataFrame and drop the URL lines.

    Args:
        path: File to read; each parsed line becomes one row.
        column: Name given to the single resulting column.

    Returns:
        A DataFrame with one column named ``column`` and a fresh 0..n-1 index,
        without the rows whose text starts with "http".
    """
    frame = pd.read_csv(path, sep=WHOLE_LINE_SEP, header=None, engine='python')
    frame.columns = [column]
    # na=True treats missing values like URL lines, so they are dropped as well
    # (the previous `== False` comparison discarded them too).
    is_url = frame[column].str.startswith("http", na=True)
    return frame.loc[~is_url].reset_index(drop=True)


df1 = _read_lines_without_urls('reviews2.txt', 'review')
df2 = _read_lines_without_urls('stars.txt', 'stars')

# The two files are filtered independently and then paired purely by row position,
# so a length mismatch means reviews and ratings no longer line up.
if len(df1) != len(df2):
    raise ValueError(
        f"reviews2.txt yielded {len(df1)} rows but stars.txt yielded {len(df2)}; "
        "reviews and star ratings cannot be paired by position."
    )

# Both frames already carry their final column names and a 0..n-1 index.
df = pd.concat([df1, df2], axis=1)

df.head()

Unnamed: 0,review,stars
0,They are great for the price and fit perfectly...,4.0
1,Wife loves these for plugging into Roku remote...,4.0
2,"Excellent quality, sound, and yet, they are no...",4.0
3,A very short review: These are wonderful headp...,4.0
4,Buenos!!!,4.0


In [None]:
! pip install transformers datasets

In [None]:
! pip install transformers datasets evaluate

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "distilbert-base-uncased" checkpoint; the same
# checkpoint name is used further down when the classification model is created.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Guard on the raw column name: the frame is updated under the same name `df`,
# so without this check re-running the cell would subtract 1 from the labels again.
if 'stars' in df.columns:
    df = df.rename(columns={'review': 'text', 'stars': 'label'})
    df['text'] = df['text'].astype(str)
    # Parse as float first, then convert to int, then shift down by one so the
    # labels become 0-based class ids (presumably ratings are 1-5 stars — verify).
    df['label'] = df['label'].astype(float).astype(int) - 1

# id2label and the model further down define exactly five classes (ids 0-4);
# fail here rather than in the middle of training if a label falls outside them.
unexpected_labels = sorted(df.loc[~df['label'].between(0, 4), 'label'].unique())
if unexpected_labels:
    raise ValueError(f"Labels must be in the range 0..4, found: {unexpected_labels}")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1465 non-null   object
 1   label   1465 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.0+ KB


In [None]:
import datasets

dataset = datasets.Dataset.from_pandas(df)
# Hold out 30% of the rows as the "test" split; the fixed seed keeps the split
# (and therefore the reported metrics) reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Mapping indexed by column name; only ``examples['text']`` is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

# Apply preprocess_function to the train/test splits; batched=True hands it
# several rows per call instead of one row at a time.
tokenized_review = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Batch collator built around the tokenizer and configured to return TensorFlow
# tensors ("tf"); it is passed as collate_fn when the tf.data sets are prepared.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute() on it.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The index of the largest value
            along axis 1 of ``predictions`` is taken as the predicted class.

    Returns:
        The result of ``accuracy.compute`` for predicted vs. reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Class id -> sentiment name, listed in class-id order (0 through 4).
id2label = dict(enumerate([
    "NEGATIVE",
    "NEGATIVE NEUTRAL",
    "NEUTRAL",
    "POSITIVE NEUTRAL",
    "POSITIVE",
]))
# Reverse lookup (sentiment name -> class id), derived from id2label.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
num_train_examples = len(tokenized_review['train'])
# Ceiling division: the training set is prepared with shuffle=False, which keeps
# the final partial batch (the fit log shows 65 steps for 1025 rows, not 64).
# Floor division would end the learning-rate schedule a few steps too early.
batches_per_epoch = (num_train_examples + batch_size - 1) // batch_size
total_train_steps = batches_per_epoch * num_epochs
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
from transformers import TFAutoModelForSequenceClassification

# num_labels is derived from id2label so the size of the classification head
# cannot drift from the label mapping it is configured with.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Use the shared batch_size so the batches here match the value the optimizer's
# step count was computed from.
# NOTE(review): shuffle=False means every epoch sees the training batches in the
# same order — confirm this is intended.
tf_train_set = model.prepare_tf_dataset(
    tokenized_review['train'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Built from the held-out "test" split with the same shared batch_size as the
# training set, instead of repeating the literal value.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_review["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
import tensorflow as tf

# No `loss` is passed on purpose; the fit log still reports a loss, presumably the
# model's own default loss — TODO confirm against the Transformers documentation.
model.compile(optimizer=optimizer)  # No loss argument!

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback that evaluates compute_metrics on the validation set; it is
# handed to model.fit through the `callbacks` list.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
# Callbacks passed to model.fit; currently only the accuracy metric callback.
callbacks = [metric_callback]

In [None]:
# epochs comes from num_epochs, the same value the learning-rate schedule's total
# step count was computed from, so the two cannot get out of sync.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

Epoch 1/5
 6/65 [=>............................] - ETA: 53s - loss: 1.1864



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f19bf528bb0>