Reference:

https://www.kaggle.com/code/stefancomanita/sentiment-analysis-with-hugging-face-transformers

https://huggingface.co/blog/sentiment-analysis-python

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonreviews/test.ft.txt.bz2
/kaggle/input/amazonreviews/train.ft.txt.bz2


## Explore data

In [2]:
import bz2
import pandas as pd

In [3]:
# Upper bound on how many lines are read from each compressed review file
NUMBER_OF_LINES = 2_000

# Parsed training rows, keyed by line number (filled by the loading loop that follows)
data = dict()

In [4]:
# Read the first NUMBER_OF_LINES reviews of the training split.
# The file is opened in a `with` block so the bz2 handle is closed even if parsing fails.
with bz2.open("/kaggle/input/amazonreviews/train.ft.txt.bz2", "rt", encoding="utf8") as train_file:
    for i, line in enumerate(train_file):

        if i == NUMBER_OF_LINES:
            break

        # label 1 is negative (mapped to 0) and label 2 is positive (mapped to 1)
        label = 0 if line[:10] == "__label__1" else 1
        # Everything after the 10-character label prefix is the review; strip the
        # separator space and the trailing newline that would otherwise stay in the text
        text = line[10:].strip()

        data[i] = {
            "label": label,
            "text": text
        }

# orient="index" builds one row per key while inferring a dtype per column, so
# "label" is an integer column (DataFrame(data).T would make every column object dtype)
df = pd.DataFrame.from_dict(data, orient="index")
df = df.reset_index().rename(columns={"index": "Id"})

In [5]:
# Parsed test rows, keyed by line number
test_data = {}

# Read the first NUMBER_OF_LINES reviews of the test split.
# The file is opened in a `with` block so the bz2 handle is closed even if parsing fails.
with bz2.open("/kaggle/input/amazonreviews/test.ft.txt.bz2", "rt", encoding="utf8") as test_file:
    for i, line in enumerate(test_file):

        if i == NUMBER_OF_LINES:
            break

        # label 1 is negative (mapped to 0) and label 2 is positive (mapped to 1)
        label = 0 if line[:10] == "__label__1" else 1
        # Everything after the 10-character label prefix is the review; strip the
        # separator space and the trailing newline that would otherwise stay in the text
        text = line[10:].strip()

        test_data[i] = {
            "label": label,
            "text": text
        }

# orient="index" builds one row per key while inferring a dtype per column, so
# "label" is an integer column (DataFrame(test_data).T would make every column object dtype)
test_df = pd.DataFrame.from_dict(test_data, orient="index")
test_df = test_df.reset_index().rename(columns={"index": "Id"})

In [6]:
# Preview the first training rows to sanity-check the parsed Id/label/text columns
df.head()

Unnamed: 0,Id,label,text
0,0,1,Stuning even for the non-gamer: This sound tr...
1,1,1,The best soundtrack ever to anything.: I'm re...
2,2,1,Amazing!: This soundtrack is my favorite musi...
3,3,1,Excellent Soundtrack: I truly like this sound...
4,4,1,"Remember, Pull Your Jaw Off The Floor After H..."


In [7]:
# Count rows per class (0 = negative, 1 = positive) to see how balanced the training sample is
df["label"].value_counts()

label
0    1035
1     965
Name: count, dtype: int64

In [8]:
# Preview the first test rows to sanity-check the parsed Id/label/text columns
test_df.head()

Unnamed: 0,Id,label,text
0,0,1,Great CD: My lovely Pat has one of the GREAT ...
1,1,1,One of the best game music soundtracks - for ...
2,2,0,Batteries died within a year ...: I bought th...
3,3,1,"works fine, but Maha Energy is better: Check ..."
4,4,1,Great for the non-audiophile: Reviewed quite ...


In [9]:
# Count rows per class (0 = negative, 1 = positive) to see how balanced the test sample is
test_df["label"].value_counts()

label
1    1046
0     954
Name: count, dtype: int64

## Process data

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

In [11]:
# Convert both pandas frames into Hugging Face `Dataset` objects in one pass
train_dataset, test_dataset = map(Dataset.from_pandas, (df, test_df))
(train_dataset, test_dataset)

(Dataset({
     features: ['Id', 'label', 'text'],
     num_rows: 2000
 }),
 Dataset({
     features: ['Id', 'label', 'text'],
     num_rows: 2000
 }))

In [12]:
# Checkpoint name passed to both the tokenizer and the classification model loaders
model_name = "distilbert-base-uncased"

In [13]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the checkpoint being fine-tuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer attribute that controls where padding goes is `padding_side`;
# assigning to any other name (e.g. `padding_size`) only creates an unused attribute.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the module-level `tokenizer`.

    `examples` may be a single row or a batch: it is only indexed with "text",
    and that value is handed to the tokenizer as-is. Inputs are truncated to at
    most 512 tokens; no padding is requested here, and `return_tensors=None`
    is passed through unchanged.

    Returns whatever the tokenizer returns for those inputs.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "return_tensors": None,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [15]:
# Try the tokenization on the first training row to inspect the produced fields
preprocess_function(train_dataset[0])

{'input_ids': [101, 24646, 5582, 2130, 2005, 1996, 2512, 1011, 27911, 1024, 2023, 2614, 2650, 2001, 3376, 999, 2009, 23262, 1996, 12411, 7301, 1999, 2115, 2568, 2061, 2092, 1045, 2052, 28667, 8462, 4859, 2009, 2130, 2000, 2111, 2040, 5223, 6819, 2094, 1012, 2208, 2189, 999, 1045, 2031, 2209, 1996, 2208, 10381, 4948, 2080, 2892, 2021, 2041, 1997, 2035, 1997, 1996, 2399, 1045, 2031, 2412, 2209, 2009, 2038, 1996, 2190, 2189, 999, 2009, 10457, 2185, 2013, 13587, 9019, 2075, 1998, 3138, 1037, 4840, 2121, 3357, 2007, 24665, 3686, 7334, 1998, 3969, 3993, 19505, 1012, 2009, 2052, 17894, 3087, 2040, 14977, 2000, 4952, 999, 1034, 1035, 1034, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
# Tokenize both splits in batches; the train split is processed first, then the test split
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
# Show the dataset summary to confirm that the tokenizer columns were added by the map step
tokenized_train

Dataset({
    features: ['Id', 'label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 2000
})

In [18]:
from transformers import DataCollatorWithPadding
# Padding is deferred to the collator (tokenization above does not pad), so it is
# applied when batches are assembled; tensors are returned in PyTorch ("pt") format.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

## Train model

In [19]:
from transformers import AutoModelForSequenceClassification
# Load the checkpoint with a fresh 2-class classification head.
# `max_length` is deliberately NOT passed here: on the model config it is treated as a
# text-generation setting (it triggers "Non-default generation parameters" warnings when
# training/saving) and has no effect on input length. Inputs are already truncated to
# 512 tokens by the tokenizer call in `preprocess_function`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [21]:
import numpy as np
import evaluate
 
# Metric objects already loaded in this session, keyed by metric name
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: a `(logits, labels)` pair; the predicted class of each row
            is the argmax over the last axis of `logits`.

    Returns:
        dict with the keys "accuracy" and "f1".

    The metric objects are fetched through `_get_metric`, so `evaluate.load`
    runs once per metric instead of on every evaluation step.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}


In [22]:
# Smoke test: both logit rows have their maximum at index 1, while the reference labels are [0, 1]
compute_metrics(([[0, 1], [0, 1]], [0, 1]))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'accuracy': 0.5, 'f1': 0.6666666666666666}

In [23]:
from transformers import TrainingArguments, Trainer

# Output location for the run (the TensorBoard cell reads from "<output_dir>/runs")
output_dir = "training"

training_args = TrainingArguments(
    # --- run identity / output ---
    output_dir=output_dir,
    run_name="distilbert-run",
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    # --- evaluation, logging and checkpoint cadence ---
    eval_strategy="steps",
    eval_steps=25,
    logging_steps=10,
    report_to="tensorboard",
    save_strategy="epoch",
)

# The test split is used as the evaluation set during training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)


In [24]:
# Run the fine-tuning loop configured above (2 epochs, evaluation every 25 steps)
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
25,0.6649,0.609784,0.843,0.838311
50,0.3315,0.322877,0.8875,0.898144
75,0.2739,0.272184,0.896,0.896723
100,0.2138,0.254277,0.9045,0.910455
125,0.1811,0.319605,0.8915,0.888775
150,0.1269,0.274503,0.907,0.912347
175,0.1792,0.27373,0.9105,0.91464
200,0.1967,0.272775,0.9135,0.916787
225,0.0819,0.275185,0.917,0.920192
250,0.2076,0.275971,0.9145,0.918064


Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}


TrainOutput(global_step=250, training_loss=0.24887674427032472, metrics={'train_runtime': 140.2842, 'train_samples_per_second': 28.514, 'train_steps_per_second': 1.782, 'total_flos': 220193933415936.0, 'train_loss': 0.24887674427032472, 'epoch': 2.0})

In [25]:
# Evaluate on the Trainer's eval_dataset (the tokenized test split)
trainer.evaluate()

{'eval_loss': 0.2759708762168884,
 'eval_accuracy': 0.9145,
 'eval_f1': 0.9180642069956876,
 'eval_runtime': 9.4266,
 'eval_samples_per_second': 212.166,
 'eval_steps_per_second': 13.26,
 'epoch': 2.0}

In [26]:
new_model = "finetuned-distillbert"
# Save the tokenizer next to the weights so the directory can be reloaded on its own
# (e.g. by `pipeline(model=new_model)`); the model alone would leave it without tokenizer files.
# The tokenizer is saved first so the cell still ends with a call that returns nothing to display.
tokenizer.save_pretrained(new_model)
trainer.model.save_pretrained(new_model)

Non-default generation parameters: {'max_length': 512}


In [27]:
# Tested in Google Colab
# Unfortunately, Kaggle is not able to show Tensorboard.

from tensorboard import notebook
# Point TensorBoard at the "runs" folder inside the training output directory
log_dir = "/".join([output_dir, "runs"])
notebook.start("--logdir {} --port 4000".format(log_dir))

<IPython.core.display.Javascript object>

## Inference

In [28]:
from transformers import pipeline
 
# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline
sentiment_model = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)
# Classify two sample sentences; the cell displays the pipeline's result
sample_reviews = ["I love this move", "This movie sucks!"]
sentiment_model(sample_reviews)

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9845169186592102},
 {'label': 'LABEL_0', 'score': 0.9769054651260376}]