In [5]:
# ! pip install transformers datasets

In [6]:
# pip install transformers datasets evaluate

In [7]:
# pip install accelerate -U

In [9]:
# !apt install git-lfs

In [1]:
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# Show the installed transformers version so the environment this notebook ran in is recorded in its output.
print(transformers.__version__)

4.41.2


In [2]:
from huggingface_hub import notebook_login

# Ask for Hugging Face Hub credentials from inside the notebook. Later cells set
# push_to_hub=True and call trainer.push_to_hub(), which presumably rely on this login.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, Dataset
from datasets import DatasetDict
import pandas as pd

# Load only the review text column.
# - Rows with missing text are dropped: pandas reads empty cells as NaN, and a
#   null entry cannot be encoded by the tokenizer in the later tokenization cell.
# - The index is reset afterwards so that Dataset.from_pandas sees a plain
#   0..n-1 index and does not store a leftover pandas index as an extra column.
df = (
    pd.read_csv("steam_reviews.csv", usecols=['text'])
    .dropna(subset=['text'])
    .reset_index(drop=True)
)
print("First few rows of the dataset:")
print(df.head())

First few rows of the dataset:
                                                text
0  Amazing game, beautiful scenery and content is...
1  This game is great. the design of the areas, e...
2  The only reason I wouldn´t recomend this game ...
3  I'm sorry for leaving a negative review, but I...
4  My complaints have nothing to do with the open...


In [4]:
# Wrap the pandas frame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Two-stage split with a fixed seed: hold out 10% of the rows first, then cut
# that held-out part in half to obtain the validation and test sets.
train_testvalid = dataset.train_test_split(test_size=0.1, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset, valid_dataset, test_dataset = (
    train_testvalid['train'],
    test_valid['train'],
    test_valid['test'],
)

# Bundle the three splits under the keys the rest of the notebook indexes by.
datasets = DatasetDict({
    "train": train_dataset,
    "validation": valid_dataset,
    "test": test_dataset,
})

# Report the split sizes as a sanity check on the proportions.
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Show one training row to confirm the expected structure.
print("Example data entry from training dataset:")
print(datasets["train"][10])

Training dataset size: 227700
Validation dataset size: 12650
Test dataset size: 12651
Example data entry from training dataset:
{'text': 'Once From Software fix the graphic performance of the game, this is probably one of the greathest games of all times.'}


In [5]:
model_checkpoint = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    No truncation or padding options are passed, so every encoding keeps the
    full length of its review; fixed-size chunking is done in a later cell.
    """
    return tokenizer(examples["text"])


# Tokenize every split in batches across 28 worker processes. The raw "text"
# column is removed so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=28,
    remove_columns=["text"],
)

Map (num_proc=28):   0%|          | 0/227700 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (630 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1044 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Map (num_proc=28):   0%|          | 0/12650 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (585 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=28):   0%|          | 0/12651 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5016 > 512). Running this sequence through the model will result in indexing errors


In [6]:
# Chunk length used by group_texts: the tokenizer's reported maximum sequence length
# (the tokenizer warnings in this notebook's output show 512 for this checkpoint).
block_size = tokenizer.model_max_length
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists, as
            passed by a batched ``map`` call. Must contain "input_ids".

    Returns:
        A dict with the same columns, each holding a list of chunks of exactly
        ``block_size`` items (``block_size`` is read from the notebook's
        globals), plus a "labels" column that is a shallow copy of the
        "input_ids" chunk list. Trailing items that do not fill a whole block
        are dropped, so a batch shorter than ``block_size`` yields no rows.
    """
    # Imported locally so the function carries its own dependency with it.
    from itertools import chain

    # Flatten each column in linear time. The former `sum(lists, [])` rebuilt
    # the accumulated list once per example, which is quadratic in batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder. Padding could be added instead if the model
    # supported it; customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split every column into consecutive block_size-long chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup every split into block_size-long examples: group_texts receives
# 1000 tokenized rows per call, and the work is spread over 28 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=28)

Map (num_proc=28):   0%|          | 0/227700 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/12650 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/12651 [00:00<?, ? examples/s]

In [7]:
# Start from the pretrained masked-LM weights of the chosen checkpoint.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Last path component of the checkpoint id, used as the output directory prefix.
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-steam-reviews",
    # Schedule and optimisation.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same 800-step cadence, keeping at most 3 checkpoints.
    eval_strategy="steps",
    eval_steps=800,
    save_steps=800,
    save_total_limit=3,
    # Upload results to the Hugging Face Hub.
    push_to_hub=True,
)

In [8]:
# Collator for masked-language modelling: configured to mask tokens with
# probability 0.15 when a batch is assembled.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# The tokenizer is passed to the Trainer as well so that it is saved next to
# every checkpoint and uploaded with the model by push_to_hub. Without it the
# published repository holds model weights only and cannot be loaded end to
# end. The explicit data_collator above is still the one used for batching.
# NOTE(review): `tokenizer=` matches the transformers 4.41.2 printed above; newer
# releases rename this argument, so revisit on upgrade.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
import math

# Run fine-tuning; periodic evaluation and checkpointing follow the step
# settings given in training_args.
trainer.train()

# Final pass over the Trainer's eval dataset; perplexity is exp of the reported loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.6f}")

# Upload to the Hub. Kept as the last expression so the returned commit info is displayed.
trainer.push_to_hub()

Step,Training Loss,Validation Loss
800,1.7565,1.538735
1600,1.6034,1.504769
2400,1.5793,1.467056
3200,1.5377,1.438785
4000,1.5018,1.436252
4800,1.4893,1.416798
5600,1.4774,1.400247
6400,1.4705,1.390094
7200,1.4409,1.382143
8000,1.4352,1.382055


Perplexity: 3.511272


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xingchenc/roberta-large-finetuned-steam-reviews/commit/4cd85d916b76809f404faa807db45614f5309bab', commit_message='End of training', commit_description='', oid='4cd85d916b76809f404faa807db45614f5309bab', pr_url=None, pr_revision=None, pr_num=None)