In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import random
import os

# Progress bar
from tqdm.auto import tqdm

2024-03-16 15:37:00.230119: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-16 15:37:00.230222: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-16 15:37:00.368441: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
# Fix the random seeds so that repeated runs of the notebook are comparable.
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Calls, in this order, `np.random.seed`, `random.seed`, `torch.manual_seed`
    and `torch.cuda.manual_seed` with `seed`, then stores `str(seed)` in the
    `PYTHONHASHSEED` environment variable.

    Args:
        seed: Value handed unchanged to every seeding function.
    """
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(42)

In [4]:
# Training reviews: read the CSV, then subtract 1 from every star rating
# so the classes become 0, 1, 2.
yelp_review = pd.read_csv('/kaggle/input/dataset/data/train.csv', header=0)
yelp_review['stars'] = yelp_review['stars'].sub(1)

# Labelled test split; its star column is also kept as a plain Python list.
# NOTE(review): unlike the training frame, these stars are NOT shifted by one
# here. Confirm that new_test.csv already stores them as 0, 1, 2.
test_data = pd.read_csv('/kaggle/input/dataset/data/new_test.csv', header=0)
test_label = test_data['stars'].tolist()

# Final test split; its 'ID' column is set to the constant 0.
final_test_data = pd.read_csv('/kaggle/input/dataset/data/test.csv', header=0)
final_test_data['ID'] = 0

yelp_review.head()

Unnamed: 0,text,stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,0
1,Terrible food. Terrible service.\n\nThe absol...,0
2,"So, right away if I go into a buffet setting, ...",1
3,I have gotten good cuts from this place. I eve...,2
4,I felt this place was a bit lackluster conside...,1


In [5]:
# Tokenizer from a pretrained model
import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large")


# Tokenize data
def preprocess(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the raw review strings
            (this function is used with `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for those texts, truncated but not padded.
    """
    # No padding at this stage: with `padding=True` every row of a `map` batch
    # is stretched to the longest text in that batch and stored that way, so
    # each training step would process those pad tokens too. The notebook's
    # `DataCollatorWithPadding` pads each mini-batch when it is assembled.
    return tokenizer(examples['text'], truncation=True)


# Rename the target column to 'label' in the two labelled frames; the final
# test frame gets its 'ID' column renamed to 'label' instead.
yelp_review, test_data = (
    frame.rename(columns={'stars': 'label'}) for frame in (yelp_review, test_data)
)
final_test_data = final_test_data.rename(columns={'ID': 'label'})

# Wrap each DataFrame in a `datasets.Dataset`.
dataset, test_dataset, fina_test_dataset = map(
    Dataset.from_pandas, (yelp_review, test_data, final_test_data)
)


def _tokenize_split(split):
    """Run `preprocess` over `split` in batches and drop the raw 'text' column."""
    return split.map(preprocess, batched=True, remove_columns=["text"])


eval_dataset = _tokenize_split(test_dataset)
train_dataset = _tokenize_split(dataset)
fina_result_dataset = _tokenize_split(fina_test_dataset)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [6]:
num_labels = 3
class_names = [0, 1, 2]
# The label count and the class list must describe the same classification head.
assert num_labels == len(class_names), "num_labels must match class_names"

# Create an id2label mapping and its inverse.
# We will need these for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Pads every mini-batch to a common length and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

from transformers import AutoModel

# Pass the label count together with BOTH mappings. With only `id2label`,
# `num_labels` above was never used and the config's `label2id` was left at
# whatever the checkpoint config provided, which need not match `id2label`.
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-large",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model.to(device)

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
   

In [7]:
# Settings shared by every Trainer built in this notebook.
# NOTE(review): no `eval_steps` is given, so the evaluation interval is
# whatever the library defaults to for evaluation_strategy='steps'; confirm
# that interval suits the size of the training set.
training_args = TrainingArguments(
    output_dir='./result_debert',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    learning_rate=1e-6,
    evaluation_strategy='steps',
    save_strategy="no",
)


def get_trainer(model):
    """Build a `Trainer` for `model` from the notebook-level training setup.

    Args:
        model: The model instance handed to the `Trainer`.

    Returns:
        A `Trainer` configured with the global `training_args`,
        `train_dataset`, `eval_dataset` and `data_collator`.
    """
    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "data_collator": data_collator,
    }
    return Trainer(**trainer_kwargs)

In [8]:
# Trainer used by the training loop below, and a counter of completed
# `train()` calls that the loop increments.
full_finetuning_trainer = get_trainer(model)
iteration = 0

In [None]:
# First iteration at which the model is written to disk; every later
# iteration overwrites that saved copy.
SAVE_FROM_ITERATION = 10
# Optional cap on the number of train / evaluate / export rounds.
# None keeps the run-until-interrupted behaviour; set an integer so the cell
# finishes on its own (needed for an unattended "Restart & Run All").
MAX_ITERATIONS = None

# NOTE(review): every round calls `train()` again with the same
# `training_args`; confirm that restarting the training schedule each round
# is what is intended.
while MAX_ITERATIONS is None or iteration < MAX_ITERATIONS:
    full_finetuning_trainer.train()
    iteration += 1
    print(iteration)
    if iteration >= SAVE_FROM_ITERATION:
        full_finetuning_trainer.save_model("/kaggle/working/Deb_Large/")

    # Accuracy on the labelled split: arg-max over the class scores, compared
    # with the label ids returned alongside the predictions.
    pred_output = full_finetuning_trainer.predict(eval_dataset)
    test_label = list(pred_output[1])
    predictions = np.argmax(pred_output[0], axis=1)
    # Integer count / length keeps `current_acc` a plain Python float, so the
    # text embedded in the file name below is formatted exactly as before.
    current_acc = int(np.sum(predictions == np.asarray(test_label))) / len(predictions)
    print(current_acc)

    # Predictions for the final test split, exported with 1-based IDs and the
    # class ids shifted back up by one.
    final_output = full_finetuning_trainer.predict(fina_result_dataset)
    final_predictions = np.argmax(final_output[0], axis=1)

    df = pd.DataFrame({
        'ID': range(1, len(final_predictions) + 1),
        'stars': final_predictions + 1,
    })
    # File name = 'result_new' + iteration number + accuracy, with no separators.
    df_name = 'result_new' + str(iteration) + str(current_acc) + '.csv'
    df.to_csv(df_name, index=False)
