In [12]:
import pandas as pd
import numpy as np
import torch
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import random
import os

In [13]:
# Check gpu
device = "mps" if torch.backends.mps.is_built() else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [14]:
# For reproducibility, you can define the following function to fix the random seeds.
def seed_everything(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(42)

In [35]:
# Read the data
yelp_review = pd.read_csv('./data/train.csv', header=0)
# make the stars 0, 1, 2
yelp_review['stars'] = yelp_review['stars'] - 1

test_data = pd.read_csv('./data/new_test.csv', header=0)
test_label = list(test_data['stars'])
final_test_data = pd.read_csv('./data/test.csv', header=0)

test_dict = final_test_data.set_index('ID').T.to_dict('list')

final_test_data['ID'] = 0

yelp_review.head()

Unnamed: 0,text,stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,0
1,Terrible food. Terrible service.\n\nThe absol...,0
2,"So, right away if I go into a buffet setting, ...",1
3,I have gotten good cuts from this place. I eve...,2
4,I felt this place was a bit lackluster conside...,1


In [16]:
# Tokenizer from a pretrained model
import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")


# Tokenize data
def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized


yelp_review = yelp_review.rename(columns={'stars': 'label'})
test_data = test_data.rename(columns={'stars': 'label'})
final_test_data = final_test_data.rename(columns={'ID': 'label'})

dataset = Dataset.from_pandas(yelp_review)
test_dataset = Dataset.from_pandas(test_data)
fina_test_dataset = Dataset.from_pandas(final_test_data)

eval_dataset = test_dataset.map(preprocess, batched=True, remove_columns=["text"])
train_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
fina_result_dataset = fina_test_dataset.map(preprocess, batched=True, remove_columns=["text"])

Map:   0%|          | 0/901 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1801 [00:00<?, ? examples/s]

In [17]:
num_labels = 3
class_names = [0, 1, 2]

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

from transformers import AutoModel

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", id2label=id2label)

model.to(device)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [18]:
training_args = TrainingArguments(
    evaluation_strategy='steps',
    save_strategy="no",
    learning_rate=5e-6,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    output_dir='./result_debert'
)


def get_trainer(model):
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator

    )

In [19]:
full_finetuning_trainer = get_trainer(
    model
)
init_acc = 0
iteration = 0

In [24]:
for i in range(5):
    full_finetuning_trainer.train()
    iteration += 1
    print(iteration)
    pred_output = full_finetuning_trainer.predict(eval_dataset)
    test_label = list(pred_output[1])
    predictions = np.argmax(pred_output[0], axis=1)
    match = 0
    for i in range(len(predictions)):
        if predictions[i] == test_label[i]:
            match += 1
    current_acc = match / len(predictions)
    print(current_acc)

    final_output = full_finetuning_trainer.predict(fina_result_dataset)

    final_predictions = np.argmax(final_output[0], axis=1)

    labels = range(1, len(final_predictions) + 1)
    predicted = []
    for j in final_predictions:
        predicted.append(j + 1)

    d = {'ID': list(labels), 'stars': predicted}
    df = pd.DataFrame(data=d)
    df_name = 'result_new' + str(iteration) + str(current_acc) + '.csv'
    df.to_csv(df_name, index=False)


Step,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 15.65 GB, other allocations: 2.32 GB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [39]:
while 1:
    predictions_max = list(np.max(final_output[0], axis=1))
    predictions_all = list(np.argmax(final_output[0], axis=1))

    sure = pd.DataFrame({'text': [], 'stars': []})

    # If we are over 95% confident that our prediction is correct, add it to the training set
    for i in range(len(predictions_all)):
        if predictions_max[i] > 0.95:
            sure.loc[len(sure)] = {'text': test_dict[i + 1][0], 'stars': int(predictions_all[i][0])}

    # merge
    adding_rows = sure[~sure['text'].isin(yelp_review['text'])]
    yelp_review = pd.concat([yelp_review, adding_rows], ignore_index=True)

    print(len(yelp_review))
    yelp_review.to_csv('current_train.csv', index=False)
    dataset = Dataset.from_pandas(yelp_review)
    train_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])

    full_finetuning_trainer = get_trainer(model)
    full_finetuning_trainer.train()
    iteration += 1
    print(iteration)
    pred_output = full_finetuning_trainer.predict(eval_dataset)
    test_label = list(pred_output[1])
    predictions = np.argmax(pred_output[0], axis=1)
    match = 0
    for i in range(len(predictions)):
        if predictions[i] == test_label[i]:
            match += 1
    current_acc = match / len(predictions)
    print(current_acc)

    final_output = full_finetuning_trainer.predict(fina_result_dataset)

    final_predictions = np.argmax(final_output[0], axis=1)

    labels = range(1, len(final_predictions) + 1)
    predicted = []
    for j in final_predictions:
        predicted.append(j + 1)

    d = {'ID': list(labels), 'stars': predicted}
    df = pd.DataFrame(data=d)
    df_name = 'result_new' + str(iteration) + str(current_acc) + '.csv'
    df.to_csv(df_name, index=False)


6000


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

RuntimeError: MPS backend out of memory (MPS allocated: 17.37 GB, other allocations: 641.30 MB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [38]:
sure

Unnamed: 0,text,stars


In [28]:
final_output[1]

array([0, 0, 0, ..., 0, 0, 0])