In [15]:
!pip install transformers datasets



In [16]:
!pip install accelerate -U



In [17]:
!pip install --upgrade torch torchvision



In [18]:
import numpy as np
import pandas as pd
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

In [37]:
# Training data, read from train.csv in the working directory.
train_data = pd.read_csv('train.csv')

In [38]:
# Test data to predict on, read from test.csv in the working directory.
test_data = pd.read_csv('test.csv')

In [39]:
# Hub checkpoint to start from (a multilingual mDeBERTa-v3 NLI model, going by
# its name).  The same name is reused later to load the classification model,
# so tokenizer and model stay in sync.
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [40]:
def tokenize_fn(data):
  """
  Tokenize premise/hypothesis pairs with the notebook-level `tokenizer`.

  `data` must provide 'premise' and 'hypothesis' entries; they are passed to
  the tokenizer as the first and second sequence, with truncation enabled.
  No padding is requested here.  Returns whatever the tokenizer returns.
  """
  premises = data['premise']
  hypotheses = data['hypothesis']
  encoded = tokenizer(premises, hypotheses, truncation=True)
  return encoded

In [41]:
# Number of rows in the training frame.
rows = len(train_data.index)

In [42]:
# Positional 80/20 split of the training frame: the leading 80% of rows is
# used for training and the remainder is held out for validation.
# NOTE(review): rows are not shuffled before splitting — confirm train.csv has
# no meaningful ordering.
rows = len(train_data)
n = int(0.8 * rows)
train_frame, val_frame = train_data.iloc[:n], train_data.iloc[n:]

train_set = Dataset.from_pandas(train_frame)
val_set = Dataset.from_pandas(val_frame)
test_set = Dataset.from_pandas(test_data)

# Bundle the three splits under the keys the later cells index by.
data = DatasetDict({
    'train': train_set,
    'validation': val_set,
    'test': test_set,
})

In [43]:
#train_set = Dataset.from_pandas(train_data.iloc[:num])
#validation_set = Dataset.from_pandas(train_data.iloc[num:])
#test_set = Dataset.from_pandas(test_data)

In [44]:
#Converting pandas DataFrames into datasets for the trainer
#data = DatasetDict()

#data['train'] = train_set
#data['validation'] = validation_set
#data['test'] = test_set

In [45]:
# Apply tokenize_fn to every split in `data`; batched=True hands the function
# batches of rows rather than one row at a time.
tokenized_set = data.map(tokenize_fn, batched=True)

Map:   0%|          | 0/9696 [00:00<?, ? examples/s]

Map:   0%|          | 0/2424 [00:00<?, ? examples/s]

Map:   0%|          | 0/5195 [00:00<?, ? examples/s]

In [46]:
# tokenize_fn does not pad, so padding is left to the collator when batches
# are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:
# Load the sequence-classification model from the same checkpoint name the
# tokenizer was loaded from.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [57]:
# Training configuration.
# metric_for_best_model only takes effect together with
# load_best_model_at_end, and that option requires checkpoints to be saved on
# the same schedule as evaluation — hence save_strategy="epoch" to match
# evaluation_strategy="epoch".  After training, the trainer then holds the
# epoch checkpoint with the highest validation accuracy.
# NOTE(review): '/arguments' is an absolute path at the filesystem root;
# confirm that is the intended checkpoint directory.
args= TrainingArguments('/arguments',
                        optim="adamw_torch",
                        num_train_epochs=3,
                        evaluation_strategy="epoch",
                        save_strategy="epoch",
                        load_best_model_at_end=True,
                        metric_for_best_model="accuracy")

In [58]:
#Basic compute metrics function with nothing special
from sklearn.metrics import accuracy_score
def compute_metrics(logits_and_labels):
  """
  Score one evaluation pass by plain accuracy.

  `logits_and_labels` unpacks into (logits, labels).  The predicted class of
  each row is the index of its largest logit along the last axis.  Returns a
  dict with the single key "accuracy".
  """
  scores, targets = logits_and_labels
  predicted = np.argmax(scores, axis=-1)
  accuracy = accuracy_score(targets, predicted)
  return {"accuracy": accuracy}

In [59]:
# Wire the model, training configuration, tokenized train/validation splits,
# padding collator and accuracy metric into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_set['train'],
    eval_dataset=tokenized_set['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [60]:
# Fine-tune the model; as the last expression, the returned TrainOutput is
# displayed under the per-epoch metrics table.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1765,1.154193,0.817244
2,0.109,1.249125,0.829208
3,0.0507,1.277668,0.830858


TrainOutput(global_step=3636, training_loss=0.11747893746798843, metrics={'train_runtime': 1296.6596, 'train_samples_per_second': 22.433, 'train_steps_per_second': 2.804, 'total_flos': 1330974870132960.0, 'train_loss': 0.11747893746798843, 'epoch': 3.0})

In [61]:
# Run the fine-tuned model over the tokenized test split and keep the raw
# per-class scores as a float array (one row per example).
test_output = trainer.predict(tokenized_set['test'])
preds = test_output.predictions.astype(float)
preds

array([[-3.47472978, -3.23624873,  6.86812115],
       [-3.30021763,  7.07238102, -3.07941151],
       [ 6.30308533, -2.31111717, -4.20808887],
       ...,
       [ 4.2628746 , -2.33582377, -2.14659953],
       [ 6.45154333, -2.96504378, -3.72112703],
       [-3.38621664, -3.23520041,  6.70183468]])

In [62]:
# Collapse the per-class scores to one predicted class id per row.
# The result is stored back into `preds`, so re-running this cell used to take
# the argmax of the already-reduced (n, 1) array and silently turn every
# prediction into 0.  Only reduce while `preds` still has one column per class.
if preds.ndim == 2 and preds.shape[1] > 1:
    preds = np.argmax(preds, axis=1, keepdims=True)

In [63]:
# Load the sample submission file; it is used as the template that the
# predictions are written into.
sample_submission_df = pd.read_csv('sample_submission.csv')

In [64]:
# Write the predicted class ids into the template's prediction column, then
# show the last rows as a quick sanity check.
# NOTE(review): the assignment is positional — it assumes sample_submission.csv
# lists its rows in the same order as test.csv; confirm.
sample_submission_df['prediction'] = preds
sample_submission_df.tail()

Unnamed: 0,id,prediction
5190,5f90dd59b0,0
5191,f357a04e86,2
5192,1f0ea92118,0
5193,0407b48afb,0
5194,16c2f2ab89,2


In [65]:
# Final safety pass on the prediction column: bound values to the class ids
# 0..2, round, and store them as plain integers.
prediction_col = sample_submission_df['prediction'].clip(0, 2)
sample_submission_df['prediction'] = prediction_col.round().astype(int)

In [66]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_submission_df.to_csv('submission.csv', index=False)

In [None]:
!kaggle competitions submit digit-recognizer -f submission.csv -m "Submission"