In [None]:
!pip install nvidia-ml-py3 lime torchvision
!pip install wandb evaluate
!pip install wandb


In [None]:
from transformers import AutoTokenizer, MobileBertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import json
from torch.nn.functional import softmax
import numpy as np
from transformers import AdamW, Adafactor, get_linear_schedule_with_warmup
import random
import pandas as pd

import time
import datetime
import csv
import logging

from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import wandb

from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
)
from lime.lime_text import LimeTextExplainer
import torch.nn.functional as F
import torch
from pynvml import (
    nvmlInit,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetCount,
    nvmlDeviceGetName,
)


In [None]:
def print_gpu_utilization():
    """Print the name and used memory (in MB) of every NVML-visible device.

    NVML is initialised on each call, every device index reported by
    ``nvmlDeviceGetCount`` is queried, and finally
    ``torch.cuda.empty_cache()`` is invoked. Nothing is returned.
    """
    nvmlInit()
    for device_index in range(nvmlDeviceGetCount()):
        device = nvmlDeviceGetHandleByIndex(device_index)
        memory = nvmlDeviceGetMemoryInfo(device)
        print("Device", device_index, ":", nvmlDeviceGetName(device))
        # `used` is integer-divided by 1024**2 before being printed as MB.
        print(f"GPU memory occupied: {memory.used//1024**2} MB.")
    # The cache is emptied only after the figures above have been printed.
    torch.cuda.empty_cache()


def print_summary(result):
    """Print runtime and throughput from a training result, then GPU usage.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics
    runtime = metrics["train_runtime"]
    print(f"Time: {runtime:.2f}")
    throughput = metrics["train_samples_per_second"]
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()

# Report GPU memory use up front, before the model is loaded in a later cell.
print_gpu_utilization()

In [None]:
# Fixed seed so the train/test split (and therefore every reported metric)
# is the same after a kernel restart.
SEED = 42

dataset = load_dataset("csv", data_files="data/Modified_SQL_Dataset-trainer.csv", split="train")
print(dataset)

# Hold out 20% of the rows for evaluation; seeded for reproducibility.
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)
raw_train = dataset["train"]
raw_test = dataset["test"]

tokenizer = AutoTokenizer.from_pretrained('google/mobilebert-uncased')
batch_size = 32
# Number of full batches in the training split. NOTE(review): this value is
# not used by `default_args` below (which hard-codes 100) — confirm whether
# a later cell relies on it.
logging_steps = len(raw_train) // batch_size
lr = 1e-4

# Keyword arguments later expanded into `TrainingArguments(**default_args)`.
default_args = {
    "report_to": "none",  # disable WANDB
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "output_dir": "train_model/",
    "learning_rate": lr,
    "weight_decay": 0.01,
    "eval_steps": 100,
    "logging_steps": 100,
    "optim": "adamw_torch",
    "warmup_steps": 70,
    "save_strategy": "no",
    "do_predict": False,
}


def tokenize_batch(batch):
    """Tokenize the "Query" column of a batch, truncating to 512 tokens."""
    return tokenizer(batch["Query"], truncation=True, max_length=512)


# The raw "Query" text is dropped once tokenized; every other column is kept.
# NOTE(review): the label column is not visible here — confirm it is named
# the way the Trainer/collator expects.
train = raw_train.map(tokenize_batch, batched=True, remove_columns=["Query"])
test = raw_test.map(tokenize_batch, batched=True, remove_columns=["Query"])

In [None]:
from transformers.optimization import Adafactor, AdafactorSchedule
import os

# Collator that pads each batch with the tokenizer, using padding="longest".
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
# Same checkpoint name as the tokenizer loaded earlier. `num_labels` is not
# passed, so the library default applies (presumably 2 — TODO confirm it
# matches the dataset's label set).
model = MobileBertForSequenceClassification.from_pretrained('google/mobilebert-uncased')
# Build the training configuration from the `default_args` dict defined above.
training_args = TrainingArguments(**default_args)

# NOTE(review): both variables are set after the tokenizer has already been
# used and after TrainingArguments was constructed — confirm they still take
# effect as intended ("report_to": "none" already disables W&B reporting).
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    The previous implementation delegated to a global ``metric`` object that
    is never defined in this notebook, so the first evaluation step raised
    ``NameError``. Accuracy is now computed directly with numpy and returned
    as ``{"accuracy": float}``.

    Args:
        eval_pred: a pair ``(logits, labels)``. ``logits`` is array-like with
            the class scores on the last axis (or a tuple whose first element
            is that array); ``labels`` is array-like of integer class ids.

    Returns:
        dict: ``{"accuracy": fraction of predictions equal to the labels}``.
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    references = np.asarray(labels)
    return {"accuracy": float(np.mean(predictions == references))}



In [None]:
# Gather everything the Trainer needs: the model, its training arguments,
# the tokenized train/eval splits, the padding collator and the metric hook.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run training, then print runtime, throughput and GPU memory usage.
result = trainer.train()
print_summary(result)