In [3]:
import torch
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import classification_report
from transformers import TOKENIZER_MAPPING, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import os
from dataset import Dataset

In [5]:
TOKENIZER_NAME = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
MODEL_NAME = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
LEARNING_RATE = 3e-5

#OUTPUT_FILE = "NODUP-paraphrase-roberta-kan-pickle.md"

EPOCHS = 4
BATCH_SIZE = 24 
os.environ["CUDA_VISIBLE_DEVICES"]="1"

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    print(f'We will use the GPU:{torch.cuda.get_device_name()} ({device})')

else:
    print('NO GPU AVAILABLE ERROR')
    device = torch.device("cpu")
   
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
#model = AutoModelForSequenceClassification.from_pretrained("../../task_a/pickles_mal/", num_labels=5, output_attentions=True)
model_phobia = AutoModelForSequenceClassification.from_pretrained("../pickles_mixed_all/", num_labels=3, output_attentions=True) 
optimizer = AdamW(model.parameters(), lr = LEARNING_RATE, no_deprecation_warning=True)
model_phobia.to(device)
data = Dataset()
# eng_train, eng_val, tam_train, tam_val, mal_train, mal_val, eng_tam_train, eng_tam_val
_, _, _, _, _, _, _, datatrain = data.get_phobia_dataset(tokenizer, balance=False)
#_,_, kan_train_2022, _, _,_ = data.get_fire_2022_dataset(tokenizer, balance=False)

train_dataloader = DataLoader(
            datatrain,
            sampler = RandomSampler(datatrain),
            batch_size = BATCH_SIZE)

total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


There are 1 GPU(s) available.
We will use the GPU:Tesla V100-SXM2-32GB (cuda)


In [None]:
def inference_validation(model, tokenizer, device, output_file, dataset, BS=16):
    _, eng_val, _, tam_val, _, mal_val, _, eng_tam_val = data.get_phobia_dataset(tokenizer, balance=False)

    if dataset == 'tam':
        loader = DataLoader(tam_val, sampler = SequentialSampler(tam_val), batch_size=BS)
    elif dataset == 'eng':
        loader = DataLoader(eng_val, sampler = SequentialSampler(eng_val), batch_size=BS) 
    elif dataset == 'mal':
        loader = DataLoader(mal_val, sampler = SequentialSampler(mal_val), batch_size=BS) 
    elif dataset == 'eng_tam':
        loader = DataLoader(eng_tam_val, sampler = SequentialSampler(eng_tam_val), batch_size=BS) 

    print(f"{dataset} validation: {len(loader) * BS}")
    
    vbar = tqdm(enumerate(loader), total=len(loader), desc= dataset + " validation")

    model.eval()
    
    true_labels = []
    pred_labels = []
    #total_eval_loss = 0
    
    # Label names: Index(['Mixed_feelings', 'Negative', 'Positive', 'not-Tamil', 'unknown_state']
    # Label names: Index(['Homophobic', 'Non-anti-LGBT+ content', 'Transphobic']
    
    for step, batch in vbar:
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad(): 
            outputs_two = model_phobia(input_ids=b_input_ids, attention_mask=b_masks,
                                            labels=b_labels)
            
            #total_eval_loss += outputs.loss.item()
            logits_two = outputs_two.logits.detach().cpu().numpy().tolist() 
            label_ids = b_labels.to('cpu').numpy().tolist()

            true_labels.extend(label_ids)

            #print(f"Predicted: {pred_labels[-1]}, {np.argmax(i)}")
            #print(f"{pred_labels[-24:]},{np.argmax(logits_two, axis=1)}")
            #pred_labels.extend(np.argmax(logits,axis=1))
        
        #f = open("../outputs/sentiment", 'a')
        #f.write(str(pred_labels))
    print(classification_report(pred_labels, true_labels))
    
    model.train()

In [None]:
OUTPUT_FILE = "BIAOJSDOIJASD"
inference_validation(model=model_phobia, tokenizer=tokenizer, device=device, output_file=OUTPUT_FILE, BS=BATCH_SIZE, dataset='mal')

NameError: name 'data' is not defined