In [None]:
import gc
import itertools
import re
import shutil
from glob import glob

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

from datasets import Dataset
from datasets import DatasetDict
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

torch.manual_seed(73)

# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def cleanup():
    """Run the Python garbage collector, then ask PyTorch to empty its CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()

### подгрузка предобученной модели и токенайзера

In [None]:
# Local directory with the pretrained checkpoint; local_files_only=True means
# nothing is fetched from the Hugging Face hub.
hugging_local_path = '/home/datalab/nfs/pretrained_models/rubert-tiny2_'

# NOTE(review): the class count is read from `aisc_dataset`, while the cells
# below work with `df` — confirm both refer to the same data.
model = AutoModelForSequenceClassification.from_pretrained(
    hugging_local_path,
    local_files_only=True,
    num_labels=aisc_dataset.target.nunique(),
)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(hugging_local_path, local_files_only=True)

### формирование выборок train/valid/test

In [None]:
# Build a Hugging Face Dataset from the raw text and its class label.
# preserve_index=False keeps a non-default pandas index from being stored as an
# extra `__index_level_0__` column next to the real features.
# NOTE(review): this cell reads the `label` column while other cells read
# `target` — confirm which name `df` actually carries.
data = Dataset.from_pandas(df[['input', 'label']], preserve_index=False)

# First split: 80% train / 20% hold-out. The hold-out is then halved into the
# validation and test splits (10% of the data each). Fixed seeds keep the
# splits reproducible.
data = data.train_test_split(test_size=0.2, shuffle=True, seed=1)
data_test = data['test'].train_test_split(test_size=0.5, shuffle=True, seed=1)
data['valid'] = data_test['train']
data['test'] = data_test['test']
data

### создание DataLoader

##### способ #1

In [None]:
# DataCollatorWithPadding only pads features that are already tokenized, so the
# raw text has to be tokenized before it reaches the DataLoader. Padding itself
# is left to the collator, which pads each batch to its own longest sequence.
# Every column except `label` is dropped so that only model inputs are collated;
# the collator renames `label` to `labels` when it assembles the batch.
data_tokenized = data.map(
    lambda rows: tokenizer(rows['input'], truncation=True),
    batched=True,
    remove_columns=[col for col in data['train'].column_names if col != 'label'],
)

collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch_size = 32
train_dataloader = DataLoader(data_tokenized['train'], batch_size=batch_size, shuffle=True, collate_fn=collator)
valid_dataloader = DataLoader(data_tokenized['valid'], batch_size=batch_size, shuffle=True, collate_fn=collator)
test_dataloader = DataLoader(data_tokenized['test'], batch_size=batch_size, collate_fn=collator)
cleanup()

##### способ #2

In [None]:
def my_collator(batch):
    """Tokenize a list of raw dataset rows into one padded model batch.

    Args:
        batch: list of row dicts as produced by the DataLoader; each row holds
            the text under 'input' and the class id under 'label' (rows that
            use the name 'target' instead are accepted as well).

    Returns:
        The tokenizer output (padded tensors, return_tensors='pt') with the
        class ids added under 'labels'.
    """
    texts = [row['input'] for row in batch]
    # The dataset built from df[['input', 'label']] stores the class id under
    # 'label'; 'target' is kept as a fallback for rows using that name.
    label_key = 'label' if 'label' in batch[0] else 'target'
    # The tokenizer needs a plain list of strings, not a pandas Series.
    tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Class indices must be int64: the classification loss rejects int32 targets.
    tokenized_texts['labels'] = torch.tensor([row[label_key] for row in batch], dtype=torch.long)
    return tokenized_texts


batch_size = 32
# Each split gets its own loader. Binding all three to one name would leave
# only the last (test) loader alive and the training loop without its
# train/valid loaders. Only the training split is shuffled.
train_dataloader = DataLoader(data['train'], batch_size=batch_size, shuffle=True, collate_fn=my_collator)
valid_dataloader = DataLoader(data['valid'], batch_size=batch_size, collate_fn=my_collator)
test_dataloader = DataLoader(data['test'], batch_size=batch_size, collate_fn=my_collator)
# Legacy alias: this cell used to leave `dataloader` bound to the test loader.
dataloader = test_dataloader
cleanup()

##### способ #3

In [None]:
# torch.utils.data.Dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a single text each time an item is read.

    Args:
        lag_data: sequence of input texts.
        ftr_data: stored on the instance but not read by this class.
        labels: sequence of integer class ids aligned with ``lag_data``.
        tokenizer: callable that turns one text into a mapping with
            'input_ids' and 'attention_mask' tensors.

    The three-argument form ``CustomDataset(texts, labels, tokenizer)`` is
    accepted too: when ``tokenizer`` is omitted, the second and third
    positional arguments are read as labels and tokenizer.

    Raises:
        ValueError: if the number of texts and labels differ.
    """

    def __init__(self, lag_data, ftr_data, labels=None, tokenizer=None):
        if tokenizer is None:
            # Called as (texts, labels, tokenizer): shift the arguments into place.
            ftr_data, labels, tokenizer = None, ftr_data, labels
        if len(lag_data) != len(labels):
            raise ValueError(
                f'texts and labels differ in length: {len(lag_data)} vs {len(labels)}'
            )
        self.lag_data = lag_data
        self.ftr_data = ftr_data
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of texts."""
        return len(self.lag_data)

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` with its label.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors and a scalar
            int64 'labels' tensor.
        """
        # No padding here: a single text has nothing to be padded against, so
        # padding is left to whatever collates items into a batch.
        encoded = self.tokenizer(self.lag_data[idx], truncation=True, return_tensors='pt')
        return {
            # The tokenizer returns shape (1, seq_len); drop the batch axis.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            # Build the int64 label directly instead of going through float32.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
    
# NOTE(review): df_train / df_valid / df_test are not created in the visible
# cells — confirm where they come from before running this cell.
# CustomDataset takes (lag_data, ftr_data, labels, tokenizer). There are no
# extra features here, so ftr_data is passed explicitly as None.
# A plain dict is enough to hold the three torch datasets.
data = {
    split_name: CustomDataset(split_df['input'].tolist(), None, split_df['target'].tolist(), tokenizer)
    for split_name, split_df in (('train', df_train), ('valid', df_valid), ('test', df_test))
}

# Items come out of the dataset unpadded; the collator pads each batch.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

batch_size = 32
train_dataloader = DataLoader(data['train'], batch_size=batch_size, shuffle=True, collate_fn=collator)
valid_dataloader = DataLoader(data['valid'], batch_size=batch_size, shuffle=True, collate_fn=collator)
test_dataloader = DataLoader(data['test'], batch_size=batch_size, collate_fn=collator)

### обучение модели

In [None]:
# ----------------------------------------
# Training config
# ----------------------------------------
max_epochs = 10
early_stopping_count = 5  # number of most recent epochs checked by early stopping
window = 100              # number of latest batches averaged for the progress-bar loss
model_save_path = '/home/datalab/nfs/data/for_temp_files'
model_save_name = 'sgm_type'

# ----------------------------------------
# Optimizer and learning-rate schedule
# ----------------------------------------
optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per batch, so its horizon is batches x epochs;
# the first 10% of those steps are warmup.
num_training_steps = len(train_dataloader) * max_epochs
num_warmup_steps = int(num_training_steps * 0.10)
lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
    num_cycles=3,
)

lr_list = []          # learning rate recorded after every optimizer step
val_losses = []       # mean validation loss, one entry per finished epoch
val_best_loss = 9e9   # lowest mean validation loss seen so far
for epoch in tqdm(range(max_epochs)):
    # ---------------- Training ----------------
    model.train()
    train_loss = 0
    train_losses = []
    tr_tq = tqdm(train_dataloader)
    for batch in tr_tq:
        outputs = model(**batch.to(model.device))
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        step_loss = loss.item()
        lr_list.append(optimizer.param_groups[0]['lr'])
        train_loss += step_loss
        train_losses.append(step_loss)
        # The bar shows the mean loss over the last `window` batches of this epoch.
        tr_tq.set_description(f'wind loss: {np.mean(train_losses[-window:]):2.2f}')
    avg_train_loss = train_loss / len(train_dataloader)
    cleanup()

    # ---------------- Validation ----------------
    model.eval()
    val_loss = 0
    vl_tq = tqdm(valid_dataloader)
    vl_tq.set_description('valid scoring...')
    with torch.no_grad():
        for batch in vl_tq:
            outputs = model(**batch.to(model.device))
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(valid_dataloader)
    val_losses.append(avg_val_loss)
    val_best_loss = min(val_best_loss, avg_val_loss)
    cleanup()

    print(f'epoch {epoch+1}. train loss: {avg_train_loss:4.2f}  val loss: {avg_val_loss:4.2f}')
    print('-'*74)
    # A checkpoint is written after every epoch, suffixed with the 1-based epoch number.
    model.save_pretrained(f'{model_save_path}/{model_save_name}_{epoch+1}')

    # ---------------- Early stopping ----------------
    # Stop once none of the last `early_stopping_count` epochs reached the best loss.
    if len(val_losses) > early_stopping_count and val_best_loss < min(val_losses[-early_stopping_count:]):
        print('\n!early stopping')
        break
            
# ----------------------------------------
# Best model
# ----------------------------------------
# Checkpoint names use 1-based epoch numbers, hence the +1. int() turns the
# numpy integer into a plain Python int.
best_model_index = int(np.argmin(val_losses)) + 1
print(f' best model - on {best_model_index}th epoch')

# ----------------------------------------
# Cleaning
# ----------------------------------------
cleanup()
for file_i in glob(f'{model_save_path}/{model_save_name}_*'):
    # Only paths that end in "_<digits>" are treated as epoch checkpoints.
    # Anything else the glob matches is left untouched instead of crashing the
    # loop or being deleted because of unrelated digits in its name.
    epoch_suffix = re.search(r'_(\d+)$', file_i)
    if epoch_suffix is None:
        continue
    if int(epoch_suffix.group(1)) != best_model_index:
        shutil.rmtree(file_i)

### тестирование

In [None]:
# Reload the checkpoint saved for the best epoch; only local files are read.
best_model_path = f'{model_save_path}/{model_save_name}_{best_model_index}'
model = AutoModelForSequenceClassification.from_pretrained(best_model_path, local_files_only=True)
model.to(device)
print('model loaded')

In [None]:
model.eval()
# Final scoring runs on the held-out test split, not on the validation split
# that was already used during training for model selection.
tst_tq = tqdm(test_dataloader)
tst_tq.set_description('test scoring...')
true_list = []  # flat list of true class ids
pred_list = []  # flat list of predicted class ids
with torch.no_grad():
    for batch in tst_tq:
        outputs = model(**batch.to(model.device))
        true_list.extend(batch['labels'].tolist())
        # softmax does not change the ordering of the logits, so the predicted
        # class is simply the argmax over the class dimension.
        pred_list.extend(outputs.logits.argmax(dim=1).tolist())

In [None]:
# Per-class precision / recall / f1 / support as a table, one row per report entry.
report = classification_report(true_list, pred_list, output_dict=True, zero_division=1)
metrics_df = pd.DataFrame(report).T

# The report is indexed by class id as a string; map those ids back to class
# names. Rows that are not class ids get no name.
rev_label_mapping = {str(class_id): class_name for class_name, class_id in label_mapping.items()}
metrics_df['model_class'] = metrics_df.index.map(rev_label_mapping)

# Drop the 'accuracy' row and put the class name first.
metrics_df = metrics_df.drop(index='accuracy')
metrics_df = metrics_df[['model_class', 'precision', 'recall', 'f1-score', 'support']]
metrics_df