In [83]:
import numpy as np
import pandas as pd
import torch
import time
import datetime


def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second first, then formatted
    through ``datetime.timedelta`` (e.g. ``53.6`` -> ``'0:00:54'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


from transformers import AutoTokenizer, BertForSequenceClassification, \
    AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from scipy.special import softmax

In [84]:
# Tokenizer of the multilingual cased BERT checkpoint; the same checkpoint name
# is used when the classification model is loaded further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

Когда мы решаем задачу на двух предложениях, между ними нужно вставлять токен [SEP]. Токенизатор делает это сам, если передать ему оба текста:

In [85]:
# Passing two texts produces a sentence-pair input; padding="max_length" fills
# the remainder with [PAD] tokens (visible in the decoded output below).
encoding = tokenizer("This is some text", "this is another text", padding="max_length", truncation=True)
# Decode the ids back to text and show the first 100 characters to inspect the
# special tokens that were inserted.
tokenizer.decode(encoding["input_ids"])[:100]

'[CLS] This is some text [SEP] this is another text [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] ['

Ниже нужно прочитать настоящие размеченные данные:

In [86]:
# Load the labelled data, one tab-separated file per language.
markup_english, markup_spanish, markup_portuguese = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "new_results_english.tsv",
        "new_results_spanish.tsv",
        "results_portuguese.tsv",
    )
)

In [87]:
# Stack the three language-specific frames into one table. The row index is not
# reset here; that happens in the next cell via reset_index(drop=True).
markup = pd.concat([markup_english, markup_spanish, markup_portuguese])
# Quick sanity check of the combined size (rows, columns).
markup.shape

(3090, 8)

In [88]:
# Keep only the columns used downstream and give them short names.
markup = markup.rename(columns={
    "INPUT:channel_id": "channel_id",
    "INPUT:query": "query",
    "INPUT:title": "title",
    "INPUT:snippet": "snippet",
    "OUTPUT:result": "label"
})[["channel_id", "query", "title", "snippet", "label"]].reset_index(drop=True)

# Encode the textual labels as integers. Values that are already 0/1 pass
# through unchanged, so re-running this cell does not wipe the labels
# (a plain dict-based Series.map would turn them all into NaN on a second run).
label_to_id = {"yes": 1, "no": 0}
encoded_labels = markup["label"].map(lambda value: label_to_id.get(value, value))

# Fail loudly on anything that is neither yes/no nor 0/1 instead of silently
# carrying NaN labels into the training tensors.
unexpected = ~encoded_labels.isin([0, 1])
if unexpected.any():
    raise ValueError(
        "Unexpected label values: {}".format(
            sorted(map(str, encoded_labels[unexpected].unique()))
        )
    )
markup["label"] = encoded_labels.astype("int64")

markup

Unnamed: 0,channel_id,query,title,snippet,label
0,587,"Maestra, Papercup, Vitra.ai, Aloud.ai",Inflexor Ventures Announces Its Final Close At...,Inflexor has so far invested in four startups ...,1
1,556,Rimini Street,Curriculum Dr Babasaheb Ambedkar Marathwada Un...,Eligible and interested candidates may send an...,0
2,770,Jalasoft,Jalasoft busca personas para el cargo de Respo...,Jalasoft busca personas para el cargo de Respo...,1
3,1412,"Gettransfer, Gettransfer.com, GetExperience.co...",lifetouch promo code 2022,2008NMAP4STU Free Shipping on $10+ Orders - FA...,0
4,1484,"trtworld, TRTWorld, TRT World, trt world",OpIndia.com,"In fact, TRT World had launched a promotion of...",1
...,...,...,...,...,...
3085,615,Santander Brasil,Será que vale a pena investir no negócio de en...,"No ramo da energia solar, diversos bancos, com...",1
3086,520,Itaú Unibanco,Cubo for Devs tem inscrições até o dia 24 de O...,O Cubo também apoia as transformações digital ...,1
3087,619,Quintoandar,QuintoAndar Traz Martinho Da Vila Em Nova Camp...,QuintoAndar Traz Martinho Da Vila Em Nova Camp...,1
3088,763,Amaggi,Quais os 5 estados maiores produtores de soja ...,"Juntas, Grupo Amaggi, SLC Agrícola e Grupo Bom...",0


In [89]:
# Class balance of the binary label (1 = "yes", 0 = "no").
markup["label"].value_counts()

1    1947
0    1143
Name: label, dtype: int64

In [90]:
# Labels as a NumPy array; converted to a tensor after the inputs are tokenized.
labels = markup["label"].values

In [91]:
# Accumulators for the tokenizer outputs; the tokenization loop in the next cell
# appends one tensor per row of `markup` to each list.
input_ids = []
token_type_ids = []
attention_masks = []

In [92]:
# Re-create the accumulators here so that re-running this cell starts from a
# clean state instead of appending every example a second time (or failing
# because the names already hold concatenated tensors).
input_ids = []
token_type_ids = []
attention_masks = []

# pandas reads empty cells as NaN (a float). Replace them with empty strings so
# that the string concatenation and the tokenizer only ever receive text.
text_columns = markup[["query", "title", "snippet"]].fillna("").astype(str)

# `total` lets tqdm show real progress; iterrows() alone has no length.
for _, row in tqdm(text_columns.iterrows(), total=len(text_columns)):
    query = row["query"]
    document = row["title"] + " " + row["snippet"]

    # Encode (query, document) as a sentence pair: [CLS] query [SEP] document [SEP].
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    encoded_dict = tokenizer(
        query, document,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    input_ids.append(encoded_dict['input_ids'])
    token_type_ids.append(encoded_dict["token_type_ids"])
    attention_masks.append(encoded_dict['attention_mask'])

0it [00:00, ?it/s]

In [93]:
# Merge the per-example tensors collected by the tokenization loop into one
# tensor per input kind by concatenating along the first dimension.
input_ids, token_type_ids, attention_masks = (
    torch.cat(per_example, dim=0)
    for per_example in (input_ids, token_type_ids, attention_masks)
)
# Labels go from a NumPy array to a tensor so they can join the TensorDataset.
labels = torch.tensor(labels)

In [94]:
# The argument order fixes the batch layout used by the training loop:
# batch[0] = input ids, batch[1] = token type ids, batch[2] = attention mask,
# batch[3] = labels.
dataset = TensorDataset(input_ids, token_type_ids, attention_masks, labels)

In [95]:
# 70% of the examples go to training, the remainder to validation.
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size

# A seeded generator makes the split identical on every run, so validation
# metrics from different runs are computed on the same held-out examples.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [96]:
# Number of examples per batch; shared by the train, validation and test loaders.
batch_size = 32

In [97]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches keep a fixed order.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [98]:
# Multilingual BERT encoder with a sequence-classification head on top.
# The load warning printed below reports that some weights are not initialized
# from the checkpoint — presumably the new classification head, which is what
# the fine-tuning in this notebook trains.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2, # binary classification
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [99]:
# AdamW over all model parameters. The learning-rate schedule created later
# adjusts `lr` during training.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5, # initial learning rate
    eps=1e-8, # Adam epsilon
    weight_decay=1e-7, # very light weight decay
)

In [100]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    ``preds`` holds one row of per-class scores per example (the argmax is taken
    along axis 1); ``labels`` is an array of class ids that is flattened before
    the comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [101]:
def flat_roc_auc(preds, labels):
    """Compute ROC-AUC from the scores in column 1 of ``preds``.

    Column 1 is used as the score of the positive class; both the scores and
    ``labels`` are flattened to 1-D before being handed to ``roc_auc_score``.
    """
    positive_scores = preds[:, 1].ravel()
    true_labels = labels.ravel()
    return roc_auc_score(y_true=true_labels, y_score=positive_scores)

In [102]:
# Number of full passes over the training set.
epochs = 3

# The scheduler is stepped once per batch, so the total number of steps is
# (batches per epoch) x (epochs) — not the number of training samples.
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [103]:
# Per-epoch metrics; the training loop appends one dict per epoch.
training_stats = []

In [104]:
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
# `torch.backends.mps` is missing in older PyTorch builds, so it is looked up
# defensively instead of assuming the attribute exists.
mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = "cuda"
elif mps_backend is not None and mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

# Move the model parameters to the chosen device; batches are moved to the same
# device inside the training and prediction loops.
model = model.to(device)

In [106]:
torch.cuda.empty_cache()
for epoch_i in range(epochs):
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    # Measure how long the training epoch takes.
    t0 = time.time()

    total_train_loss = 0

    model.train()

    # Wrap the dataloader itself (not the enumerate object) so that tqdm knows
    # the number of batches and can display real progress.
    for step, batch in enumerate(tqdm(train_dataloader)):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout follows the TensorDataset construction order.
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Gradients accumulate by default; clear them before each step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss with the logits.
        output = model(b_input_ids,
                       token_type_ids=b_token_type_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels)

        loss = output.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    total_eval_loss = 0

    # Collected over the whole validation set so that the metrics are computed
    # on all examples at once instead of being averaged per batch.
    val_labels = []
    val_probs = []

    # Evaluate data for one epoch
    for batch in tqdm(val_dataloader):

        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # No compute graph is needed during evaluation, since it is only
        # required for backprop (training).
        with torch.no_grad():
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            output = model(b_input_ids,
                           token_type_ids=b_token_type_ids,
                           attention_mask=b_input_mask,
                           labels=b_labels)

        total_eval_loss += output.loss.item()

        # The "logits" are the model outputs prior to the softmax.
        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        val_labels.append(label_ids)
        val_probs.append(softmax(logits, axis=1))

    val_labels = np.concatenate(val_labels)
    val_probs = np.concatenate(val_probs)

    val_roc_auc = flat_roc_auc(val_probs, val_labels)
    print("  ROC-AUC: {0:.2f}".format(val_roc_auc))

    # Accuracy over the full validation set. Averaging per-batch accuracies
    # would over-weight the examples in a smaller final batch. Softmax keeps
    # the argmax unchanged, so probabilities can be used in place of logits.
    avg_val_accuracy = flat_accuracy(val_probs, val_labels)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. ROC-AUC': val_roc_auc,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )




0it [00:00, ?it/s]

  Batch    40  of     68.    Elapsed: 0:00:31.

  Average training loss: 0.62
  Training epcoh took: 0:00:54

Running Validation...


  0%|          | 0/29 [00:00<?, ?it/s]

  ROC-AUC: 0.78
  Accuracy: 0.72
  Validation Loss: 0.55
  Validation took: 0:00:07



0it [00:00, ?it/s]

  Batch    40  of     68.    Elapsed: 0:00:32.

  Average training loss: 0.53
  Training epcoh took: 0:00:54

Running Validation...


  0%|          | 0/29 [00:00<?, ?it/s]

  ROC-AUC: 0.77
  Accuracy: 0.73
  Validation Loss: 0.57
  Validation took: 0:00:07



0it [00:00, ?it/s]

  Batch    40  of     68.    Elapsed: 0:00:34.

  Average training loss: 0.53
  Training epcoh took: 0:00:56

Running Validation...


  0%|          | 0/29 [00:00<?, ?it/s]

  ROC-AUC: 0.78
  Accuracy: 0.73
  Validation Loss: 0.57
  Validation took: 0:00:07


In [128]:
# Persist the fine-tuned model. Only the model is saved here; the tokenizer is
# not written by this call.
# NOTE(review): save_pretrained() expects a target directory, so despite the
# ".pt" suffix this path presumably becomes a folder — confirm and consider
# renaming.
model.save_pretrained("./models/pretrained-bert.pt")

# Predictions

### Data preparation

In [115]:
# Build a stand-in for the real test file: 400 rows sampled without replacement
# from the labelled data, reshaped to the test-file column layout.
# The fixed random_state makes the sample reproducible across runs.
# NOTE: these rows come from the same pool that the train/validation split was
# drawn from, so predictions on this file only exercise the pipeline and are
# not a measure of generalization.
fake_test_dataset = (
    markup
    .sample(n=400, replace=False, random_state=42)
    .assign(url="")
    .rename(columns={
        "channel_id": "id",
        "query": "Query",
        "title": "Title",
        "snippet": "Snippet"
    })
    [["id", "url", "Query", "Title", "Snippet"]]
    .reset_index(drop=True)
)
fake_test_dataset.to_csv("fake_test_dataset.csv", index=False)

fake_test_dataset

Unnamed: 0,id,url,Query,Title,Snippet
0,620,,Grupo Bimbo,Diário do Comércio,"Setenta anos mais tarde, seu neto Massimo é pr..."
1,823,,"Aice Mochi, Aice Klepon, Aice Jagung",Jens Rydqvist🇸🇪🇺🇲 on Twitter,Mixing religion and politics is like mixing ic...
2,505,,Tesla,"Twitter begins layoffs, cuts to affect 50% of ...",Tesla CEO Elon Musk closed the deal to acquire...
3,581,,"Verzani & Sandrini, Verzani Sandrini",Auricchio assina Ordem de Serviço para a revit...,Prefeito de São Caetano fez balanço das reform...
4,770,,Jalasoft,"Marcelo Claure, Rosario Paz, Ximena Behoteguy ...","So, he founded Jalasoft with only six engineer..."
...,...,...,...,...,...
395,913,,Rush Royale,mvc ajax form submit without refresh,Download Code Sample Download Free Word/PDF/Ex...
396,613,,El Economista,Link to page,"🔴 Ocurrió en Monterrey, mientras el economista..."
397,618,,Cencosud S.A.,Manual de Instalação da Imagem do Windows-7 [P...,PREVIEW PDF Titulo – Procedimento para Instala...
398,823,,"Aice Mochi, Aice Klepon, Aice Jagung",Deeper Purpose Community Church hosting carniv...,"They will also have bounce houses, pony rides,..."


Ниже нужно прочитать данные из файла в описании задачи

In [116]:
# Load the test file (here: the stand-in CSV written by the previous cell).
test_frame = pd.read_csv("fake_test_dataset.csv").reset_index(drop=True)
test_frame

Unnamed: 0,id,url,Query,Title,Snippet
0,620,,Grupo Bimbo,Diário do Comércio,"Setenta anos mais tarde, seu neto Massimo é pr..."
1,823,,"Aice Mochi, Aice Klepon, Aice Jagung",Jens Rydqvist🇸🇪🇺🇲 on Twitter,Mixing religion and politics is like mixing ic...
2,505,,Tesla,"Twitter begins layoffs, cuts to affect 50% of ...",Tesla CEO Elon Musk closed the deal to acquire...
3,581,,"Verzani & Sandrini, Verzani Sandrini",Auricchio assina Ordem de Serviço para a revit...,Prefeito de São Caetano fez balanço das reform...
4,770,,Jalasoft,"Marcelo Claure, Rosario Paz, Ximena Behoteguy ...","So, he founded Jalasoft with only six engineer..."
...,...,...,...,...,...
395,913,,Rush Royale,mvc ajax form submit without refresh,Download Code Sample Download Free Word/PDF/Ex...
396,613,,El Economista,Link to page,"🔴 Ocurrió en Monterrey, mientras el economista..."
397,618,,Cencosud S.A.,Manual de Instalação da Imagem do Windows-7 [P...,PREVIEW PDF Titulo – Procedimento para Instala...
398,823,,"Aice Mochi, Aice Klepon, Aice Jagung",Deeper Purpose Community Church hosting carniv...,"They will also have bounce houses, pony rides,..."


In [117]:
# Use the same lower-case column names as the training frame so the
# tokenization code can address the columns identically.
test_frame = test_frame.rename(columns={"Query": "query", "Title": "title", "Snippet": "snippet"})

In [118]:
test_input_ids = []
test_token_type_ids = []
test_attention_masks = []

# Empty cells come back from read_csv as NaN (a float). Replace them with empty
# strings so that the concatenation and the tokenizer only ever receive text.
test_text_columns = test_frame[["query", "title", "snippet"]].fillna("").astype(str)

# `total` lets tqdm show real progress; iterrows() alone has no length.
for _, row in tqdm(test_text_columns.iterrows(), total=len(test_text_columns)):
    query = row["query"]
    document = row["title"] + " " + row["snippet"]

    # Same sentence-pair encoding and max_length as used for the training data.
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    encoded_dict = tokenizer(
        query, document,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    test_input_ids.append(encoded_dict['input_ids'])
    test_token_type_ids.append(encoded_dict["token_type_ids"])
    test_attention_masks.append(encoded_dict['attention_mask'])

0it [00:00, ?it/s]

In [119]:
# Merge the per-example tensors into one tensor per input kind.
test_input_ids, test_token_type_ids, test_attention_masks = (
    torch.cat(per_example, dim=0)
    for per_example in (test_input_ids, test_token_type_ids, test_attention_masks)
)

# No labels here: each batch is (input ids, token type ids, attention mask).
test_dataset = TensorDataset(test_input_ids, test_token_type_ids, test_attention_masks)

# Keep the original row order so predictions line up with `test_frame`.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Предсказания

In [124]:
# Switch to evaluation mode before predicting.
model.eval()

# One array of logits per batch, in dataloader order.
predictions = []

# Gradients are not needed for prediction; disabling them saves memory and time.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # Each batch holds (input ids, token type ids, attention mask).
        b_input_ids, b_token_type_ids, b_input_mask = (
            tensor.to(device) for tensor in batch
        )

        # Forward pass; no labels are passed, so the first output is the logits.
        outputs = model(
            b_input_ids,
            token_type_ids=b_token_type_ids,
            attention_mask=b_input_mask,
        )

        logits = outputs[0].detach().cpu().numpy()
        predictions.append(logits)

  0%|          | 0/13 [00:00<?, ?it/s]

In [127]:
# Join the per-batch logits, convert them to probabilities and keep column 1,
# the score of label 1 ("yes").
all_logits = np.concatenate(predictions, axis=0)
probs = softmax(all_logits, axis=1)[:, 1]

# Attach the scores to the test rows and export id + probability.
test_frame["relevance_probability"] = probs
test_frame[["id", "relevance_probability"]].to_csv("final_prediction.csv", index=False)