## The following code finds the external sentences that are most similar to the training set.

# Imports

In [None]:
!pip install transformers
!pip install sentence-transformers

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import time
import transformers

# Configuration

In [None]:
# Fixed sequence length: the dataset classes below truncate to and pad up to this many tokens.
MAX_LEN = 256
# Model identifiers passed to `from_pretrained` for the backbones and tokenizers.
ELECTRA_PATH = "google/electra-large-discriminator"
ROBERTA_LARGE_PATH = "roberta-large"
ROBERTA_BASE_PATH = "roberta-base"
# Tokenizers are created once here and shared by RobertaDataset / ElectraDataset.
# The RoBERTa tokenizer is loaded from the roberta-large identifier.
TOKENIZER_ROBERTA = transformers.AutoTokenizer.from_pretrained(ROBERTA_LARGE_PATH)
TOKENIZER_ELECTRA = AutoTokenizer.from_pretrained(ELECTRA_PATH)

# Set up google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "/content/drive/My Drive/CommonLit/External Data"

/content/drive/My Drive/CommonLit/External Data


# Function to find similar texts

In [None]:
def top_k_most_similar_texts(queries, corpus_text, corpus_embedding, top_k, model_name = 'paraphrase-TinyBERT-L6-v2'):
  """
  Encode the training sentences and look up, for each of them, the `top_k`
  closest external sentences.

  Args:
      queries : list of training sentences
      corpus_text : list of external sentences, aligned index by index with
                    `corpus_embedding`
      corpus_embedding : embeddings of the external sentences
      top_k : number of external sentences to retrieve per training sentence.
              For example, with top_k = 5 and 100 training sentences the
              result holds 100 lists of up to 5 sentences each.
      model_name : sentence-transformers model used to encode the queries.
                   It needs to be the same model that produced `corpus_embedding`.

  Returns:
      A list with one entry per query; each entry is the list of external
      sentences retrieved for that query, in the order returned by the search.
  """
  encoder = SentenceTransformer(model_name)

  print("Start encoding queries")
  query_vectors = encoder.encode(queries, convert_to_tensor=True)

  print("Start to select sentences")
  hits_per_query = util.semantic_search(query_vectors, corpus_embedding, top_k=top_k)

  # Each hit carries the index of the matching corpus entry under 'corpus_id'.
  return [
    [corpus_text[hit['corpus_id']] for hit in hits]
    for hits in hits_per_query
  ]

In [None]:
def load_external_text_embedding(datasets):
  """
  Gather the sentences and the embeddings of several external datasets into
  two flat, index-aligned lists.

  For every dataset name, the embeddings are read from
  `encoded_sentences/<name>.pt` and the sentences from the `text` column of
  `preprocessed_data/<name>.csv` (both relative to the working directory).

  Args:
      datasets (list of str) : names of the datasets to load

  Returns:
      (embeddings, sentences) : two lists of the same length

  Raises:
      AssertionError : if the total number of embeddings and sentences differ

  Example usage:

      embeddings, sentences = load_external_text_embedding(["wiki", "simple_wiki"])
      Returns the embeddings and the sentences of both datasets, concatenated.
  """
  all_embeddings, all_sentences = [], []

  for name in datasets:
    print(f"Creating {name}")
    embedding_path = os.path.join("encoded_sentences", name + ".pt")
    csv_path = os.path.join("preprocessed_data", name + ".csv")

    dataset_embeddings = torch.load(embedding_path)
    dataset_frame = pd.read_csv(csv_path)

    # Extending with the loaded object appends one item per element
    # yielded when iterating over it.
    all_embeddings.extend(dataset_embeddings)
    all_sentences.extend(dataset_frame.text.values)

  assert len(all_embeddings)==len(all_sentences)

  return all_embeddings, all_sentences


In [None]:
def zip_external_data_scores(selected_sentences, stdev, targets):
  """
  Attach to every selected external sentence the target and the standard
  deviation of the training sentence it was retrieved for.

  Args:
      selected_sentences : list of lists; entry i holds the external sentences
                           retrieved for training sentence i
      stdev : standard deviations of the training sentences, indexed like
              `selected_sentences`
      targets : targets of the training sentences, indexed like
                `selected_sentences`

  Returns:
      A flat list of (sentence, target, stdev) tuples.
  """
  return [
    (text, targets[position], stdev[position])
    for position, group in enumerate(selected_sentences)
    for text in group
  ]

In [None]:
def create_folds(fold):
  """
  Read `train_folds.csv` and return the rows that do NOT belong to `fold`.

  Args:
      fold (int) : fold number to leave out

  Returns:
      (queries, targets, stdev) : the excerpts as `str`, the targets as
      `float` and the standard errors as `float`, as three parallel lists
  """
  folds_frame = pd.read_csv("train_folds.csv")
  kept_rows = folds_frame.loc[folds_frame.kfold != fold]

  targets = list(map(float, kept_rows.target.values))
  queries = list(map(str, kept_rows.excerpt.values))
  stdev = list(map(float, kept_rows.standard_error.values))

  return queries, targets, stdev

In [None]:
def run(datasets, top_k = 5, folds = 5):
  """
  For every fold, retrieve the top k most similar external sentences for the
  training sentences outside that fold and save them, together with the
  target and standard deviation of the matching training sentence, to
  `queries_<fold>.csv`.

  Args:
      datasets (list) : list of strings with the names of the external datasets
      top_k (int) : number of similar sentences for each training sample
      folds (int) : number of folds

  Returns:
      The DataFrame (columns: sentences, targets, stdev) built for the last
      fold, or None when `folds` is 0.
  """
  # The external corpus does not depend on the fold: load it once instead of
  # re-reading every embedding file on each iteration.
  load_start = time.time()
  print("Loading External Data")
  embeddings, sentences = load_external_text_embedding(datasets)
  print(f'Time: {time.time() - load_start}')

  zipped = None
  for fold in range(folds):
    start = time.time()

    print("Creating folds")
    queries, targets, stdev = create_folds(fold)
    print(f'Time: {time.time() - start}')

    print("Encoding queries and finding the top k most similar texts")
    selected_sentences = top_k_most_similar_texts(queries, sentences, embeddings, top_k = top_k)
    print(f'Time: {time.time() - start}')

    print("Zip chosen External Data with most similar sentences")
    # Passing the column names to the constructor also works when nothing was
    # selected, whereas assigning `.columns` on an empty frame raises.
    zipped = pd.DataFrame(
      zip_external_data_scores(selected_sentences, stdev, targets),
      columns=["sentences", "targets", "stdev"],
    )
    print(f'Time: {time.time() - start}')

    print("Saving")
    # NOTE: the index is written as well, so reading the file back yields an
    # additional unnamed column.
    zipped.to_csv(f"queries_{fold}.csv")

  return zipped

In [None]:
# Alternative ordering kept for reference:
# datasets = ["wikipedia", "simple_wikipedia", "onestop", "cbt"]
# Each name is resolved by load_external_text_embedding to
# encoded_sentences/<name>.pt and preprocessed_data/<name>.csv.
datasets = ["simple_wikipedia", "onestop", "wikipedia", "cbt"]
# Writes queries_<fold>.csv for every fold; `target` is the frame of the last fold.
target = run(datasets)

# Pseudo labeling

### The following code creates the pseudo labels for the external data. At the end, the external data is filtered a second time so that it is closer to our training set.


## Dataset

In [None]:
class RobertaDataset:
    """Map-style dataset that tokenizes `df.sentences` with the RoBERTa tokenizer.

    Each item is a dict with fixed-length `ids` and `mask` tensors (length
    MAX_LEN) and the matching entry of `df.targets` as a float tensor.
    """

    def __init__(self,df):
        self.excerpt = df.sentences.values
        self.target = df.targets.values

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,item):
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.excerpt[item]).split())
        encoded = TOKENIZER_ROBERTA(text, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)

        # Right-pad by hand up to MAX_LEN with zeros for both ids and mask.
        # NOTE(review): 0 is presumably not RoBERTa's pad token id; left as is
        # because the checkpoints used later were presumably trained with this
        # exact preprocessing - confirm before changing.
        filler = [0] * (MAX_LEN - len(encoded["input_ids"]))
        token_ids = encoded["input_ids"] + filler
        attention = encoded["attention_mask"] + filler

        return {
            "ids": torch.tensor(token_ids, dtype=torch.long),
            "mask": torch.tensor(attention, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

In [None]:
class ElectraDataset:
    """Map-style dataset that tokenizes `df.sentences` with the ELECTRA tokenizer.

    Each item is a dict with fixed-length `ids`, `mask` and `token_type_ids`
    tensors (length MAX_LEN) and the matching entry of `df.targets` as a
    float tensor.
    """

    def __init__(self,df):
        self.excerpt = df.sentences.values
        self.target = df.targets.values

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,item):
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.excerpt[item]).split())
        encoded = TOKENIZER_ELECTRA(text, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)

        # Right-pad the three sequences by hand up to MAX_LEN with zeros.
        filler = [0] * (MAX_LEN - len(encoded["input_ids"]))
        token_ids = encoded["input_ids"] + filler
        attention = encoded["attention_mask"] + filler
        segments = encoded["token_type_ids"] + filler

        return {
            "ids": torch.tensor(token_ids, dtype=torch.long),
            "mask": torch.tensor(attention, dtype=torch.long),
            "token_type_ids": torch.tensor(segments, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

## Models

In [None]:
class RobertaBaseAttention(nn.Module):
    """roberta-base backbone followed by a pooling step and a regression head.

    Args:
        model_type: "attention" pools the last hidden states with learned
            attention weights; "mean" uses a mask-aware mean over the tokens.

    The head sizes are derived from the backbone configuration, so the
    "mean" variant matches the hidden size of roberta-base (the previous
    hard-coded 1024 made that variant fail in `forward`).
    """

    def __init__(self, model_type="attention"):
        super().__init__()

        self.model_type = model_type

        self.config = AutoConfig.from_pretrained(ROBERTA_BASE_PATH)
        self.config.update({"output_hidden_states":True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_BASE_PATH, config=self.config)

        # Width of the backbone output; drives the size of every head layer.
        hidden_size = self.config.hidden_size

        if model_type == "attention":
            # Produces one weight per token, normalised over the sequence axis.
            self.attention = nn.Sequential(
                nn.Linear(hidden_size, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1),
            )

        if model_type in ("attention", "mean"):
            # Attribute names are kept so existing state dicts still load.
            self.layer_norm1 = nn.LayerNorm(hidden_size)
            self.linear1 = nn.Linear(hidden_size, 256)
            self.linear2 = nn.Linear(256, 1)
            self.layer_norm2 = nn.LayerNorm(256)

    def freeze(self):
        """Disable gradients for every parameter of the backbone's submodules."""
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = False

    def unfreeze(self):
        """Re-enable gradients for every parameter of the backbone's submodules."""
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = True

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Return the regression output for a batch.

        Args:
            ids: token ids fed to the backbone as `input_ids`.
            mask: attention mask fed to the backbone; also used to weight
                the tokens in "mean" pooling.
            loss_fn: unused, kept for interface compatibility.
            targets: unused, kept for interface compatibility.

        Raises:
            ValueError: if `model_type` is neither "mean" nor "attention".
        """
        if self.model_type not in ("mean", "attention"):
            # Previously this surfaced as an UnboundLocalError on `logits`.
            raise ValueError(f"Unknown model_type: {self.model_type}")

        last_hidden_state = self.roberta(input_ids=ids, attention_mask=mask)[0]

        if self.model_type == "mean":
            # Average only over the positions where the mask is set.
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            pooled = sum_embeddings / sum_mask
        else:
            # NOTE: the attention weights are computed over all positions,
            # including padded ones; kept unchanged to match trained weights.
            weights = self.attention(last_hidden_state)
            pooled = torch.sum(weights * last_hidden_state, dim=1)

        logits = self.linear1(self.layer_norm1(pooled))
        logits = self.linear2(self.layer_norm2(logits))

        return logits

In [None]:
class RobertaLargeAttention(nn.Module):
    """roberta-large backbone with a pooling step and a 1024 -> 256 -> 1 head.

    `model_type` selects the pooling: "attention" (learned token weights) or
    "mean" (mask-aware average of the last hidden states).
    """

    def __init__(self, model_type="attention"):
        super().__init__()

        self.model_type = model_type

        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        self.config = AutoConfig.from_pretrained(ROBERTA_LARGE_PATH)
        self.config.update(overrides)
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_LARGE_PATH, config=self.config)

        if model_type == "attention":
            # One weight per token, softmax-normalised along the sequence axis.
            self.attention = nn.Sequential(
                nn.Linear(1024, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1),
            )

        # Both supported pooling modes share the same regression head.
        if model_type in ("attention", "mean"):
            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 256)
            self.linear2 = nn.Linear(256, 1)
            self.layer_norm2 = nn.LayerNorm(256)

    def _set_backbone_trainable(self, trainable):
        # Toggle requires_grad on every parameter of the backbone's children.
        for child in self.roberta.children():
            child.requires_grad_(trainable)

    def freeze(self):
        """Stop gradient computation for the backbone."""
        self._set_backbone_trainable(False)

    def unfreeze(self):
        """Resume gradient computation for the backbone."""
        self._set_backbone_trainable(True)

    def _regress(self, pooled):
        # LayerNorm -> Linear -> LayerNorm -> Linear, as in the original head.
        projected = self.linear1(self.layer_norm1(pooled))
        return self.linear2(self.layer_norm2(projected))

    def _mean_pooled(self, ids, mask):
        hidden = self.roberta(ids, mask)[0]
        token_weights = mask.unsqueeze(-1).expand(hidden.size()).float()
        weighted_sum = torch.sum(hidden * token_weights, 1)
        token_count = torch.clamp(token_weights.sum(1), min=1e-9)
        return weighted_sum / token_count

    def _attention_pooled(self, ids, mask):
        hidden = self.roberta(input_ids=ids, attention_mask=mask).last_hidden_state
        scores = self.attention(hidden)
        return torch.sum(scores * hidden, dim=1)

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Return the regression output; `loss_fn` and `targets` are unused."""
        if self.model_type == "mean":
            logits = self._regress(self._mean_pooled(ids, mask))
        elif self.model_type == "attention":
            logits = self._regress(self._attention_pooled(ids, mask))

        return logits

In [None]:
class RobertaLargeMean(nn.Module):
    """roberta-large backbone with mean pooling (default) or attention pooling.

    "mean" averages the last hidden states over the masked tokens and applies
    a 1024 -> 768 -> 1 head with layer norms; "attention" applies learned
    token weights followed by a single 1024 -> 1 linear layer.
    """

    def __init__(self, model_type="mean"):
        super().__init__()

        self.model_type = model_type

        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        self.config = AutoConfig.from_pretrained(ROBERTA_LARGE_PATH)
        self.config.update(overrides)
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_LARGE_PATH, config=self.config)

        if model_type == "attention":
            # One weight per token, softmax-normalised along the sequence axis.
            self.attention = nn.Sequential(
                nn.Linear(1024, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1),
            )
            self.linear = nn.Linear(1024, 1)

        elif model_type == "mean":
            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 768)
            self.linear2 = nn.Linear(768, 1)
            self.layer_norm2 = nn.LayerNorm(768)

    def _set_backbone_trainable(self, trainable):
        # Toggle requires_grad on every parameter of the backbone's children.
        for child in self.roberta.children():
            child.requires_grad_(trainable)

    def freeze(self):
        """Stop gradient computation for the backbone."""
        self._set_backbone_trainable(False)

    def unfreeze(self):
        """Resume gradient computation for the backbone."""
        self._set_backbone_trainable(True)

    def _mean_logits(self, ids, mask):
        hidden = self.roberta(ids, mask)[0]
        token_weights = mask.unsqueeze(-1).expand(hidden.size()).float()
        weighted_sum = torch.sum(hidden * token_weights, 1)
        token_count = torch.clamp(token_weights.sum(1), min=1e-9)
        pooled = weighted_sum / token_count
        projected = self.linear1(self.layer_norm1(pooled))
        return self.linear2(self.layer_norm2(projected))

    def _attention_logits(self, ids, mask):
        hidden = self.roberta(input_ids=ids, attention_mask=mask).last_hidden_state
        scores = self.attention(hidden)
        return self.linear(torch.sum(scores * hidden, dim=1))

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Return the regression output; `loss_fn` and `targets` are unused."""
        if self.model_type == "mean":
            logits = self._mean_logits(ids, mask)
        elif self.model_type == "attention":
            logits = self._attention_logits(ids, mask)

        return logits

In [None]:
class ElectraLarge(nn.Module):
    """ELECTRA-large discriminator backbone with a pooling step and a regression head.

    Args:
        model_type: "mean" (default) averages the last hidden states over
            the masked tokens and applies a 1024 -> 768 -> 1 head;
            "attention" pools with learned token weights and applies a single
            1024 -> 1 linear layer.

    The constructor always built the "attention" layers, but `forward` had no
    matching branch and failed with an UnboundLocalError; that branch is now
    implemented.
    """

    def __init__(self, model_type="mean"):
        super().__init__()

        self.model_type = model_type

        self.config = AutoConfig.from_pretrained(ELECTRA_PATH)
        self.config.update({
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7
                       })

        self.electra = AutoModel.from_pretrained(ELECTRA_PATH, config=self.config)

        if model_type == "attention":

            # One weight per token, softmax-normalised along the sequence axis.
            self.attention = nn.Sequential(
                nn.Linear(1024, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1),
            )

            self.linear = nn.Linear(1024, 1)

        elif model_type == "mean":

            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 768)
            self.linear2 = nn.Linear(768, 1)
            self.layer_norm2 = nn.LayerNorm(768)

    def forward(self, ids, mask, token_type_ids, loss_fn = None, targets = None):
        """Return the regression output for a batch.

        Args:
            ids: token ids fed to the backbone as `input_ids`.
            mask: attention mask; also weights the tokens in "mean" pooling.
            token_type_ids: segment ids fed to the backbone.
            loss_fn: unused, kept for interface compatibility.
            targets: unused, kept for interface compatibility.

        Raises:
            ValueError: if `model_type` is neither "mean" nor "attention".
        """
        if self.model_type not in ("mean", "attention"):
            raise ValueError(f"Unknown model_type: {self.model_type}")

        outputs = self.electra(input_ids=ids, attention_mask=mask, token_type_ids = token_type_ids)
        last_hidden_state = outputs.last_hidden_state

        if self.model_type == "mean":
            # Average only over the positions where the mask is set.
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            logits = self.linear1(self.layer_norm1(mean_embeddings))
            logits = self.linear2(self.layer_norm2(logits))
        else:
            # Same attention pooling as the RoBERTa models of this notebook.
            weights = self.attention(last_hidden_state)
            context_vector = torch.sum(weights * last_hidden_state, dim=1)
            logits = self.linear(context_vector)

        return logits

## Create Predictions

In [None]:
def inf_fn(data_loader_electra,data_loader_roberta, model, device):
    """
    Run `model` in evaluation mode over the matching dataloader and return
    its predictions.

    The dataloader is chosen from the class name of `model`: classes whose
    name contains "Electra" read `data_loader_electra` and additionally
    receive `token_type_ids`; every other model reads `data_loader_roberta`.

    Args:
        data_loader_electra (dataloader) : sentences tokenized with the electra tokenizer
        data_loader_roberta (dataloader) : sentences tokenized with the roberta tokenizer
        model (pytorch model) : the model you want to make predictions with
        device : gpu or cpu device

    Returns:
        A 1-D numpy array with one prediction per sample (empty when the
        dataloader yields no batch).
    """
    model.eval()

    is_electra = "Electra" in model.__class__.__name__
    data_loader = data_loader_electra if is_electra else data_loader_roberta
    input_keys = ("ids", "mask", "token_type_ids") if is_electra else ("ids", "mask")

    batch_predictions = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {key: batch[key].to(device, dtype=torch.long) for key in input_keys}
            outputs = model(**inputs)
            # Flatten every batch so the result is 1-D regardless of how many
            # batches there are (a single batch used to come back as (B, 1)).
            batch_predictions.append(outputs.cpu().detach().numpy().ravel())

    if not batch_predictions:
        return np.array([], dtype=np.float32)

    # Concatenate once at the end instead of re-copying after every batch.
    return np.concatenate(batch_predictions)

In [None]:
def create_dataloader(df):
    """Build the two PyTorch dataloaders (batch size 16) used for inference.

    Args:
        df: frame handed to ElectraDataset and RobertaDataset.

    Returns:
        (electra_dataloader, roberta_dataloader), in that order.
    """
    loaders = [
        torch.utils.data.DataLoader(dataset_cls(df), batch_size=16)
        for dataset_cls in (ElectraDataset, RobertaDataset)
    ]

    return tuple(loaders)

In [None]:
def create_model(device, name, path):
    """Instantiate the architecture registered under `name` and load its weights.

    Args:
        device: torch device the model is moved to; the checkpoint is mapped
            onto the same device while loading.
        name: model identifier; each architecture accepts two aliases.
        path: path of the state dict to load.

    Returns:
        The model on `device` with the weights from `path` loaded.

    Raises:
        ValueError: if `name` is not a known identifier (ValueError is a
            subclass of the `Exception` raised previously).
    """
    if name in ("coderobertalargeattentionnorm2", "RobertaLargeModelsAttention"):
        model_cls = RobertaLargeAttention
    elif name in ("coderobertalargemean", "RobertaLargeModelsMean"):
        model_cls = RobertaLargeMean
    elif name in ("codeelectralargebaseline", "ElectraLargeModels"):
        model_cls = ElectraLarge
    elif name in ("coderobertabaseattentionnorm", "RobertaBaseModels"):
        model_cls = RobertaBaseAttention
    else:
        raise ValueError(f"Unknown model: {name}")

    model = model_cls().to(device)
    # map_location lets a checkpoint saved on one device be loaded on another
    # (e.g. GPU-trained weights on a CPU-only machine).
    model.load_state_dict(torch.load(path, map_location=device))

    return model

In [None]:
def make_predictions(name, path, electra_dataloader, roberta_dataloader ):
    """Load the checkpoint at `path` for model `name` and run inference with it.

    Args:
        name: model identifier understood by `create_model`.
        path: path of the state dict to load.
        electra_dataloader: dataloader used by ELECTRA models.
        roberta_dataloader: dataloader used by every other model.

    Returns:
        The predictions returned by `inf_fn`.
    """
    # Use the GPU when there is one, otherwise fall back to the CPU instead
    # of failing on the hard-coded "cuda" device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # create_model already loads the state dict; reading the checkpoint from
    # disk a second time here was redundant.
    model = create_model(device, name, path)

    return inf_fn(electra_dataloader, roberta_dataloader, model, device)

## Blend Predictions and Filter

In [None]:
# Weights for each model (based on my best submission in the competition).
# Keys are the model names understood by create_model; the weights sum to 1.0.
dict_blend = {'ElectraLargeModels': 0.3, 'RobertaLargeModelsMean': 0.3, 'RobertaLargeModelsAttention': 0.2, 'RobertaBaseModels': 0.2}

In [None]:
def generate_predictions(df, fold_number, dict_blend, n_folds = 5):
  """
  Predict `df` with every model of the blend, average the predictions over
  the model folds, store them in `df["predictions"]` and save the frame to
  `pseudo_labels_fold_<fold_number>.csv`.

  Args:
      df (dataframe) : external sentences to label (modified in place)
      fold_number (int) : fold the external data belongs to; only used in the
                          name of the output file
      dict_blend (dict) : maps each model name to its blending weight; the
                          checkpoints are read from `models/<name>/model<fold>.bin`
      n_folds (int) : number of checkpoints per model (defaults to 5, the
                      value that used to be hard-coded)

  Returns:
      `df` with the additional `predictions` column.

  Raises:
      ValueError : if `n_folds` is not positive or `dict_blend` is empty
  """
  if n_folds <= 0:
    raise ValueError(f"n_folds must be positive, got {n_folds}")
  if not dict_blend:
    raise ValueError("dict_blend must contain at least one model")

  electra_dataloader, roberta_dataloader = create_dataloader(df)

  print("Make Predictions")
  predictions = []
  for fold in range(n_folds):
    # Iterating the blend itself keeps the model list and the weights in sync.
    for model, weight in dict_blend.items():
      print(f"Inference for fold {fold} for model {model}")
      preds = make_predictions(model, f"models/{model}/model{fold}.bin", electra_dataloader, roberta_dataloader)
      predictions.append(weight * preds / n_folds)

  # Flatten so that a (N, 1) result is also stored as a plain column.
  final_preds = np.ravel(np.sum(predictions, axis=0))

  df["predictions"] = final_preds
  df.to_csv(f"pseudo_labels_fold_{fold_number}.csv", index = False)

  return df

In [None]:
def filter_sentences(df):
  """
  Decide whether one pseudo-labelled external sentence should be kept.

  Following the first-place CommonLit solution, a sentence is kept when the
  blended prediction lies strictly inside the interval
  (targets - stdev, targets + stdev) of its most similar training text.

  Args:
      df : a single row exposing the fields `targets`, `stdev` and
           `predictions` (the function is applied row by row)

  Returns:
      1 if we need to keep the sentence
      0 if not
  """
  lowest_allowed = df.targets - df.stdev
  highest_allowed = df.targets + df.stdev

  inside_interval = lowest_allowed < df.predictions < highest_allowed
  return 1 if inside_interval else 0

In [None]:
def saved_filter_external_data(df,fold):
  """
  Flag every row of `df` with `filter_sentences` in a new `to_keep` column
  and write the frame to `pseudo_labels_fold_<fold>.csv`.
  """
  keep_flags = df.apply(filter_sentences, axis=1)
  df["to_keep"] = keep_flags

  output_path = f"pseudo_labels_fold_{fold}.csv"
  df.to_csv(output_path, index=False)

In [None]:
def run_pseudo_labeling():
  """
  For each of the 5 folds: read `queries_<fold>.csv`, generate the blended
  pseudo labels with the module-level `dict_blend`, then flag and save the
  sentences to keep.
  """
  for fold_id in range(5):
    queries_frame = pd.read_csv(f"queries_{fold_id}.csv")
    labelled = generate_predictions(queries_frame, fold_number = fold_id, dict_blend = dict_blend)
    saved_filter_external_data(labelled, fold_id)
    print(f"Pseudo Labeling Done for fold {fold_id}")

In [None]:
# Runs the pseudo-labelling for all folds: reads queries_<fold>.csv and
# writes pseudo_labels_fold_<fold>.csv for each of them.
run_pseudo_labeling()