## Setup

In [11]:
# !pip install -qq transformers
# !pip install -qq datasets
# !pip install -qq torch

In [12]:
# import libraries

import gc

import numpy as np
import pandas as pd 


from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import Trainer, TrainingArguments
from transformers.models.bert.modeling_bert import BertModel
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from datasets import Dataset


import torch.nn as nn
import torch.nn.functional as F
import torch

from sklearn.model_selection import train_test_split

In [13]:
# Directory holding the competition CSV files; kept in one place so the three
# reads below cannot drift apart if the input location changes.
DATA_DIR = '../input/feedback-prize-english-language-learning'

train = pd.read_csv(f'{DATA_DIR}/train.csv')                # labelled essays
test = pd.read_csv(f'{DATA_DIR}/test.csv')                  # essays to score
ss = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')       # submission template

## Preprocessing and modelling

In [14]:
# Initialize tokenizer and config from disk.
# Both are read from local paths under ../input/feadbackbase/outputs/.
# `config` is later given `num_labels` and passed to FeedBackModel.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained('../input/feadbackbase/outputs/tokenizer.h5/')
config = AutoConfig.from_pretrained('../input/feadbackbase/outputs/bert.h5/config.json')

In [15]:
# Number of folds (presumably one saved checkpoint per fold - confirm against the model directory).
N_FOLDS = 5
# Number of regression outputs; assigned to config.num_labels before the models are built.
N_LABELS = 6
# Token length every essay is padded / truncated to by the tokenizer call.
# NOTE: the name is misspelled ("LENGHT") but is referenced elsewhere in the notebook, so it is kept.
MAX_LENGHT = 512

In [16]:
def prepare_ds(train_df, val_df):
    '''
    Turn the train and validation dataframes into tokenized torch-format datasets.

    inputs:
        train_df : pd.DataFrame
        val_df : pd.DataFrame

    outputs:
        (encoded_train, encoded_val) : tuple of Dataset, in that order
    '''
    def _encode(frame):
        # dataframe -> Dataset -> tokenized rows -> drop the columns listed in drop_cols
        dataset = Dataset.from_pandas(frame, preserve_index = False)
        dataset = dataset.map(tokenize_and_process).remove_columns(drop_cols)
        # set_format works in place and returns None, so it cannot be chained
        dataset.set_format('torch')
        return dataset

    return _encode(train_df), _encode(val_df)

In [17]:
def tokenize_and_process(examples):
    '''
    Tokenize one example's 'full_text' and, when available, attach its labels.

    inputs:
        examples : mapping of column name -> value for a single row
                   (as passed by a non-batched Dataset.map)

    outputs:
        tokens : tokenizer output padded / truncated to MAX_LENGHT tokens, with a
                 'labels' list (one value per entry of target_cols, in that order)
                 added only when the row carries every target column
    '''
    tokens = tokenizer(examples['full_text'], padding='max_length',
                       max_length= MAX_LENGHT, truncation= True)
    # Decide "is this a labelled row?" by looking for the target columns
    # themselves rather than by counting how many columns the row has.
    if all(col in examples for col in target_cols):
        tokens['labels'] = [examples[col] for col in target_cols]
    return tokens

In [18]:
# Every column of `train` after the first two is treated as a regression target.
target_cols = list(train.columns)[2:]
# Columns that must survive in the encoded datasets.
keep_cols = ['input_ids', 'attention_mask', 'labels']
# Original dataframe columns to strip once tokenization has run.
drop_cols = [name for name in train.columns if name not in keep_cols]

In [19]:
class MeanPooling(nn.Module):
    """Average token vectors over the sequence axis, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-mask-weighted mean of ``last_hidden_state`` along dim 1.

        ``attention_mask`` is broadcast to the shape of ``last_hidden_state``;
        positions where it is 0 contribute nothing to the sum or to the count.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the per-feature count so a fully masked row divides by 1e-9, not 0.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        masked_total = torch.sum(last_hidden_state * mask, 1)
        return masked_total / token_counts

class FeedBackModel(BertPreTrainedModel):
    """BERT encoder followed by masked mean pooling and a linear regression head.

    The head emits ``config.num_labels`` values per input. When ``labels`` are
    passed to ``forward`` an MSE loss is returned together with the predictions.
    """

    def __init__(self, config):
        """Build encoder, pooling layer and head from ``config``.

        Layer widths follow ``config.hidden_size`` instead of a literal 768, so
        the head always matches the encoder described by the config.
        """
        super().__init__(config)
        num_labels = config.num_labels
        hidden_size = config.hidden_size
        self.bert =  BertModel(config)
        self.pool = MeanPooling()
        self.fc1 = nn.Linear(hidden_size, num_labels)
        # Not used in forward(). Kept so the module still exposes the same
        # parameter names as before (relevant when loading saved weights).
        self.embed = nn.Embedding(embedding_dim=hidden_size, num_embeddings=7000)

        self.init_weights()

    def forward(self, input_ids = None, attention_mask = None, labels = None):
        """Run the encoder and regression head.

        Returns a dict with key ``'logits'`` (the head output) and, only when
        ``labels`` is given, key ``'loss'`` (``nn.MSELoss`` between logits and labels).
        """
        z = self.bert(input_ids = input_ids, attention_mask = attention_mask)['last_hidden_state']
        outs = self.fc1(self.pool(z, attention_mask))
        # Identity check: comparing a tensor to None with != is not a reliable presence test.
        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(outs, labels)
            return {'logits' : outs , 'loss' : loss}
        return {'logits' : outs}

In [20]:
# Record the number of regression outputs on the model configuration.
config.num_labels = N_LABELS
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [21]:
class CustomTrainer(Trainer):
    """Trainer whose loss is the MSE between the model's ``logits`` and ``labels``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MSE loss for one batch.

        inputs:
            model : the model being trained / evaluated
            inputs : dict of model inputs; its "labels" entry is the regression target
            return_outputs : when True, also return the raw model outputs
            num_items_in_batch : accepted (and ignored) because newer Trainer
                versions pass it to compute_loss; older versions never supply it

        outputs:
            loss, or (loss, outputs) when return_outputs is True
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # plain, unweighted mean squared error over all targets
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [22]:
def compute_metrics(pred):
    '''
    Compute the mean column-wise RMSE (MCRMSE) of a prediction object.

    inputs:
        pred : object exposing `label_ids` and `predictions` arrays

    outputs:
        dict with the single key "mcrmse"
    '''
    errors = pred.label_ids - pred.predictions
    # RMSE per column (axis 0 = rows), then the average of those RMSEs
    rmse_per_column = np.sqrt(np.mean(np.square(errors), axis = 0))
    return {"mcrmse": np.mean(rmse_per_column, axis = 0)}

In [24]:
# define training args
num_epochs = 4
batch_size = 1
# Number of steps in one pass over 80% of `train` at this batch size.
# logging_steps is a step count, so keep it a positive integer rather than
# the float that `(len(train)*0.8) // batch_size` produces.
logging_steps = max(1, int(len(train) * 0.8) // batch_size)
model_name = 'feedback_base'
training_args = TrainingArguments(output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
                                  load_best_model_at_end = True,
                                  save_strategy='epoch',
                                  per_device_train_batch_size=batch_size,
                                  fp16 = False,learning_rate = 3e-5,
                                  per_device_eval_batch_size=batch_size, evaluation_strategy="epoch",
                                  save_steps=100, weight_decay=0.01, disable_tqdm=False,
                                  logging_steps=logging_steps, push_to_hub=False,report_to='none')

In [25]:
def prepare_test_ds(test_df):
    '''
    Tokenize the test dataframe and expose it in torch format.

    inputs:
        test_df : pd.DataFrame

    outputs (in return order):
        test_ds : Dataset - built directly from test_df, used to extract test ids
        encoded_test : Dataset - tokenized copy with 'full_text' removed, torch format
    '''
    raw_ds = Dataset.from_pandas(test_df)
    model_inputs = raw_ds.map(tokenize_and_process).remove_columns(['full_text'])
    # set_format works in place and returns None, so it is a separate statement
    model_inputs.set_format('torch')
    return raw_ds, model_inputs

In [26]:
# Tokenize the test set. `test_ds` is the untokenized dataset (its text_id column is
# read when the submission is built); `encode_test` is what the trainer predicts on.
test_ds, encode_test = prepare_test_ds(test)

# Build train / validation datasets only because they are handed to the trainer
# constructor in the fold loop; that loop only calls predict().
train_df, val_df = train_test_split(train, test_size = 0.2, random_state = 42)
encoded_train, encoded_val =  prepare_ds(train_df, val_df)

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/3128 [00:00<?, ?ex/s]

  0%|          | 0/783 [00:00<?, ?ex/s]

In [27]:
# Average the test predictions of the N_FOLDS saved checkpoints with equal weight.
final_preds = None
for fold in range(N_FOLDS):

    print(f"====== FOLD RUNNING {fold + 1}======")

    # Checkpoints on disk are numbered from 1.
    model = FeedBackModel.from_pretrained(f'../input/feadbackbase/outputs/bert_model{fold+1}.h5/', config = config).eval().to(device)

    trainer = CustomTrainer(model = model, args=training_args,
                       compute_metrics=compute_metrics,
                       train_dataset=encoded_train,
                       eval_dataset=encoded_val,
                       tokenizer=tokenizer)

    # First element of the predict() result is the array of predictions.
    preds = trainer.predict(encode_test)[0]

    fold_share = preds * (1 / N_FOLDS)
    final_preds = fold_share if final_preds is None else final_preds + fold_share

    # The trainer holds its own reference to the model, so both names must be
    # dropped before the memory can actually be reclaimed for the next fold.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()











In [28]:
# create submission file: the text ids followed by one column per target,
# filled from the matching column of the averaged predictions
sub_dict = {'text_id': test_ds['text_id']}
sub_dict.update({col: final_preds[:, i] for i, col in enumerate(target_cols)})
sub_df = pd.DataFrame(sub_dict)
sub_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.958575,2.667637,3.173141,2.968387,2.64165,2.839522
1,000BAD50D026,2.726518,2.439111,2.913637,2.640897,2.308726,2.733063
2,00367BB2546B,3.617669,3.468456,3.753231,3.684449,3.507948,3.694601


In [29]:
# Write the submission frame to submission.csv, leaving out the dataframe index.
sub_df.to_csv('submission.csv', index = False)