In [None]:
# Print NVIDIA GPU status for the current runtime.
!nvidia-smi

In [None]:
# Colab-only: mount Google Drive at /content/drive (checkpoints are later
# written below that mount point).
from google.colab import drive 
drive.mount('/content/drive')

In [None]:
!pip install transformers

In [4]:
import os 
import numpy as np 
import pandas as pd 
import torch 
from torch import nn
from transformers import RobertaModel,RobertaTokenizer,get_linear_schedule_with_warmup,AdamW,AutoConfig,get_cosine_schedule_with_warmup
from sklearn.model_selection import KFold
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm 
import time 
from pathlib import Path
import random
import gc
from sklearn.metrics import mean_squared_error
import seaborn as sns
from matplotlib import pyplot as plt
gc.enable()

In [None]:
os.environ['KAGGLE_USERNAME']="daominhkhanh"
os.environ['KAGGLE_KEY']="a6b5596f1c2c9adbeda916b0ec27faba"
!kaggle competitions download -c commonlitreadabilityprize

In [None]:
# Unpack train.csv.zip into the working directory.
!unzip train.csv.zip

# Constants

In [7]:
# Global configuration shared by the data loaders and the training loop.
K_FOLDS=5  # number of KFold splits
BATCH_SIZE=8  # batch size of both the training and the validation DataLoader
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, else CPU
SEED=1000  # random_state of the KFold splitter
MAX_LENGTH=300  # NOTE(review): not referenced anywhere in the visible code

# Data

In [8]:
# Load the competition tables from the working directory.
# NOTE(review): only train.csv.zip is unzipped above; test.csv and
# sample_submission.csv are assumed to already be present - confirm.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

In [9]:
# Number of whitespace-separated words in every excerpt.
train['length']=train['excerpt'].astype(str).str.split().str.len()
test['length']=test['excerpt'].astype(str).str.split().str.len()

In [None]:
# Summary statistics of the numeric columns of the training table.
train.describe()

In [None]:
# Distribution of excerpt word counts. The Series is passed by keyword because
# recent seaborn releases no longer accept the plotted variable positionally;
# the trailing ';' hides the Axes repr.
sns.countplot(x=train.length);

In [None]:
# Rough sentence count: number of '.'-separated pieces, i.e. (number of dots) + 1.
train['number_sentence']=train['excerpt'].astype(str).str.count(r'\.')+1
test['number_sentence']=test['excerpt'].astype(str).str.count(r'\.')+1

In [None]:
# Distribution of the rough sentence counts. The Series is passed by keyword
# because recent seaborn releases no longer accept the plotted variable
# positionally; the trailing ';' hides the Axes repr.
sns.countplot(x=train.number_sentence);

In [14]:
# Drop the columns that are not used for modelling. Re-assigning (instead of
# inplace=True) together with errors='ignore' keeps the cell idempotent, so
# re-running it does not raise KeyError for columns that are already gone.
train=train.drop(columns=['url_legal','license','standard_error'],errors='ignore')
test=test.drop(columns=['url_legal','license'],errors='ignore')

In [None]:
# One tokenizer per checkpoint; get_loader picks between them via `is_large`.
tokenizer_base=RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_large=RobertaTokenizer.from_pretrained('roberta-large')

In [16]:
# Length of each excerpt's roberta-base encoding (number of ids returned by encode()).
train['length_encode']=train.excerpt.astype(str).apply(lambda x: len(tokenizer_base.encode(x)))

In [17]:
# Summary statistics again, now including the engineered length columns.
train.describe()

Unnamed: 0,target,length,number_sentence,length_encode
count,2834.0,2834.0,2834.0,2834.0
mean,-0.959319,172.982004,10.034933,217.831334
std,1.033579,16.97439,3.977946,24.947523
min,-3.676268,135.0,3.0,157.0
25%,-1.69032,159.0,7.0,200.0
50%,-0.91219,175.0,9.0,217.5
75%,-0.20254,188.0,12.0,235.0
max,1.71139,205.0,35.0,322.0


In [None]:
# Take the excerpt at row position 10 as a sample for the tokenization check below.
temp=train.excerpt.values.tolist()[10]
print(temp)

In [None]:
# Encode the sample excerpt without padding, as PyTorch tensors.
token=tokenizer_base.encode_plus(temp,padding=False,return_attention_mask=True,return_tensors='pt')

In [None]:
# Show the tensor shapes of the encoded sample (ids and attention mask).
print(token.input_ids.size())
print(token.attention_mask.size())

# DataLoader

In [21]:
def convert_features(excerpt,tokenizer):
    """Tokenize one excerpt and return its ids and attention mask as 1-D tensors.

    Args:
        excerpt: text to encode.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).

    Returns:
        ``(input_ids, attention_mask)``, each flattened to one dimension.
        No padding is applied here; batches are padded by the collate function.
    """
    encoded=tokenizer.encode_plus(
        excerpt,
        padding=False,
        return_attention_mask=True,
        return_tensors='pt',
    )
    ids=encoded['input_ids'].flatten()
    mask=encoded['attention_mask'].flatten()
    return ids,mask

class ColeridgeInitiative(Dataset):
    """Map-style dataset over a DataFrame of excerpts.

    The frame must have an ``excerpt`` column and, unless ``is_test`` is set,
    a ``target`` column. (The class name is kept unchanged so existing callers
    keep working.)

    Args:
        data: DataFrame holding the rows to serve.
        tokenizer: tokenizer forwarded to ``convert_features``.
        is_test: when truthy, items carry no target.

    Items:
        ``(input_ids, attention_mask, target)`` for training data,
        ``(input_ids, attention_mask)`` for test data.
    """
    def __init__(self,data,tokenizer,is_test=False):
        self.tokenizer=tokenizer
        self.data=data
        self.is_test=bool(is_test)
        # Newlines are normalised once, and identically for training and test
        # rows, so inference sees text pre-processed exactly like training text
        # (previously only the training branch replaced "\n").
        self.excerpts=[text.replace("\n"," ") for text in self.data.excerpt.values.tolist()]
        if not self.is_test:
            self.targets=self.data.target.values.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        input_ids,attention_mask=convert_features(self.excerpts[idx],self.tokenizer)
        if self.is_test:
            return input_ids,attention_mask
        return input_ids,attention_mask,self.targets[idx]

class MyCollate:
  """Collate function that right-pads variable-length samples into a batch.

  Args:
    pad_idx_input: value used to pad ``input_ids``.
    pad_idx_attention: value used to pad the attention masks.
    is_test: when truthy, samples are ``(input_ids, attention_mask)`` pairs
      and no target tensor is returned.
  """
  def __init__(self,pad_idx_input=1,pad_idx_attention=0,is_test=False):
    self.is_test=is_test
    self.pad_idx_input=pad_idx_input
    self.pad_idx_attention=pad_idx_attention

  def __call__(self,batch):
    ids_per_sample,masks_per_sample=[],[]
    for sample in batch:
      ids_per_sample.append(sample[0])
      masks_per_sample.append(sample[1])
    # Pad every sequence up to the longest one in this batch.
    padded_ids=pad_sequence(ids_per_sample,batch_first=True,padding_value=self.pad_idx_input)
    padded_masks=pad_sequence(masks_per_sample,batch_first=True,padding_value=self.pad_idx_attention)
    if not self.is_test:
      labels=torch.tensor([sample[2] for sample in batch],dtype=torch.float32)
      return padded_ids,padded_masks,labels
    return padded_ids,padded_masks


In [22]:
class Model(nn.Module):
  """RoBERTa encoder + 2-layer bidirectional LSTM + linear regression head.

  The LSTM output is mean-pooled and max-pooled over the sequence; both pooled
  vectors are concatenated and mapped to a single value per sample.

  Args:
    is_large: truthy -> 'roberta-large', otherwise 'roberta-base'.
    hidden_size: hidden size of each LSTM direction.
    output_hidden_states: forwarded to ``RobertaModel.from_pretrained``.
    multisample_dropout: accepted for backward compatibility; not used.
  """
  def __init__(self,is_large=False,
               hidden_size=256,
               output_hidden_states=True,
               multisample_dropout=False
               ):
    super(Model,self).__init__()
    checkpoint='roberta-large' if is_large else 'roberta-base'
    self.bert=RobertaModel.from_pretrained(checkpoint,
                                           output_hidden_states=output_hidden_states
                                           )
    self.hidden_size=hidden_size
    self.lstm=nn.LSTM(input_size=self.bert.config.hidden_size,
                     hidden_size=self.hidden_size,
                     num_layers=2,
                     batch_first=True,
                     bidirectional=True
                    )
    # (mean-pool + max-pool) x (2 directions x hidden_size) = 4*hidden_size features.
    self.linear=nn.Linear(4*self.hidden_size,1)

  def forward(self,input_ids,attention_masks,token_type_ids=None):
    """Return one prediction per sample, shape ``(batch, 1)``.

    Args:
      input_ids: ``(batch, seq)`` token ids.
      attention_masks: ``(batch, seq)`` mask with 1 on real tokens and 0 on
        padding, or ``None``. Padding is assumed to be on the right, which is
        what ``pad_sequence`` produces.
      token_type_ids: ignored; kept so existing call sites stay valid.
    """
    robeta_output=self.bert(input_ids,attention_mask=attention_masks)
    last_hidden_state=robeta_output.last_hidden_state
    if attention_masks is None:
      # No mask available: pool over every position.
      lstm_out,_=self.lstm(last_hidden_state)
      lstm_mean=torch.mean(lstm_out,dim=1)
      lstm_max,_=torch.max(lstm_out,dim=1)
    else:
      # Run the LSTM on the real tokens only; otherwise the backward direction
      # starts from padding and a sample's output depends on how much padding
      # its batch happened to add.
      lengths=attention_masks.sum(dim=1)
      packed=nn.utils.rnn.pack_padded_sequence(
          last_hidden_state,lengths.cpu(),batch_first=True,enforce_sorted=False)
      packed_out,_=self.lstm(packed)
      lstm_out,_=nn.utils.rnn.pad_packed_sequence(
          packed_out,batch_first=True,total_length=last_hidden_state.size(1))
      # pad_packed_sequence zero-fills padded steps, so a plain sum divided by
      # the true length is the masked mean.
      lstm_mean=lstm_out.sum(dim=1)/lengths.unsqueeze(-1).to(lstm_out.dtype)
      # Padded steps must never win the max, even when all real values are negative.
      real_token=attention_masks.unsqueeze(-1).bool()
      lstm_max,_=lstm_out.masked_fill(~real_token,torch.finfo(lstm_out.dtype).min).max(dim=1)
    out=torch.cat((lstm_mean,lstm_max),dim=1)
    return self.linear(out)

In [33]:
def evaluate(model,data_loader,loss_fn):
  """Return the RMSE of `model` over all samples served by `data_loader`.

  The previous implementation averaged per-batch RMSE values, which is not the
  dataset RMSE and changes with batch composition. Here the squared error is
  accumulated per sample and the root is taken once at the end.

  Args:
    model: module called as ``model(input_ids, attention_mask)``.
    data_loader: iterable of ``(input_ids, attention_mask, targets)`` batches.
    loss_fn: mean-reduced squared-error loss (e.g. ``nn.MSELoss()``); the
      per-batch mean is re-weighted by the batch size.

  Returns:
    The RMSE as a Python float, or NaN when the loader yields no samples.
  """
  model.eval()
  squared_error_sum=0.0
  n_samples=0
  with torch.no_grad():
    for features in data_loader:
      input_ids,attention_mask,targets=features[0].to(DEVICE),features[1].to(DEVICE),features[2].to(DEVICE)
      output=model(input_ids,attention_mask)
      batch_size=targets.size(0)
      # mean squared error of the batch * batch size = summed squared error
      squared_error_sum+=loss_fn(output.flatten(),targets).item()*batch_size
      n_samples+=batch_size
  if n_samples==0:
    return float('nan')
  return (squared_error_sum/n_samples)**0.5

In [34]:
def predict(model,data_loader):
  """Run `model` over `data_loader` and return all predictions as one array.

  Only the first two entries of every batch (input ids and attention mask) are
  read, so the function works for test batches that carry no targets. The
  earlier version indexed a third entry and passed an undefined
  ``token_type_ids`` name, which made it fail on first use.

  Args:
    model: module called as ``model(input_ids, attention_mask)``.
    data_loader: iterable of batches whose first two items are the input ids
      and the attention mask.

  Returns:
    ``numpy.ndarray`` with the per-batch model outputs concatenated along axis 0.
  """
  model.eval()
  preds=[]
  with torch.no_grad():
    for features in data_loader:
      input_ids,attention_mask=features[0].to(DEVICE),features[1].to(DEVICE)
      outputs=model(input_ids,attention_mask)
      preds.append(outputs.cpu().numpy())
  return np.concatenate(preds)

In [35]:
def _snapshot_state(model):
    """Copy the model's state dict to CPU so later training steps cannot alter it."""
    return {name:tensor.detach().cpu().clone() for name,tensor in model.state_dict().items()}


def train_model(model,optimizer,scheduler,loss_fn,train_loader,val_loader,epochs):
    """Train `model` for `epochs` epochs and return the best weights seen.

    After every epoch the model is scored with ``evaluate`` on `val_loader`;
    the weights of the epoch with the lowest validation score are returned.

    Fixes over the previous version:
      * the best validation score is tracked across epochs (it used to be
        reset to None at the start of every epoch and was never updated);
      * the kept weights are a detached copy - ``state_dict()`` returns
        references to the live parameters, so the "best" weights used to be
        silently overwritten by later optimizer steps;
      * with ``epochs == 0`` the current weights are returned instead of
        raising on an unassigned variable.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, or a falsy value.
        loss_fn: training loss applied to flattened outputs and targets.
        train_loader, val_loader: iterables of
            ``(input_ids, attention_mask, targets)`` batches.
        epochs: number of passes over `train_loader`.

    Returns:
        State dict (tensors on CPU) of the best epoch.
    """
    best_mse_val=None
    model_state_dict=None
    for epoch in range(epochs):
        model.train()
        mse_train=0
        start_time=time.time()
        for features in train_loader:
            input_ids,attention_mask,targets=features[0].to(DEVICE),features[1].to(DEVICE),features[2].to(DEVICE)
            outputs=model(input_ids,attention_mask)
            loss=loss_fn(outputs.flatten(),targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            mse_train+=loss.item()
            if scheduler:
                scheduler.step()
        val_mse=evaluate(model,val_loader,loss_fn)
        # "Train" is the mean training loss per batch; "Val" is whatever evaluate() returns.
        print(f"Epoch:{epoch}---Train:{mse_train/len(train_loader)}---Val:{val_mse}---Time:{time.time()-start_time}")
        if best_mse_val is None or val_mse<best_mse_val:
            best_mse_val=val_mse
            model_state_dict=_snapshot_state(model)
    if model_state_dict is None:
        # No epoch ran: hand back the current weights.
        model_state_dict=_snapshot_state(model)
    return model_state_dict



In [36]:
# def set_random_seed(random_seed):
#     random.seed(random_seed)
#     np.random.seed(random_seed)
#     os.environ["PYTHONHASHSEED"] = str(random_seed)
#     torch.manual_seed(random_seed)
#     torch.cuda.manual_seed(random_seed)
#     torch.cuda.manual_seed_all(random_seed)

#     torch.backends.cudnn.deterministic = True

In [37]:
def get_loader(train_idxs,val_idxs,is_large):
  """Build the training and validation DataLoaders for one fold.

  Args:
    train_idxs: positional row indices of the training rows (as produced by
      ``KFold.split``).
    val_idxs: positional row indices of the validation rows.
    is_large: truthy -> use the roberta-large tokenizer, else roberta-base.

  Returns:
    ``(train_loader, val_loader)``.
  """
  tokenizer=tokenizer_large if is_large else tokenizer_base
  # KFold yields positions, not index labels: select with .iloc so the split
  # stays correct even when `train` does not have a default RangeIndex.
  df_train=train.iloc[train_idxs].reset_index(drop=True)
  df_val=train.iloc[val_idxs].reset_index(drop=True)
  train_dataset=ColeridgeInitiative(df_train,tokenizer)
  val_dataset=ColeridgeInitiative(df_val,tokenizer)

  train_loader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=2,collate_fn=MyCollate())
  # Validation needs no shuffling; a fixed order keeps evaluation reproducible.
  val_loader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False,num_workers=2,collate_fn=MyCollate())
  return train_loader,val_loader

In [38]:
def get_model(is_large,len_train_loader,epochs,lr,need_scheduler=True,num_warmup_steps=50,weight_decay=1e-2):
  """Create the model together with its optimizer, LR scheduler and loss.

  Args:
    is_large: forwarded to ``Model`` to select the checkpoint.
    len_train_loader: number of batches per epoch.
    epochs: number of epochs; with `len_train_loader` it gives the total
      number of scheduler steps.
    lr: peak learning rate.
    need_scheduler: when falsy no scheduler is created and ``None`` is returned
      in its place.
    num_warmup_steps: linear warm-up steps of the scheduler (previously a
      hard-coded 50).
    weight_decay: AdamW weight decay (previously a hard-coded 1e-2).

  Returns:
    ``(model, optimizer, scheduler, loss_fn)``; the model is moved to DEVICE.
  """
  model=Model(is_large=is_large).to(DEVICE)
  optimizer=AdamW(model.parameters(), lr=lr, betas=(0.9, 0.99), weight_decay=weight_decay)
  scheduler=None
  if need_scheduler:
    # The scheduler is stepped once per batch, hence batches * epochs total steps.
    scheduler=get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=len_train_loader*epochs,
        num_warmup_steps=num_warmup_steps
    )
  loss_fn=nn.MSELoss(reduction='mean')
  return model,optimizer,scheduler,loss_fn

In [39]:
def save_checkpoint(model_state_dict,path,model_name,base_dir='/content/drive/MyDrive/Kaggle/Model/'):
    """Write a state dict to ``<base_dir>/<path>/<model_name>``.

    Args:
        model_state_dict: object handed to ``torch.save``.
        path: sub-directory (relative to `base_dir`) to store the file in; it
            is created, including missing parents, when absent.
        model_name: file name of the checkpoint.
        base_dir: root directory for checkpoints. Defaults to the Google Drive
            folder this notebook has always used.
    """
    target_dir=Path(base_dir)/path
    # exist_ok makes a separate existence check unnecessary.
    target_dir.mkdir(parents=True,exist_ok=True)
    torch.save(model_state_dict,str(target_dir/model_name))
    print("Save model done")

In [40]:
# Per-fold learning rates: train_k_folds trains fold i with lrs[i].
# NOTE(review): every fold gets a different learning rate, so fold scores are
# not directly comparable - confirm this is intended.
lrs=[1e-5,2e-5,1.5e-5,1e-4,2e-4]

In [41]:
def train_k_folds(is_large=False,epochs=10,lr=None):
  """Train one model per KFold split of `train` and save each checkpoint.

  Args:
    is_large: truthy -> roberta-large, otherwise roberta-base.
    epochs: number of epochs per fold.
    lr: learning rate used for every fold. When ``None`` (default) fold ``i``
      uses ``lrs[i]``, as before.

  Raises:
    ValueError: if `lr` is not given and `lrs` has fewer entries than K_FOLDS.

  Checkpoints are named ``model<name>_<epochs>_fold<fold>.pth``. The two format
  arguments used to be swapped, so the number after "fold" was the epoch count.
  """
  if lr is None and len(lrs)<K_FOLDS:
    raise ValueError(f'lrs has {len(lrs)} entries but K_FOLDS is {K_FOLDS}')
  model_name='large' if is_large else 'base'
  kfold=KFold(n_splits=K_FOLDS,random_state=SEED,shuffle=True)
  for fold,(train_idxs,val_idxs) in enumerate(kfold.split(train)):
    print('--'*10,f'TRAIN FOLD {fold}','--'*10)
    train_loader,val_loader=get_loader(train_idxs,val_idxs,is_large)
    fold_lr=lrs[fold] if lr is None else lr
    model,optimizer,scheduler,loss_fn=get_model(is_large,len(train_loader),epochs,fold_lr)
    model_state_dict=train_model(model,optimizer,scheduler,loss_fn,train_loader,val_loader,epochs)
    save_checkpoint(model_state_dict,'CommonLit'+model_name,'model'+model_name+'_{}_fold{}.pth'.format(epochs,fold))
    # Release everything that holds model memory before the next fold starts.
    del model,optimizer,scheduler,loss_fn,train_loader,val_loader,model_state_dict
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Train and save one roberta-base model per fold (default number of epochs).
train_k_folds(is_large=False)

In [None]:
# Train and save one roberta-large model per fold (default number of epochs).
train_k_folds(is_large=True)