In [1]:
import pandas as pd 
import numpy as np 
import json 
import time
import seaborn as sns 
import math 
from matplotlib import pyplot as plt 
from tokenizers import ByteLevelBPETokenizer
import torch 
from torch import nn
from torch.utils.data import Dataset,DataLoader,TensorDataset
from transformers import RobertaModel,RobertaTokenizer,RobertaConfig
import os 
import random 
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm 
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
import pickle
tqdm.pandas()
import gc 
from tokenizers import ByteLevelBPETokenizer
gc.enable()

In [2]:
class PoolerStartLogits(nn.Module):
    def __init__(self,hidden_size):
        super(PoolerStartLogits,self).__init__()
        self.linear=nn.Linear(hidden_size,1)

    def forward(self,hidden_states):
        return self.linear(hidden_states)

class PoolerEndLogits(nn.Module):
    def __init__(self,hidden_size):
        super(PoolerEndLogits,self).__init__()
        self.sub_model=nn.Sequential(
            nn.Linear(hidden_size*2,hidden_size),
            nn.Tanh(),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size,1)
        )

    def forward(self,hidden_states,start_positions=None,
              start_states=None):
        assert(
            start_positions is not None or state_states is not None
        ), "One of start states or start positions should be not None"
        if start_positions is not None:
          seq_length,hidden_size=hidden_states.size()[-2:]
          start_positions=start_positions[:,None,None].expand(-1,-1,hidden_size)
          start_states=hidden_states.gather(-2,start_positions)
          start_states=start_states.expand(-1,seq_length,-1)
        states=torch.cat((hidden_states,start_states),dim=-1)
        return self.sub_model(states)

class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.config=RobertaConfig.from_pretrained('../input/roberta-base',output_hidden_states=True)
        self.model=RobertaModel.from_pretrained('../input/roberta-base',config=self.config)
        self.hidden_size=self.model.config.hidden_size
        self.start_model=PoolerStartLogits(self.hidden_size)
        self.end_model=PoolerEndLogits(self.hidden_size)

    def forward(self,input_ids,attention_mask,start_positions=None,end_positions=None):
        output=self.model(input_ids=input_ids,attention_mask=attention_mask)
        last_hidden_states=output.last_hidden_state
        start_logits=self.start_model(last_hidden_states)
        end_logits=self.end_model(last_hidden_states,start_positions=start_positions)
        start_logits=start_logits.squeeze(dim=-1)
        end_logits=end_logits.squeeze(dim=-1)
        if start_poistions is None:
          loss_fn=nn.CrossEntropyLoss()
          start_loss=loss_fn(start_logits,start_positions)
          end_loss=loss_fn(end_logits,end_positions)
          loss=1/2*(start_loss+end_loss)
          return loss
        return start_logits,end_logits


In [3]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=Model().to(device)
model_state=torch.load('../input/coleridgepretrained/model_10.pth')
model.load_state_dict(model_state['model'])
model.eval()

Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [4]:
class Config:
  MAX_LENGTH=256
  WINDOWN_SIZE=28
  IS_LOWER=False

In [5]:
import re
def clean_text(text):
  return re.sub('[^A-Za-z0-9]+',' ',text.lower()).strip()

In [6]:
tokenizer=ByteLevelBPETokenizer(
    vocab='../input/roberta-base/vocab.json',
    merges='../input/roberta-base/merges.txt',
    add_prefix_space=False,
    lowercase=False
)

In [7]:
df_test=pd.DataFrame()
name_ids=[]
texts=[]
list_file=os.listdir('../input/coleridgeinitiative-show-us-the-data/test')
for file_name in tqdm(list_file):
  temp=json.load(open(f'../input/coleridgeinitiative-show-us-the-data/test/{file_name}','r'))
  text=""
  for section in temp:
    text+=section['text']+" "
  text=clean_text(text)
  ids=tokenizer.encode(text).ids
  n_samples=math.ceil(len(ids)//(Config.MAX_LENGTH-Config.WINDOWN_SIZE))
  for sample in range(n_samples):
    start=(Config.MAX_LENGTH-Config.WINDOWN_SIZE)*sample 
    end=min(len(ids),(Config.MAX_LENGTH-Config.WINDOWN_SIZE)*(sample+1))
    temp_ids=ids[start:end]
    text=tokenizer.decode(temp_ids)
    texts.append(text)
    name_ids.append(file_name[:-5])
df_test['id']=name_ids
df_test['text']=texts

  0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
vocab=tokenizer.get_vocab()
cls_token_idx=vocab['<s>']
sep_token_idx=vocab['</s>']
pad_token_idx=vocab['<pad>']

In [9]:
def convert_lines(tokenizer,df,max_sequence_length=512,is_test=False):
  count_temp=0
  outputs=np.zeros((len(df),max_sequence_length))
  type_outputs=np.zeros((len(df),max_sequence_length))
  position_outputs=np.zeros((len(df),2))
  offsets_outputs=np.ones((len(df),))
  for idx,row in tqdm(df.iterrows(),total=len(df)):
    token=tokenizer.encode(row.text,add_special_tokens=False)
    input_ids=[cls_token_idx]+token.ids+[sep_token_idx]
    offsets=[(0,0)]+token.offsets+[(0,0)]
    if len(input_ids)>max_sequence_length:
      input_ids=input_ids[:max_sequence_length]
      input_ids[-1]=sep_token_idx
    else:
      input_ids=input_ids+[pad_token_idx]*(max_sequence_length-len(input_ids))
    token_type_id=[0]*len(input_ids)
    outputs[idx,:]=np.array(input_ids)
    type_outputs[idx,:]=np.array(token_type_id)
    if is_test:
      continue
    label=row.label
    if len(label)==0:
      position_outputs[idx,:]=[0,0]
    else:
      start_idx,end_idx=find_index(row.text,label,offsets)
      if start_idx is None:
        start_idx,end_idx=0,0
        count_temp+=1
      position_outputs[idx,:]=[start_idx,end_idx]
  print(f"Number sentence:{count_temp}")
  if is_test:
    return outputs,type_outputs
  else:
    return outputs,type_outputs,position_outputs

In [10]:
input_ids,_=convert_lines(tokenizer,df_test,max_sequence_length=Config.MAX_LENGTH,is_test=True)

  0%|          | 0/248 [00:00<?, ?it/s]

Number sentence:0


In [11]:
test_dataset=TensorDataset(
          torch.tensor(input_ids,dtype=torch.long)
    )
test_loader=DataLoader(test_dataset,batch_size=32,shuffle=False,num_workers=2)

In [12]:
vocab=tokenizer.get_vocab()
pad_token_idx = vocab['<pad>']
cls_token_idx = vocab['<s>']
sep_token_idx = vocab['</s>']

In [13]:
with torch.no_grad():
  labels=[]
  for feature in test_loader:
    ids=feature[0].to(device)
    mask=(ids!=pad_token_idx)
    mask=mask.to(device)
    hidden_states=model.model(ids,mask).last_hidden_state
    start_logits=model.start_model(hidden_states).squeeze(dim=-1)
    start_indexs=torch.argmax(start_logits,dim=-1)
    end_logits=model.end_model(hidden_states,start_indexs).squeeze(dim=-1)
    end_indexs=torch.argmax(end_logits,dim=-1)
    start_indexs=start_indexs.cpu().detach().numpy()
    end_indexs=end_indexs.cpu().detach().numpy()
    ids=ids.cpu().detach().numpy()
    for i,input_id in enumerate(ids):
      if start_indexs[i]!=end_indexs[i]:
        result=tokenizer.decode(input_id[start_indexs[i]:end_indexs[i]+1])
        labels.append(result)
      else:
        labels.append("")


In [14]:
df_test['label']=labels
submissions=dict()
for id in set(df_test.id):
    labels=df_test[df_test.id==id].label.values
    result=[]
    for label in labels:
        temp=clean_text(label)
        if len(temp)>0 and temp not in result:
            result.append(temp)
    result='|'.join(result)
    submissions[id]=result

In [15]:
sub=pd.DataFrame()
sub['Id']=submissions.keys()
sub['PredictionString']=submissions.values()
sub

Unnamed: 0,Id,PredictionString
0,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes
1,2100032a-7c33-4bff-97ef-690822c43466,adni
2,2f392438-e215-4169-bebf-21ac4ff253e1,trends in international mathematics and scienc...
3,3f316b38-1a24-45a9-8d8c-4e05a42257c6,nc sea level rise risk management study|noaa s...


In [16]:
sub.to_csv('submission.csv',index=False)