In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn

import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import random
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold
from torch.utils.data import Dataset,DataLoader
import json
import re
from transformers import *
from tokenizers import *

  '"sox" backend is being deprecated. '


In [2]:
SEED = 42
def seed_everything(seed):
    print(f'setting everything to seed {seed}')
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

setting everything to seed 42


In [3]:
DATA_PATH = "../input/coleridgeinitiative-show-us-the-data/"
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [4]:

import re
import os
import json
import numpy as np


def load_text(id_, root=""):
    with open(os.path.join(root, id_ + ".json")) as f:
        text = json.load(f)
    return text


def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()


def locate_label_string(text, label):
    """
    Finds the label in the text
    """
    len_label = len(label) - 1

    candidates_idx = [i for i, e in enumerate(text) if e == label[1]]
    for idx in candidates_idx:
        if " " + text[idx: idx + len_label] == label:
            idx_start = idx
            idx_end = idx + len_label
            break

    assert (
        text[idx_start:idx_end] == label[1:]
    ), f'"{text[idx_start: idx_end]}" instead of "{label}" in "{text}"'

    char_targets = np.zeros(len(text))
    char_targets[idx_start:idx_end] = 1

    return idx_start, idx_end, char_targets


def locate_label_tokens(offsets, char_targets):
    """
    Finds the tokens corresponding to the found labels
    """
    target_idx = []
    for idx, (offset1, offset2) in enumerate(offsets):
        if sum(char_targets[offset1:offset2]) > 0:
            target_idx.append(idx)

    if not len(target_idx):
        for idx, (offset1, offset2) in enumerate(offsets):
            if sum(char_targets[offset1:offset2]) > 0:
                target_idx.append(idx)

    return target_idx[0], target_idx[-1]

In [5]:
def create_tokenizer_and_tokens(config):
    if "roberta" in config.selected_model:
        raise NotImplementedError
        
    elif "albert" in config.selected_model:
        raise NotImplementedError
        
    else:
        tokenizer = BertWordPieceTokenizer(
            MODEL_PATHS[config.selected_model] + 'vocab.txt',
            lowercase=config.lowercase,
        )

        tokens = {
            'cls': tokenizer.token_to_id('[CLS]'),
            'sep': tokenizer.token_to_id('[SEP]'),
            'pad': tokenizer.token_to_id('[PAD]'),
        }
    
    return tokenizer, tokens

In [6]:
def process_data(
    text,
    label,
    tokenizer,
    tokens,
    max_len=100,
    model_name="bert",
):

    target_start, target_end = 0, 0
    text = " " + " ".join(str(text).split())
    label = " " + " ".join(str(label).split())

    if label != " ":
        idx_start, idx_end, char_targets = locate_label_string(
            text, label
        )

    tokenized = tokenizer.encode(text)
    input_ids_text = tokenized.ids[1:-1]

    offsets = tokenized.offsets[1:-1]

    if label != " ":
        target_start, target_end = locate_label_tokens(offsets, char_targets)

    # 如果target太长，只取后半截
    if target_end >= max_len - 2:  # target is too far in the sentence, we crop its beginning.
        n_tok_to_crop = target_start - max_len // 2
        new_str_start = offsets[n_tok_to_crop][0]

        input_ids_text = input_ids_text[n_tok_to_crop:]

        offsets = [tuple(t) for t in np.array(offsets[n_tok_to_crop:]) - new_str_start]
        text = text[new_str_start:]

        target_start -= n_tok_to_crop
        target_end -= n_tok_to_crop

    input_ids = (
        [tokens["cls"]]
        + input_ids_text[:max_len - 2]
        + [tokens["sep"]]
    )

    if "roberta" in model_name:
        token_type_ids = [0] * len(input_ids)
    else:
        token_type_ids = [1] * len(input_ids)

    text_offsets = [(0, 0)] + offsets[:max_len - 2] + [(0, 0)]

    target_start += 1
    target_end += 1

    # target_end = min(target_end, max_len - 1)

    assert len(input_ids) == len(token_type_ids) and len(input_ids) == len(text_offsets), (len(input_ids), len(text_offsets))
    
    padding_length = max_len - len(input_ids)
    
    if padding_length > 0:
        input_ids = input_ids + ([tokens["pad"]] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        text_offsets = text_offsets + ([(0, 0)] * padding_length)

    return {
        "ids": input_ids,
        "token_type_ids": token_type_ids,
        "targets_start": target_start,
        "targets_end": target_end,
        "text": text,
        "label": label,
        "offsets": text_offsets,
    }

In [7]:
from torch.utils.data import Dataset

class ArticleDataset(Dataset):
    """
    Dataset for inference. 
    """
    def __init__(
        self,
        id_,
        tokenizer,
        max_len=512,
        words_per_split=300,
        margin=10,
        model_name="bert",
        root=""
    ):
        self.tokenizer = tokenizer
        self.tokens = {
            'cls': tokenizer.token_to_id('[CLS]'),
            'sep': tokenizer.token_to_id('[SEP]'),
            'pad': tokenizer.token_to_id('[PAD]'),
        }
        self.max_len = max_len
        self.model_name = model_name
        self.words_per_split = words_per_split
        self.margin = margin

        self.article = load_text(id_, root=root)
        
        self.texts = self.article_to_texts()

    def __len__(self):
        return len(self.texts)
    
    def article_to_texts(self):
        """
        Each article is divided into sections, 
        and then into subsets of self.words_per_split words
        """
        texts = []
        for section in self.article:
            clean_section = clean_text(section['text']).split(' ')[:5000]  # only keep first 5k words
            
            for i in range(len(clean_section) // self.words_per_split + 1):
                start = max(0, self.words_per_split * i - self.margin)
                end = self.words_per_split * (i + 1) + self.margin
                text = " ".join(clean_section[start: end])
                texts.append(text)
            
        return texts

    def __getitem__(self, idx):
        data = process_data(
            self.texts[idx],
            "",
            self.tokenizer,
            self.tokens,
            max_len=self.max_len,
            model_name=self.model_name,
        )

        return {
            "ids": torch.tensor(data["ids"], dtype=torch.long),
            "token_type_ids": torch.tensor(data["token_type_ids"], dtype=torch.long),
            "target_start": torch.tensor(data["targets_start"], dtype=torch.long),
            "target_end": torch.tensor(data["targets_end"], dtype=torch.long),
            "text": data["text"],
            "label": data["label"],
            "offsets": torch.tensor(data["offsets"], dtype=torch.long),
        }

In [8]:
from transformers import BertModel, BertConfig

TRANSFORMERS = {   
    "bert-base-uncased": (BertModel, "bert-base-uncased", BertConfig),
}

MODEL_PATHS = {
    'bert-base-uncased': '../input/bertconfigs/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/',
    'bert-large-uncased-whole-word-masking-finetuned-squad': '../input/bertconfigs/wwm_uncased_L-24_H-1024_A-16/wwm_uncased_L-24_H-1024_A-16/',
    'albert-large-v2': '../input/albert-configs/albert-large-v2/albert-large-v2/',
    'albert-base-v2': '../input/albert-configs/albert-base-v2/albert-base-v2/',
    'distilbert': '../input/albert-configs/distilbert/distilbert/',
}

class ColeridgeModel(nn.Module):
    """
    Simple model for Question Answering
    """
    def __init__(self, model):
        super().__init__()
        self.name = model

        self.pad_idx = 1 if "roberta" in self.name else 0

        model_class, _, config_class = TRANSFORMERS[model]
        
        try:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        except:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'config.json')
        config.output_hidden_states = True

        self.transformer =  model_class(config)

        self.nb_features = self.transformer.pooler.dense.out_features

        self.logits = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features),
            nn.Tanh(),
            nn.Linear(self.nb_features, 2),
        )

    def forward(self, tokens, token_type_ids):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens
            token_type_ids {torch tensor} -- Sentence tokens ids
        """

        hidden_states = self.transformer(
            tokens,
            attention_mask=(tokens != self.pad_idx).long(),
            token_type_ids=token_type_ids,
        )[-1]

        features = hidden_states[-1]
        logits = self.logits(features)

        start_logits, end_logits = logits[:, :, 0], logits[:, :, 1]

        return start_logits, end_logits

In [9]:
model = ColeridgeModel('bert-base-uncased')

In [10]:
class config:
    OVERLAP = 20
    MAX_LEN = 64
    MAX_LEN_INPUT = 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 16
    EPOCHS = 5
    BERT_PATH = "../input/bert-base-uncased/"
    ROBERTA_PATH = "../input/roberta-base"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "../input/coleridgeinitiative-show-us-the-data/train.csv"
    TOKENIZER = BertWordPieceTokenizer(
        f"{BERT_PATH}/vocab.txt", 
        lowercase=True
    )

In [11]:
sample_submission

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,
1,2f392438-e215-4169-bebf-21ac4ff253e1,
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,


In [12]:
# soft = []
for fold in range(1):
    device = torch.device("cuda")
    model = ColeridgeModel('bert-base-uncased')
    model.load_state_dict(torch.load('../input/bert-uncase/model_4_0_0.06981216408579506.bin')['model'])
    model.to(device)
    tokenizer = config.TOKENIZER
    y_final = []
    text_final = []
    for text_id in tqdm(sample_submission['Id']):
        dataset = ArticleDataset(
            text_id,
            tokenizer,
            max_len=512,
            model_name="bert",
            root=paper_test_folder
        )
        
        train_data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=4
        )
    
        print(f"Testing is Starting for fold={fold}")

        model.eval()
        
        with torch.no_grad():
            y_pred = []
            tx = []
            tk0 = tqdm(train_data_loader, total=len(train_data_loader))
            for bi, d in enumerate(tk0):
                orig_tweet = d["text"]
                offsets = d["offsets"]
                ids, token_type_ids = d["ids"], d["token_type_ids"]
                ids = ids.to(device, dtype=torch.long)
                token_type_ids = token_type_ids.to(device)
                
                outputs_start, outputs_end = model(
                    ids,
                    token_type_ids
                )
                
                outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
                outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
                for px, tweet in enumerate(orig_tweet):
                    if outputs_start[px, :].max()<0.5 or outputs_end[px, :].max()<0.5:
                        y_pred.append("")
                        continue
                    tempoff = offsets[px].numpy()
                    idx_start=np.argmax(outputs_start[px, :])
                    idx_end=np.argmax(outputs_end[px, :])
                    if idx_start>idx_end or (idx_end - idx_start)>10:
                        y_pred.append("")
                        continue
                    yp = []
                    for kk in range(idx_start,idx_end+1):
                        yp.append(tweet[tempoff[kk][0]:tempoff[kk][1]])
                    y_pred.append(' '.join(yp))
                tx.extend(list(orig_tweet))
        text_final.append(tx)
        y_final.append(y_pred)

  0%|          | 0/4 [00:00<?, ?it/s]

Testing is Starting for fold=0


  0%|          | 0/2 [00:00<?, ?it/s]

Testing is Starting for fold=0


  0%|          | 0/8 [00:00<?, ?it/s]

Testing is Starting for fold=0


  0%|          | 0/4 [00:00<?, ?it/s]

Testing is Starting for fold=0


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# 去掉空格 找到原文对应的
finalpred = []
for k in range(len(sample_submission)):
    fs = []
    temp_text = text_final[k]
    temp_p = y_final[k]
    for i,tx in enumerate(temp_text):
        tempp = temp_p[i]
        temp = tempp.replace(" ","")
        if temp=="":
            fs.append("")
            continue
        temptext = tx.lower()
        text = temptext.replace(" ","")
        start = 0
        end = 0
        for ind in (i for i,e in enumerate(text) if e==temp[0]):
            if text[ind:ind+len(temp)] == temp:
                start = ind
                end = ind + len(temp)-1
                break
        # 还原
        finaltext = ""
        count = 0
        for a in temptext:
            if count>=start:
                finaltext += a
            if count>end:
                break
            if a!=" ":
                count+=1
                
        fs.append(finaltext.strip())
#         finalpred.append(finaltext.strip())
    finalpred.append(np.array(fs))

In [14]:
finalpred = np.array(finalpred)

  """Entry point for launching an IPython kernel.


In [15]:
## Literal Matching
## 训练集里面一共180个数据集标签
papers = {}
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

all_labels = set()
train = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
for label_1, label_2, label_3 in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    all_labels.add(str(label_1).lower())
    all_labels.add(str(label_2).lower())
    all_labels.add(str(label_3).lower())
    
print(f'No. different labels: {len(all_labels)}')

def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
def totally_clean_text(txt):
    txt = clean_text(txt)
    txt = re.sub(' +', ' ', txt)
    return txt

# 字符串匹配

# literal_preds = []

for i,paper_id in enumerate(tqdm(sample_submission['Id'])):
#     paper = papers[paper_id]
#     text_1 = '. '.join(section['text'] for section in paper).lower()
#     text_2 = totally_clean_text(text_1)
    
#     labels = set()
#     finalset = set()
#     for label in all_labels:
#         if label in text_1 or label in text_2:
#             labels.add(clean_text(label))
#     if len(labels)!=0:
#         final = '|'.join(labels)
# #         finalset = labels
#     else:
    tempfi = finalpred[i][finalpred[i]!=""]
#         tempfi = list(tempfi) + list(labels) 
    final = '|'.join(set(tempfi))
#         tempfi = set(tempfi)
#         finalset = tempfi
    
#     tempset = finalset.copy()
#     pred_model_kept = []
#     for pred_m in tempfi:
#         kept = True
#         for pred_n in labels:
#             if pred_m in pred_n or pred_n in pred_m:
#                 kept = False
#         if kept:
#             pred_model_kept.append(pred_m)
#         else:
#             pass
#     for item in finalset:
#         for it in finalset:
#             if item!=it and it in item:
#                 if it in tempset:
#                    tempset.remove(it)

#     finalset = list(labels) + list(tempfi)
#     tempset = set(finalset)
#     final = '|'.join(tempset)
    sample_submission.loc[i,'PredictionString'] = final

No. different labels: 180


  0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
sample_submission.PredictionString.values

array(['birth cohort study',
       'trends in international mathematics and science study|common core of data|international standard classification of education',
       'slosh display|coastal erosion study|digital shoreline analysis system|north carolina floodplain mapping program|nc floodplain mapping program|slosh inundation|noaa storm surge inundation|noaa slr|slosh grid|hurricane preparedness and safety|coastal salinity database|coastal observation station|nc sea level rise risk management study|national geodetic survey',
       'rural urban continuum codes|rural urban residency'], dtype=object)

In [17]:
sample_submission.to_csv('submission.csv',index=False)

In [18]:
sample_submission.PredictionString.values

array(['birth cohort study',
       'trends in international mathematics and science study|common core of data|international standard classification of education',
       'slosh display|coastal erosion study|digital shoreline analysis system|north carolina floodplain mapping program|nc floodplain mapping program|slosh inundation|noaa storm surge inundation|noaa slr|slosh grid|hurricane preparedness and safety|coastal salinity database|coastal observation station|nc sea level rise risk management study|national geodetic survey',
       'rural urban continuum codes|rural urban residency'], dtype=object)