In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn

import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import random
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold
from torch.utils.data import Dataset,DataLoader
import json
import re

In [2]:
SEED = 42
def seed_everything(seed):
    print(f'setting everything to seed {seed}')
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

setting everything to seed 42


In [3]:
train = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [4]:
class config:
    OVERLAP = 20
    MAX_LEN = 64
    MAX_LEN_INPUT = 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 16
    EPOCHS = 5
    BERT_PATH = "../input/bert-base-uncased/"
    ROBERTA_PATH = "../input/roberta-base"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "../input/coleridgeinitiative-show-us-the-data/train.csv"
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        f"{BERT_PATH}/vocab.txt", 
        lowercase=True
    )

In [5]:
def jaccard_similarity(s1, s2):
    l1 = s1.split(" ")
    l2 = s2.split(" ")    
    intersection = len(list(set(l1).intersection(l2)))
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union


connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Extract masking candidates for Masked Dataset Modeling from a given $sentence.
    A candidate should be a continuous sequence of at least 2 words, 
    each of these words either has the first letter in uppercase or is one of
    the connection words ($connection_tokens). Furthermore, the connection 
    tokens are not allowed to appear at the beginning and the end of the
    sequence.
    """
    def candidate_qualified(words):
        while len(words) and words[0].lower() in connection_tokens:
            words = words[1:]
        while len(words) and words[-1].lower() in connection_tokens:
            words = words[:-1]
        
        return len(words) >= 2
    
    candidates = []
    
    phrase_start, phrase_end = -1, -1
    for id in range(1, len(sentence)):
        word = sentence[id]
        if word[0].isupper() or word in connection_tokens:
            if phrase_start == -1:
                phrase_start = phrase_end = id
            else:
                phrase_end = id
        else:
            if phrase_start != -1:
                if candidate_qualified(sentence[phrase_start:phrase_end+1]):
                    candidates.append((phrase_start, phrase_end))
                phrase_start = phrase_end = -1
    
    if phrase_start != -1:
        if candidate_qualified(sentence[phrase_start:phrase_end+1]):
            candidates.append((phrase_start, phrase_end))
    
    return candidates

In [6]:
def clean_paper_sentence(s):
    """
    This function is essentially clean_text without lowercasing.
    """
    s = re.sub('[^A-Za-z0-9]+', ' ', str(s)).strip()
    s = re.sub(' +', ' ', s)
    return s

def shorten_sentences(sentences):
    """
    Sentences that have more than MAX_LENGTH words will be split
    into multiple sentences with overlappings.
    """
    MAX_LENGTH = config.MAX_LEN
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - config.OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

# 制作训练集
# Index(['Id', 'pub_title', 'dataset_title', 'dataset_label', 'cleaned_label'], dtype='object')

all_train_data = []
all_index = []
all_dataset = []
for paper_id in tqdm(sample_submission['Id'].unique()):
    # load paper
    temps = train[train['Id']==paper_id]
    
    paper = json.load(open(f'{paper_test_folder}/{paper_id}.json'))
    
    # extract sentences
    sentences = set([clean_paper_sentence(sentence) for section in paper 
                     for sentence in section['text'].split('.')
                    ])
    
    # 缩短句子
    
    sentences = shorten_sentences(sentences) # make sentences short
    
    # 筛选句子
    # sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
    
    # 选出带有data，study的句子
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['sequences','catalog','database','survey','initiative','study','practices','census','program','science','hub','collection'
'research','model','development'])]
    # sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in all_labels)]

    for sentence in sentences:
        all_train_data.append(sentence)
        all_index.append(paper_id)
#     # 分词
#     sentences = [sentence.split() for sentence in sentences] # sentence = list of words
    
    
#     # mask
#     test_data = []
#     for sentence in sentences:
#         for phrase_start, phrase_end in find_mask_candidates(sentence):
#             dt_point = sentence[:phrase_start] + [mask] + sentence[phrase_end+1:]
#             test_data.append((' '.join(dt_point), ' '.join(sentence[phrase_start:phrase_end+1]))) # (masked text, phrase)
    
#     all_test_data.append(test_data)

  0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
train_data = pd.DataFrame(columns=['Id', 'sentence'])
train_data['Id'] = all_index
train_data['sentence'] = all_train_data

In [8]:
tokenizer = config.TOKENIZER
tw =train_data.sentence.values[0]
# tweet_offsets = s.offsets[1:-1]

In [9]:
def process_data(tweet, tokenizer, max_len):
    # sentiment -> title
    tweet = tweet.lower()
    # idxs0,idx1 = None,None
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids[1:-1]
    tweet_offsets = tok_tweet.offsets[1:-1]
    
    input_ids = [101]  + input_ids_orig + [102]

    mask = [1] * len(input_ids)
    
    tweet_offsets = [(0, 0)] * 1 + tweet_offsets + [(0, 0)]

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    if len(input_ids) > max_len:
        input_ids = input_ids[:max_len]
        mask = mask[:max_len]
        tweet_offsets = tweet_offsets[:max_len]
    
    return {
        'ids': input_ids,
        'mask': mask,
#         'token_type_ids': token_type_ids, 
        'orig_tweet': tweet,
#         'orig_selected': selected_text,
        'offsets': tweet_offsets
    }

In [10]:
class ColeridgeDataset:
    def __init__(self, tweet):
        self.tweet = tweet
#         self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN_INPUT
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item], 
#             self.selected_text[item], 
            self.tokenizer,
            self.max_len
        )
        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
#             'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
#             'orig_selected': data["orig_selected"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

In [11]:
class ColeridgeModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(ColeridgeModel, self).__init__(conf)
#         self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids=None):
        out = self.bert(
            ids,
            attention_mask=mask,
#             token_type_ids=token_type_ids
        )
        out = out[2]
        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [12]:
train_dataset = ColeridgeDataset(
    tweet=train_data.sentence.values
    #     selected_text=df_train.selected_text.values
)

In [13]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss

In [14]:
def compute_fbeta(y_true,
                  y_pred,
                  beta: float = 0.5) -> float:
    """Compute the Jaccard-based micro FBeta score.

    References
    ----------
    - https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation
    """

    def _jaccard_similarity(str1: str, str2: str) -> float:
        a = set(str1.split()) 
        b = set(str2.split())
        c = a.intersection(b)
        if (len(a) + len(b) - len(c))<=0:
            return 0
        return float(len(c)) / (len(a) + len(b) - len(c))

    tp = 0  # true positive
    fp = 0  # false positive
    fn = 0  # false negative
    for ground_truth_list, predicted_string_list in zip(y_true, y_pred):
        predicted_string_list_sorted = sorted(predicted_string_list)
        for ground_truth in sorted(ground_truth_list):            
            if len(predicted_string_list_sorted) == 0:
                fn += 1
            else:
                similarity_scores = [
                    _jaccard_similarity(ground_truth, predicted_string)
                    for predicted_string in predicted_string_list_sorted
                ]
                matched_idx = np.argmax(similarity_scores)
                if similarity_scores[matched_idx] >= 0.5:
                    predicted_string_list_sorted.pop(matched_idx)
                    tp += 1
                else:
                    fn += 1
        fp += len(predicted_string_list_sorted)

    tp *= (1 + beta ** 2)
    fn *= beta ** 2
    fbeta_score = tp / (tp + fp + fn)
    return fbeta_score

In [15]:
from sklearn import metrics

def get_score(y_true, y_pred):
    scores = []
    acc = []
    for i in range(y_true.shape[1]):
        score = metrics.average_precision_score(y_true[:,i], y_pred[:,i])
        scores.append(score)
        yac = y_pred[:,i]>0.5
        acc.append(np.sum(yac==y_true[:,i])/y_true.shape[0])
    avg_score = np.mean(scores)
#     lb = y_true.argmax(axis=1)
#     prd = y_pred.argmax(axis=1)
#     acc = np.sum(lb==prd)/y_true.shape[0]
    scores.append(np.mean(acc))
    scores.extend(acc)
    return avg_score, scores

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
               
def HPA_auc(y_true, y_valid):
    return get_score(y_true,y_valid)
        
class HPAAucMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.y_true = None
        self.y_pred = None
        self.score = 0
        self.allscore = []
    def update(self, y_true, y_pred):
        y_true = y_true.cpu().numpy().astype(int)
        y_pred = y_pred.sigmoid().data.cpu().numpy()
        if self.y_true is None:
            self.y_true = y_true
            self.y_pred = y_pred
        else:
            self.y_true = np.concatenate((self.y_true, y_true))
            self.y_pred = np.concatenate((self.y_pred, y_pred))
        try:
            self.score,self.allscore = HPA_auc(self.y_true, self.y_pred)
        except:
            pass
    @property
    def avg(self):
        print(self.allscore[:19])
        print('acc---------------')
        print(self.allscore[20:])
        return self.score
    @property
    def alls(self):
        return self.allscore

In [16]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = AverageMeter()
    jaccards = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"]

        ids = ids.to(device, dtype=torch.long)
#         token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        scheduler.step()

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
#         jaccard_scores = []
        
        y_true = []
        y_pred = []
        for px, tweet in enumerate(orig_tweet):
            tempoff = offsets[px].numpy()
            idx_start=np.argmax(outputs_start[px, :])
            idx_end=np.argmax(outputs_end[px, :])
            yt = []
            yp = []
            
            for kk in range(idx_start,idx_end+1):
#                 print(tempoff[kk])
                yp.append(tweet[tempoff[kk][0]:tempoff[kk][1]])
            for kk in range(targets_start[px],targets_end[px]):
#                 print(tempoff[kk])
                yt.append(tweet[tempoff[kk][0]: tempoff[kk][1]])
            y_pred.append(' '.join(yp))
            y_true.append(' '.join(yt))
        # y_true
        # y_pred
#         for px, tweet in enumerate(orig_tweet):
#             selected_tweet = orig_selected[px]
#             tweet_sentiment = sentiment[px]
#             jaccard_score, _ = calculate_jaccard_score(
#                 original_tweet=tweet,
#                 target_string=selected_tweet,
#                 sentiment_val=tweet_sentiment,
#                 idx_start=np.argmax(outputs_start[px, :]),
#                 idx_end=np.argmax(outputs_end[px, :]),
#                 offsets=offsets[px]
#             )
#             jaccard_scores.append(jaccard_score)

        jaccards.update(compute_fbeta(y_true,y_pred), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg,jaccards=jaccards.avg)

In [17]:
def eval_fn(data_loader, model, device):
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            mask = d["mask"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"]

            ids = ids.to(device, dtype=torch.long)
    #         token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)
    
            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
#                 token_type_ids=token_type_ids
            )
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
#             jaccard_scores = []
#             for px, tweet in enumerate(orig_tweet):
#                 selected_tweet = orig_selected[px]
#                 tweet_sentiment = sentiment[px]
#                 jaccard_score, _ = calculate_jaccard_score(
#                     original_tweet=tweet,
#                     target_string=selected_tweet,
#                     sentiment_val=tweet_sentiment,
#                     idx_start=np.argmax(outputs_start[px, :]),
#                     idx_end=np.argmax(outputs_end[px, :]),
#                     offsets=offsets[px]
#                 )
#                 jaccard_scores.append(jaccard_score)

            y_true = []
            y_pred = []
            for px, tweet in enumerate(orig_tweet):
                tempoff = offsets[px].numpy()
                idx_start=np.argmax(outputs_start[px, :])
                idx_end=np.argmax(outputs_end[px, :])
                yt = []
                yp = []

                for kk in range(idx_start,idx_end+1):
    #                 print(tempoff[kk])
                    yp.append(tweet[tempoff[kk][0]:tempoff[kk][1]])
                for kk in range(targets_start[px],targets_end[px]):
    #                 print(tempoff[kk])
                    yt.append(tweet[tempoff[kk][0]: tempoff[kk][1]])
                y_pred.append(' '.join(yp))
                y_true.append(' '.join(yt))

            jaccards.update(compute_fbeta(y_true,y_pred), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg,jaccards=jaccards.avg)
    
    print(f"Jaccard = {jaccards.avg}")
    return jaccards.avg

In [18]:
# train_data['fold'] = 0
# gf = GroupKFold(5)
# for i,(train_ind,val_ind) in enumerate(gf.split(train_data.index,train_data.data_count,train_data.Id)):
#     train_data.loc[val_ind,'fold'] = i
# train_data.fold.value_counts()

In [19]:
# soft = []
for fold in range(1):
#     dfx = pd.read_csv(config.TRAINING_FILE)

    train_dataset = ColeridgeDataset(
        tweet = train_data.sentence.values,
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = ColeridgeModel(conf=model_config)
    model.load_state_dict(torch.load('../input/bert-uncase/model_4_0_0.6519251559696981.bin')['model'])
    model.to(device)
    
    print(f"Testing is Starting for fold={fold}")
    
    model.eval()
    
    with torch.no_grad():
        y_pred = []
        tk0 = tqdm(train_data_loader, total=len(train_data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            mask = d["mask"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
    
            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
#                 token_type_ids=token_type_ids
            )
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
#             soft.append([outputs_start,outputs_end])
#             jaccard_scores = []
#             for px, tweet in enumerate(orig_tweet):
#                 selected_tweet = orig_selected[px]
#                 tweet_sentiment = sentiment[px]
#                 jaccard_score, _ = calculate_jaccard_score(
#                     original_tweet=tweet,
#                     target_string=selected_tweet,
#                     sentiment_val=tweet_sentiment,
#                     idx_start=np.argmax(outputs_start[px, :]),
#                     idx_end=np.argmax(outputs_end[px, :]),
#                     offsets=offsets[px]
#                 )
#                 jaccard_scores.append(jaccard_score)

            for px, tweet in enumerate(orig_tweet):
                if outputs_start[px, :].max()<0.9 or outputs_end[px, :].max()<0.9:
                    y_pred.append("")
                    continue
                tempoff = offsets[px].numpy()
                idx_start=np.argmax(outputs_start[px, :])
                idx_end=np.argmax(outputs_end[px, :])
                if idx_start>idx_end or (idx_end - idx_start)>12:
                    y_pred.append("")
                    continue
                yp = []
                for kk in range(idx_start,idx_end+1):
                    yp.append(tweet[tempoff[kk][0]:tempoff[kk][1]])
                y_pred.append(' '.join(yp))

Testing is Starting for fold=0


  0%|          | 0/7 [00:00<?, ?it/s]

In [20]:
# 去掉空格 找到原文对应的
finalpred = []
for k in range(len(train_data)):
    temp = y_pred[k].replace(" ","")
    if temp=="":
        finalpred.append("")
        continue
    temptext = train_data.sentence.values[k]
    temptext = temptext.lower()
    text = temptext.replace(" ","")
    start = 0
    end = 0
    for ind in (i for i,e in enumerate(text) if e==temp[0]):
        if text[ind:ind+len(temp)] == temp:
            start = ind
            end = ind + len(temp)-1
            break
    # 还原
    finaltext = ""
    count = 0
    for a in temptext:
        if count>=start:
            finaltext += a
        if count>end:
            break
        if a!=" ":
            count+=1
    finalpred.append(finaltext.strip())

In [21]:
finalpred = np.array(finalpred)

In [22]:
finalpred

array(['', '', '', '', '', '', '', '', '', '', '',
       'alzheimer s disease neuroimaging initiative adni', '', '', '', '',
       '', '', '', '', '', '', '', '', '',
       'trends in international mathematics and science study', '', '',
       '', '', '', '', '', '', '',
       'trends in international mathematics and science study', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '',
       'trends in international mathematics and science study', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', 'trends in international mathematics and science study', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', 'trends in t

In [23]:
## Literal Matching
## 训练集里面一共180个数据集标签
papers = {}
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

all_labels = set()

for label_1, label_2, label_3 in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    all_labels.add(str(label_1).lower())
    all_labels.add(str(label_2).lower())
    all_labels.add(str(label_3).lower())
    
print(f'No. different labels: {len(all_labels)}')

def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
def totally_clean_text(txt):
    txt = clean_text(txt)
    txt = re.sub(' +', ' ', txt)
    return txt

# 字符串匹配

# literal_preds = []

for paper_id in tqdm(sample_submission['Id']):
#     paper = papers[paper_id]
#     text_1 = '. '.join(section['text'] for section in paper).lower()
#     text_2 = totally_clean_text(text_1)
    
#     labels = set()
#     for label in all_labels:
#         if label in text_1 or label in text_2:
#             labels.add(clean_text(label))
#     if len(labels)!=0:
#         final = '|'.join(labels)
#     else:
    indexes = train_data[train_data.Id==paper_id].index.values
    tempfi = finalpred[indexes][finalpred[indexes]!=""]
#     tempfi = list(tempfi) + list(labels)
    fnti = set()
    for s in tempfi:
        fnti.add(totally_clean_text(s))
    final = '|'.join(fnti)
    sample_submission.loc[sample_submission.Id==paper_id,'PredictionString'] = final

No. different labels: 180


  0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
sample_submission.to_csv('submission.csv',index=False)

In [25]:
sample_submission.PredictionString.values

array(['alzheimer s disease neuroimaging initiative adni',
       'nces common core of data|progress in international reading literacy study|international standard classification of education|trends in international mathematics and science study|trends in teacher preparation',
       'coastal erosion study|north carolina flood mapping program|slosh model|slosh basin|slosh storm|coastal resources commission|coastal change hazards portal|north carolina floodplain mapping program|slosh display|slosh grids|cape hatteras national seashore',
       ''], dtype=object)