In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tqdm import trange

In [2]:
# Load the discourse annotation table (one row per annotated span) from the working directory.
train = pd.read_csv('train.csv')

In [3]:
# Report the table dimensions, then show the first rows as the cell's displayed value.
print('Train shape: ', train.shape)
train.head()

Train shape:  (144293, 8)


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [4]:
# List the distinct values of the discourse_type column (the label set used for the targets).
print('The train labels are:')
train.discourse_type.unique()

The train labels are:


array(['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
       'Counterclaim', 'Rebuttal'], dtype=object)

In [5]:
# Distinct text ids found in the annotation table; every later per-text array is indexed
# by the position of the id in this array.
IDS = train.id.unique()
print('There are', len(IDS), 'train texts.')

There are 15594 train texts.


In [7]:
# Run configuration. Only MAX_LEN, LOAD_TOKENS_FROM and DOWNLOADED_MODEL_PATH are read by
# the cells visible in this part of the notebook.
VER = 12  # presumably an experiment/version tag — not referenced in the visible cells
EPOCHS = 5  # presumably the number of training epochs — not referenced in the visible cells
N_SPLITS = 5  # presumably the number of cross-validation folds — not referenced in the visible cells
MAX_LEN = 1024  # every text is padded/truncated to this many tokens by the tokenizer calls
LRS = [1e-4, 1e-4, 1e-4, 1e-4, 1e-5]  # presumably one learning rate per epoch — TODO confirm
LOAD_TOKENS_FROM = None  # when falsy, tokens and targets are built from the raw text files
DOWNLOADED_MODEL_PATH = 'roberta-base'  # model name/path handed to AutoTokenizer.from_pretrained

In [8]:
# Tokenise every training text and build the per-token NER targets.
tokenizer = AutoTokenizer.from_pretrained(DOWNLOADED_MODEL_PATH)
train_tokens = np.zeros((len(IDS),MAX_LEN), dtype='int32')
train_attention = np.zeros((len(IDS),MAX_LEN), dtype='int32')

# THE 14 CLASSES FOR NER
# One (n_texts, MAX_LEN) indicator array per discourse type and role:
# *_b flags the first token of a discourse span, *_i flags every following token of it.
lead_b = np.zeros((len(IDS),MAX_LEN))
lead_i = np.zeros((len(IDS),MAX_LEN))

position_b = np.zeros((len(IDS),MAX_LEN))
position_i = np.zeros((len(IDS),MAX_LEN))

evidence_b = np.zeros((len(IDS),MAX_LEN))
evidence_i = np.zeros((len(IDS),MAX_LEN))

claim_b = np.zeros((len(IDS),MAX_LEN))
claim_i = np.zeros((len(IDS),MAX_LEN))

conclusion_b = np.zeros((len(IDS),MAX_LEN))
conclusion_i = np.zeros((len(IDS),MAX_LEN))

counterclaim_b = np.zeros((len(IDS),MAX_LEN))
counterclaim_i = np.zeros((len(IDS),MAX_LEN))

rebuttal_b = np.zeros((len(IDS),MAX_LEN))
rebuttal_i = np.zeros((len(IDS),MAX_LEN))

# Whitespace-separated word count of every text, in IDS order.
train_lens = []
# Position k in both lists belongs to the discourse type with target_map value k.
targets_b = [lead_b, position_b, evidence_b, claim_b, conclusion_b, counterclaim_b, rebuttal_b]
targets_i = [lead_i, position_i, evidence_i, claim_i, conclusion_i, counterclaim_i, rebuttal_i]
target_map = {'Lead':0, 'Position':1, 'Evidence':2, 'Claim':3, 'Concluding Statement':4, 'Counterclaim':5, 'Rebuttal':6}

# Build everything from the raw texts unless pre-computed arrays are requested.
# One truthiness check guards both the loop and the target assembly, so the two can no
# longer disagree for falsy non-None values of LOAD_TOKENS_FROM.
if not LOAD_TOKENS_FROM:
    # Group the annotations once instead of re-filtering the whole table for every text.
    rows_by_id = train.groupby('id', sort=False)
    for id_num in range(len(IDS)):
        if id_num % 100 == 0: print(id_num,', ',end='')
        n = IDS[id_num]
        name = f'./train/{n}.txt'
        # Context manager so the file handle is closed immediately.
        with open(name, 'r') as f:
            txt = f.read()
        train_lens.append( len(txt.split()))
        tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length', truncation=True, return_offsets_mapping=True)
        train_tokens[id_num,] = tokens['input_ids']
        train_attention[id_num,] = tokens['attention_mask']
        # (char_start, char_end) of every token within txt.
        offsets = tokens['offset_mapping']
        # Token cursor shared by all discourses of this text: it only moves forward, so the
        # rows are assumed to be ordered by discourse_start — TODO confirm for other inputs.
        offset_index = 0
        df = rows_by_id.get_group(n)
        for index,row in df.iterrows():
            a = row.discourse_start
            b = row.discourse_end
            if offset_index>len(offsets)-1:
                break
            c = offsets[offset_index][0]
            d = offsets[offset_index][1]
            beginning = True
            # Walk tokens until one starts at or after the discourse end.
            while b>c:
                # Label only tokens lying completely inside [a, b]. Tokens whose end offset
                # is 0 (special/padding tokens, reported as (0, 0)) are skipped: without the
                # check they satisfy both comparisons whenever discourse_start == 0, which
                # put the B- label on the leading special token and I- labels on padding.
                if (d!=0) and (c>=a) and (b>=d):
                    k = target_map[row.discourse_type]
                    if beginning:
                        targets_b[k][id_num][offset_index] = 1
                        beginning = False
                    else:
                        targets_i[k][id_num][offset_index] = 1
                offset_index += 1
                if offset_index>len(offsets)-1:
                    break
                c = offsets[offset_index][0]
                d = offsets[offset_index][1]

    # Interleave the indicators into one (n_texts, MAX_LEN, 15) array:
    # channel 2*k is "begin" of type k, 2*k+1 is "inside" of type k, 14 is "no label".
    targets = np.zeros((len(IDS), MAX_LEN, 15), dtype = 'int32')
    for k in range(7):
        targets[:, :, 2 * k] = targets_b[k]
        targets[:, :, 2 * k + 1] = targets_i[k]
    # Channel 14 is still all zero here, so the max covers only the 14 label channels.
    targets[:, :, 14] = 1 - np.max(targets, axis = -1)
# NOTE: the save/load branch below is disabled, so `targets` is only defined when
# LOAD_TOKENS_FROM is unset.
# if LOAD_TOKENS_FROM is None:
#     np.save(f'targets_{MAX_LEN}', targets)
#     np.save(f'tokens_{MAX_LEN}', train_tokens)
#     np.save(f'attention_{MAX_LEN}', train_attention)
#     print('Saved NER tokens')
# else:
#     targets = np.load(f'{LOAD_TOKENS_FROM}/targets_{MAX_LEN}.npy')
#     train_tokens = np.load(f'{LOAD_TOKENS_FROM}/tokens_{MAX_LEN}.npy')
#     train_attention = np.load(f'{LOAD_TOKENS_FROM}/attention_{MAX_LEN}.npy')
#     print('Loaded NER tokens')

0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 2300 , 2400 , 2500 , 2600 , 2700 , 2800 , 2900 , 3000 , 3100 , 3200 , 3300 , 3400 , 3500 , 3600 , 3700 , 3800 , 3900 , 4000 , 4100 , 4200 , 4300 , 4400 , 4500 , 4600 , 4700 , 4800 , 4900 , 5000 , 5100 , 5200 , 5300 , 5400 , 5500 , 5600 , 5700 , 5800 , 5900 , 6000 , 6100 , 6200 , 6300 , 6400 , 6500 , 6600 , 6700 , 6800 , 6900 , 7000 , 7100 , 7200 , 7300 , 7400 , 7500 , 7600 , 7700 , 7800 , 7900 , 8000 , 8100 , 8200 , 8300 , 8400 , 8500 , 8600 , 8700 , 8800 , 8900 , 9000 , 9100 , 9200 , 9300 , 9400 , 9500 , 9600 , 9700 , 9800 , 9900 , 10000 , 10100 , 10200 , 10300 , 10400 , 10500 , 10600 , 10700 , 10800 , 10900 , 11000 , 11100 , 11200 , 11300 , 11400 , 11500 , 11600 , 11700 , 11800 , 11900 , 12000 , 12100 , 12200 , 12300 , 12400 , 12500 , 12600 , 12700 , 12800 , 12900 , 13000 , 13100 , 13200 , 13300 , 13400 , 13500 , 13600 , 13700 , 13800 , 

In [15]:
# Inspect the 15-channel target vector of the first token of the first text.
targets[0][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

In [None]:
# Class index -> discourse type name; index 7 is the "no label" class.
target_map_rev = dict(enumerate([
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
    'blank',
]))

def _word_start_offsets(txt):
    """Return the character index at which every whitespace-separated word of txt starts."""
    starts = []
    blank = True
    for i, ch in enumerate(txt):
        # Any whitespace character separates words (not only ' ' and '\n'), so the word
        # numbering agrees with txt.split() for tabs, non-breaking spaces, etc.
        if ch.isspace():
            blank = True
        elif blank:
            starts.append(i)
            blank = False
    return starts


def get_preds(dataset = 'train', verbose = True, text_ids = None, preds = None, data_dir = '../input/feedback-prize-2021'):
    """Turn per-token class predictions into word-index prediction strings.

    Relies on the module-level ``tokenizer``, ``MAX_LEN`` and ``target_map_rev``.

    Parameters
    ----------
    dataset : str
        Sub-directory of ``data_dir`` that holds the ``<id>.txt`` files.
    verbose : bool
        When true, print a progress counter every 100 texts.
    text_ids : sequence
        Text ids, aligned with the rows of ``preds``. Required despite the ``None`` default.
    preds : numpy array
        One row of MAX_LEN class indices per text. An even index ``2*k`` opens a span of
        class ``k`` and the odd index ``2*k + 1`` continues it. Required despite the
        ``None`` default.
    data_dir : str
        Root directory of the text files; the default keeps the previously hard-coded path.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``class`` and ``predictionstring`` (space-separated word indices),
        one row per predicted span of more than 4 words. Empty, with the same columns,
        when nothing is predicted.
    """
    all_predictions = []
    for id_num in range(len(preds)):
        # `and` instead of `&`: the bitwise form is wrong for truthy non-bool values of verbose.
        if verbose and (id_num % 100 == 0): print(id_num, ', ', end = '')
        n = text_ids[id_num]
        name = f'{data_dir}/{dataset}/{n}.txt'
        # Context manager so the file handle is closed immediately.
        with open(name, 'r') as f:
            txt = f.read()
        tokens = tokenizer.encode_plus(txt, max_length = MAX_LEN, padding = 'max_length', truncation = True, return_offsets_mapping = True)
        off = tokens['offset_mapping']

        # Character offsets of word starts, plus a sentinel larger than any real offset so
        # the look-ahead below never runs past the last word.
        w = _word_start_offsets(txt)
        w.append(1e6)

        # Map every token to the index of the word it belongs to; -1 = no word.
        word_map = -1 * np.ones(MAX_LEN, dtype = 'int32')
        w_i = 0
        if len(w) > 1:  # a text without any word has nothing to map (and no w[1] to look at)
            for i in range(len(off)):
                # Tokens with end offset 0 are special/padding tokens.
                if off[i][1] == 0: continue
                while off[i][0] >= w[w_i + 1]: w_i += 1
                word_map[i] = int(w_i)

        # Halving turns "begin" classes into integers k and "inside" classes into k + 0.5.
        pred = preds[id_num,] / 2.0
        i = 0
        while i < MAX_LEN:
            prediction = []
            start = pred[i]
            if start in [0, 1, 2, 3, 4, 5, 6, 7]:
                prediction.append(word_map[i])
                i += 1
                # Extend the span while the following tokens are "inside" the same class.
                while i < MAX_LEN and pred[i] == start + 0.5:
                    if word_map[i] not in prediction: prediction.append(word_map[i])
                    i += 1
            else: i += 1
            prediction = [x for x in prediction if x != -1]
            # Spans of 4 words or fewer are discarded.
            if len(prediction) > 4: all_predictions.append((n, target_map_rev[int(start)], ' '.join([str(x) for x in prediction])))

    # MAKE DATAFRAME
    # Passing the column names to the constructor also works when nothing was predicted;
    # assigning df.columns afterwards raised on the empty, zero-column frame.
    df = pd.DataFrame(all_predictions, columns = ['id', 'class', 'predictionstring'])
    return df

def calc_overlap(row):
    """Return the two-way word overlap between a predicted and a ground-truth span.

    ``row`` must expose ``predictionstring_pred`` and ``predictionstring_gt`` as
    space-separated strings. The result is the list
    ``[shared / n_ground_truth_words, shared / n_predicted_words]``.
    """
    pred_words = set(row.predictionstring_pred.split(' '))
    gt_words = set(row.predictionstring_gt.split(' '))
    shared = len(gt_words & pred_words)
    return [shared / len(gt_words), shared / len(pred_words)]

def score_feedback_comp(pred_df, gt_df):
    """Micro F1 between predicted and ground-truth discourse spans.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Needs the columns ``id``, ``class`` and ``predictionstring``.
    gt_df : pandas.DataFrame
        Needs the columns ``id``, ``discourse_type`` and ``predictionstring``.

    Returns
    -------
    float
        ``TP / (TP + 0.5 * (FP + FN))``. A prediction can match a ground-truth span of the
        same text and class when each covers at least half of the other's words; at most
        one prediction is accepted per ground-truth span. Returns 0.0 when there is
        nothing to score.
    """
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].reset_index(drop = True).copy()
    pred_df = pred_df[['id', 'class', 'predictionstring']].reset_index(drop = True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Outer join: rows without a same-text/same-class partner keep NaN on the missing side.
    joined = pred_df.merge(gt_df, left_on = ['id', 'class'], right_on = ['id', 'discourse_type'], how = 'outer', suffixes = ('_pred', '_gt'))
    if joined.empty:
        return 0.0
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    # calc_overlap returns [overlap_vs_gt, overlap_vs_pred]; index it directly rather
    # than round-tripping each pair through eval(str(...)).
    overlaps = [calc_overlap(row) for row in joined[['predictionstring_pred', 'predictionstring_gt']].itertuples(index = False)]
    joined['overlap1'] = [pair[0] for pair in overlaps]
    joined['overlap2'] = [pair[1] for pair in overlaps]
    # A match needs a real prediction AND a real ground truth, not a NaN placeholder.
    joined['potential_TP'] = (
        (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
        & joined['pred_id'].notna() & joined['gt_id'].notna()
    )
    joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)

    # Keep the best-overlapping candidate for every ground-truth span.
    candidates = joined.query('potential_TP')
    tp_pred_ids = candidates.sort_values('max_overlap', ascending=False).groupby(['id','predictionstring_gt']).first()['pred_id'].values

    # NaN ids are merge placeholders, not predictions or ground truths. Because
    # `nan in collection` is never True they used to be counted as one extra false
    # positive / false negative, so they are dropped before counting.
    tp_lookup = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique() if p not in tp_lookup]
    matched_gt_ids = set(candidates['gt_id'].unique())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique() if c not in matched_gt_ids]

    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    denominator = TP + 0.5*(FP+FN)
    if denominator == 0:
        return 0.0
    my_f1_score = TP / denominator
    return my_f1_score