In [22]:
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel

# Let pandas show up to 300 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 300)

## e062 の valid を確認する

In [2]:
# Load the original training CSV and show its shape plus the first rows.
trn_df = pd.read_csv('../inputs/origin/train.csv')
display(trn_df.shape, trn_df.head())

(27481, 4)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
# Keep only rows that have a selected_text (the recorded output shows the
# frame going from 27481 to 27480 rows).
trn_df = trn_df[trn_df.selected_text.notnull()]
trn_df.shape

(27480, 4)

In [4]:
# Load every "best" checkpoint of experiment e062.
# - sorted(): glob() returns paths in arbitrary order; sorting makes
#   ckpts[0] and the loop order identical on every run.
# - map_location='cpu': allows checkpoints that were saved from a GPU to be
#   loaded on a machine without CUDA; this notebook only inspects the
#   stored values.
# NOTE(review): torch.load unpickles the file, so only load checkpoints
# that come from a trusted source.
ckpts = []
for fckpt in sorted(glob('../checkpoints/e062/best/*')):
    ckpts.append(torch.load(fckpt, map_location='cpu'))

In [5]:
# List the entries stored in the first loaded checkpoint.
ckpts[0].keys()

dict_keys(['fold_num', 'current_epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'val_textIDs', 'val_input_ids', 'val_preds', 'val_labels', 'histories'])

In [6]:
def _get_predicted_texts(input_ids, y_preds_head, y_preds_tail, tokenizer):
    predicted_texts = []
    for input_id, y_pred_head, y_pred_tail \
            in zip(input_ids, y_preds_head, y_preds_tail):
        pred_label_head = y_pred_head.argmax()
        pred_label_tail = y_pred_tail.argmax()
        predicted_text = tokenizer.decode(
            input_id[pred_label_head:pred_label_tail])
        predicted_texts.append(predicted_text)

    return predicted_texts

In [7]:
def _get_selected_texts(input_ids, labels_head, labels_tail, tokenizer):
    predicted_texts = []
    for input_id, pred_label_head, pred_label_tail \
            in zip(input_ids, labels_head, labels_tail):
        predicted_text = tokenizer.decode(
            input_id[pred_label_head:pred_label_tail])
        predicted_texts.append(predicted_text)

    return predicted_texts

In [8]:
# Put the parent directory on sys.path so the project-local `tools` package
# can be imported from this notebook.
import sys
sys.path.append('../')
from tools.tokenizers import myRobertaByteLevelBPETokenizer

# Build the project tokenizer from the vocab/merges files, with lowercasing
# and a prefix space enabled.
tokenizer = myRobertaByteLevelBPETokenizer(   
    vocab_file='../inputs/datasets/roberta/tokenizer/vocab.json',
    merges_file='../inputs/datasets/roberta/tokenizer/merges.txt',
    lowercase=True,
    add_prefix_space=True)

# Disabled alternative tokenizer, kept for reference:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# Attach each checkpoint's decoded validation prediction to the matching
# training rows.
# The frame is indexed by textID once, on a temporary name, and trn_df is
# only rebound after every checkpoint was processed. Compared with calling
# set_index/reset_index inside the loop this avoids rebuilding the index per
# checkpoint, and an exception part-way through no longer leaves trn_df
# without its textID column (which made the cell fail on re-run).
preds_by_id = trn_df.set_index('textID')
for ckpt in ckpts:
    predicted_texts = _get_predicted_texts(
        ckpt['val_input_ids'],
        ckpt['val_preds'][0],  # passed as y_preds_head
        ckpt['val_preds'][1],  # passed as y_preds_tail
        tokenizer)
    preds_by_id.loc[ckpt['val_textIDs'], 'predicted_texts'] = predicted_texts
trn_df = preds_by_id.reset_index()
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"****,"
...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost his job and can`t afford it
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,good
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth


In [10]:
# Attach each checkpoint's decoded validation label span to the matching
# training rows.
# The frame is indexed by textID once, on a temporary name, and trn_df is
# only rebound after every checkpoint was processed. Compared with calling
# set_index/reset_index inside the loop this avoids rebuilding the index per
# checkpoint, and an exception part-way through no longer leaves trn_df
# without its textID column (which made the cell fail on re-run).
labels_by_id = trn_df.set_index('textID')
for ckpt in ckpts:
    manual_selected_texts = _get_selected_texts(
        ckpt['val_input_ids'],
        ckpt['val_labels'][0],  # passed as labels_head
        ckpt['val_labels'][1],  # passed as labels_tail
        tokenizer)
    labels_by_id.loc[ckpt['val_textIDs'], 'manual_selected_text'] = manual_selected_texts
trn_df = labels_by_id.reset_index()
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going","i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me...,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"****,","sons of ****,"
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost his job and can`t afford it,husband lost
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered,", don`t force"
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,good,yay good for both of you.
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,but it was worth it ****.


In [11]:
# Inspect a single sample by its textID.
trn_df.query('textID == "6cd1ba8680"')

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text
19561,6cd1ba8680,Thats charmingly funny,ats charmingly fun,positive,thats charmingly funny,thats charmingly funny


In [12]:
# Look for rows whose decoded label span contains the string '</s>'
# (the recorded output has no such rows).
trn_df[trn_df.manual_selected_text.str.contains('</s>')]

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text


In [13]:
# Normalise selected_text for every row: lowercase, collapse whitespace runs
# into single spaces and prepend one space. The result is compared with
# manual_selected_text in the following cells.
trn_df['selected_text_lower'] = trn_df['selected_text'].apply(lambda x: " " + " ".join(x.lower().split()))
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,selected_text_lower
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going","i`d have responded, if i were going","i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad,sooo sad,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me...,bullying me,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,leave me alone,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"****,","sons of ****,","sons of ****,"
...,...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost his job and can`t afford it,husband lost,d lost
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered,", don`t force",", don`t force"
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,good,yay good for both of you.,yay good for both of you.
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,but it was worth it ****.,but it was worth it ****.


In [14]:
# Fraction of rows whose normalised selected_text equals the decoded label span.
(trn_df.selected_text_lower == trn_df.manual_selected_text).mean()

0.9046943231441048

In [17]:
# NOTE(review): same expression as the previous cell, but the recorded value
# differs (0.9032 here vs 0.9047 there) — presumably the two cells were run
# against different states of trn_df; confirm with a fresh Run All.
(trn_df.selected_text_lower == trn_df.manual_selected_text).mean()

0.9031659388646288

In [15]:
# Show up to 100 non-neutral rows where the decoded label span differs from
# the normalised selected_text.
trn_df[(trn_df.selected_text_lower != trn_df.manual_selected_text)].query('sentiment != "neutral"').head(100)

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,selected_text_lower
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,miss,gonna,onna
27,bdc32ea43c,On the way to Malaysia...no internet access to...,.no internet,negative,no internet access to twit,...no internet,.no internet
32,1c31703aef,If it is any consolation I got my BMI tested ...,well so much for being unhappy for about 10 mi...,negative,unhappy,well so much for being unhappy for about 10 m...,well so much for being unhappy for about 10 m...
39,2863f435bd,A little happy for the wine jeje ok it`sm my f...,A little happy fo,positive,love this day,a little happy for,a little happy fo
48,3d9d4b0b55,"i donbt like to peel prawns, i also dont like ...",dont like go,negative,dont like,dont like going,dont like go
...,...,...,...,...,...,...,...
1061,7048b3a9c0,"Nako! Umuulan pa naman! Anyway, enjoy the bi...",", enjoy",positive,enjoy,", enjoy",", enjoy"
1075,7a3840a246,Don`t follow your dreams; chase them.- Richard...,who I think is very smart,positive,smart,who i think is very smart,who i think is very smart
1076,3d5c1ed21b,Up is out? I didn`t get the memo It looks ...,o It looks amazi,positive,amazing.,memo it looks amazing,o it looks amazi
1082,e9c337f756,@_TWEE haha thanks to you that`s my new word f...,thanks to you th,positive,thanks,thanks to you that,thanks to you th


In [16]:
# Compare the shape of the whole frame with the shape of the mismatching subset.
trn_df.shape, trn_df[(trn_df.selected_text_lower != trn_df.manual_selected_text)].shape

((27480, 7), (2619, 7))

In [17]:
# Per row, count the distinct whitespace-separated words shared by the
# decoded label span and the normalised selected_text, then tabulate how
# often each count occurs.
trn_df['manual_and_selected_intersection_len'] = trn_df.apply(
    lambda row: len(
        set(row['manual_selected_text'].split()).intersection(
            row['selected_text_lower'].split())),
    axis=1)
trn_df['manual_and_selected_intersection_len'].value_counts().sort_index()

0      334
1     7211
2     2313
3     2163
4     1983
5     1603
6     1380
7     1176
8     1012
9      879
10     812
11     732
12     669
13     578
14     558
15     504
16     470
17     454
18     435
19     395
20     414
21     335
22     317
23     289
24     211
25     114
26      77
27      40
28      14
29       8
Name: manual_and_selected_intersection_len, dtype: int64

In [18]:
# Per row, encode both whitespace-normalised strings without special tokens
# and count the distinct token ids they have in common, then tabulate how
# often each count occurs.
trn_df['manual_and_selected_tokenized_intersection_len'] = trn_df.apply(
    lambda row: len(
        set(
            tokenizer.encode(
                ' '.join(row['manual_selected_text'].split()),
                add_special_tokens=False).ids
        ).intersection(
            tokenizer.encode(
                ' '.join(row['selected_text_lower'].split()),
                add_special_tokens=False).ids)),
    axis=1)
trn_df['manual_and_selected_tokenized_intersection_len'].value_counts().sort_index()

0       66
1     5172
2     2825
3     1859
4     1746
5     1628
6     1335
7     1152
8     1027
9      961
10     800
11     701
12     684
13     649
14     584
15     502
16     514
17     477
18     445
19     452
20     380
21     361
22     375
23     354
24     368
25     317
26     307
27     278
28     243
29     238
30     165
31     128
32     134
33      88
34      56
35      29
36      25
37      16
38      15
39       8
40       8
41       5
42       3
Name: manual_and_selected_tokenized_intersection_len, dtype: int64

In [23]:
# Show up to 100 rows whose two strings share no token id at all.
trn_df.query('manual_and_selected_tokenized_intersection_len == 0').head(100)

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,selected_text_lower,manual_and_selected_intersection_len,manual_and_selected_tokenized_intersection_len
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,miss,gonna,onna,0,0
309,a54d3c2825,"I know It was worth a shot, though!",as wort,positive,worth,was worth,as wort,0,0
491,ee9df322d1,"Sorry, we`ll try to keep it down.",Sorr,negative,"sorry,",sorry,sorr,0,0
808,31fa81e0ae,"Yes, you should go see Star Trek! It`s sooo...",ch fu,positive,it`s sooooo much fun!,much fun,ch fu,0,0
1727,795de6c2f5,ooh ouch! lovely weather but ****! cover up...,lovel,positive,lovely,lovely,lovel,0,0
1797,c4ffb519a5,I don`t feel any pressure right now... Happ...,Happ,positive,happy,happy,happ,0,0
2597,95593ecedb,"Cool Sound!! Luv 'Drama', 'Love Game' doesn`t...",ish Go,positive,cool,wish good,ish go,0,0
2786,acfbac049f,Sure! My entire blogroll is terribly updat...,uld motiva,positive,motivate,could motivate,uld motiva,0,0
3324,ee5eb5337b,Heading to the gym. The group of guys that US...,ly sa,negative,makes me terribly sad,terribly sad,ly sa,0,0
3503,d5b33ff5f4,it was just true and you do cause me to hav...,ng dirt,negative,dirty thoughts,having dirty,ng dirt,0,0


In [25]:
# Write the annotated frame to CSV without the index column.
trn_df.to_csv('../inputs/nes_info/v3_dataset_trn_df.csv', index=False)

## debug

In [27]:
# Take the first row matching this textID as the sample to debug.
row = trn_df.query('textID == "6cd1ba8680"').iloc[0]

In [36]:
# Step-by-step reproduction of the head/tail label computation for one row.

# Encode the whitespace-normalised text (with a leading space) paired with
# the sentiment; special tokens, max_length=125 and pad_to_max_length=True
# are requested.
text = " " + " ".join(row['text'].split())
text_output = tokenizer.encode_plus(
    text=text,
    # text_pair=None,
    # text_pair=f"[{row['sentiment']}]",
    text_pair=row['sentiment'],
    add_special_tokens=True,
    max_length=125,
    pad_to_max_length=True,
    # NOTE(review): Hugging Face tokenizers spell this keyword
    # `return_tensors`; confirm which name the project tokenizer reads.
    return_tensor='pt',
    return_token_type_ids=False,
    return_attention_mask=True,
    return_special_tokens_mask=True,
)
row['input_ids'] = text_output['input_ids']
row['attention_mask'] = text_output['attention_mask']
row['special_tokens_mask'] = text_output['special_tokens_mask']

# Rows without a selected_text get an empty string and -1 placeholder labels.
# NOTE(review): the `return row` below is commented out, so execution
# continues and the placeholders are overwritten at the end of the cell —
# presumably this code was lifted from a function body; confirm.
if 'selected_text' not in row:
    row['selected_text'] = ''
    row['labels_head'] = -1
    row['labels_tail'] = -1
#    return row

# Encode the whitespace-normalised selected_text on its own, without special
# tokens and without padding.
text = " " + " ".join(row['selected_text'].split())
selected_text_output = tokenizer.encode_plus(
    text=text,
    text_pair=None,
    add_special_tokens=False,
    max_length=125,
    pad_to_max_length=False,
    return_tensor='pt',
    return_token_type_ids=False,
    return_attention_mask=False,
)

# align labels for segmentation
input_ids = text_output['input_ids']
sel_input_ids = selected_text_output['input_ids']
# Start at index 1 in order to skip the first token.
# matched_cnt = number of ids in the initial window
# input_ids[1:1 + len(sel_input_ids)] that also occur in sel_input_ids.
matched_cnt = len([i for i in input_ids[1:1 + len(sel_input_ids)]
                   if i in sel_input_ids])
best_matched_cnt = matched_cnt
best_matched_i = 1
# for i in range(0, len(input_ids)):
# Start at index 1 in order to skip the first token.
# Slide the window one position to the right per iteration: input_ids[i]
# leaves the window and input_ids[i + len(sel_input_ids)] enters it, and
# matched_cnt is updated incrementally.
for i in range(1, len(input_ids) - len(sel_input_ids)):
    head_input_id_i = input_ids[i]
    tail_input_id_i = input_ids[i + len(sel_input_ids)]
    if head_input_id_i in sel_input_ids:
        matched_cnt -= 1
    if tail_input_id_i in sel_input_ids:
        matched_cnt += 1
    if matched_cnt < 0:
        raise Exception('invalid logic')

    # Only a strictly better count replaces the current best, so the
    # earliest window wins ties.
    if best_matched_cnt < matched_cnt:
        best_matched_cnt = matched_cnt
        best_matched_i = i + 1   # token i was just removed, so the window now starts at i + 1
    # Every selected-text token is matched: stop searching.
    if best_matched_cnt == len(sel_input_ids):
        break

# labels_tail is head + number of selected-text tokens, i.e. an exclusive end.
row['labels_head'] = best_matched_i
row['labels_tail'] = best_matched_i + len(sel_input_ids)

In [44]:
# Raw text of the debug row.
row['text']

'  Thats charmingly funny'

In [45]:
# Raw selected_text of the debug row.
row['selected_text']

'ats charmingly fun'

In [39]:
# Token ids produced for the selected_text.
selected_text_output['input_ids']

[23, 29, 18452, 352, 1531]

In [40]:
# The normalised selected_text string that is fed to the tokenizer.
" " + " ".join(row['selected_text'].split())

' ats charmingly fun'

In [42]:
# Encode the normalised selected_text directly; the recorded ids equal
# selected_text_output['input_ids'].
tokenizer.encode(" " + " ".join(row['selected_text'].split())).ids

[23, 29, 18452, 352, 1531]

In [43]:
# Encode the normalised full text. In the recorded output only ids 18452 and
# 352 also appear in the selected_text ids: the selection 'ats charmingly fun'
# starts and ends in the middle of a word of 'Thats charmingly funny'.
tokenizer.encode(" " + " ".join(row['text'].split())).ids

[45365, 18452, 352, 6269]

In [None]:
# NOTE(review): never-executed scratch cell (no execution count). decode is
# called here without the ids argument it receives everywhere else in this
# notebook, so it presumably raises — complete or remove this cell.
tokenizer.decode()

## tokenizer 確認

In [17]:
# Put the parent directory on sys.path so the project-local `tools` package
# can be imported.
# NOTE(review): this repeats the import and construction already done for
# `tokenizer` earlier in the notebook, with identical arguments.
import sys
sys.path.append('../')
from tools.tokenizers import myRobertaByteLevelBPETokenizer

# Build the project tokenizer from the vocab/merges files, with lowercasing
# and a prefix space enabled.
my_tokenizer = myRobertaByteLevelBPETokenizer(   
    vocab_file='../inputs/datasets/roberta/tokenizer/vocab.json',
    merges_file='../inputs/datasets/roberta/tokenizer/merges.txt',
    lowercase=True,
    add_prefix_space=True)

In [21]:
# Decode positions 1..10 of the first validation sample.
# NOTE(review): `ckpt` is the variable left over from the earlier
# `for ckpt in ckpts` loops, i.e. whichever checkpoint was iterated last.
my_tokenizer.decode(ckpt['val_input_ids'][0][1:11])

' i`d have responded, if i were going'

In [52]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lowercased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        float in [0.0, 1.0]. When neither string contains a word, the two
        word sets are both empty and therefore identical, so 1.0 is
        returned; the bare formula would divide by zero in that case.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # Guard the 0 / 0 case (e.g. an empty decoded prediction compared with
    # an empty or whitespace-only reference).
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [53]:
# Sum the Jaccard score between the original selected_text and the decoded
# prediction over all rows.
# NOTE(review): this loop rebinds the notebook-level names `i` and `row`.
temp_jac = 0
for i, row in trn_df.iterrows():
    temp_jac += jaccard(row['selected_text'], row['predicted_texts'])

In [54]:
# Mean Jaccard score over all rows.
temp_jac / len(trn_df)

0.44581498228808364

## 改善の必要あり
 - decode すると違う形式になってしまう
 - selected_text を encode -> decode して形式が変わるパターンを除去する後処理が必要そう

In [75]:
# Inspect the row with index label 0.
# NOTE(review): the recorded output contains [SEP]/[PAD] tokens and lacks the
# columns added later in this notebook, so it presumably comes from an
# earlier run with a different tokenizer — confirm with a fresh Run All.
trn_df.loc[0]

textID                                                         cb774db0d1
text                                  I`d have responded, if I were going
selected_text                         I`d have responded, if I were going
sentiment                                                         neutral
predicted_texts         i ` d have responded, if i were going [SEP] [PAD]
manual_selected_text    i ` d have responded, if i were going [SEP] [PAD]
Name: 0, dtype: object

In [74]:
# Jaccard for row 0 after cutting the last 12 characters off the prediction
# (12 is the length of the trailing " [SEP] [PAD]" in the recorded output).
jaccard(trn_df.loc[0]['selected_text'], trn_df.loc[0]['predicted_texts'][:-12])

0.6666666666666666

In [72]:
# Compare both strings after skipping their first 4 / 6 characters (where
# the recorded output differs in spacing around the backtick) and the
# trailing 12 characters of the prediction.
trn_df.loc[0]['selected_text'][4:], trn_df.loc[0]['predicted_texts'][6:-12]

('have responded, if I were going', 'have responded, if i were going')