In [240]:
import gc

from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel, RobertaTokenizer

pd.set_option('display.max_rows', 600)

## e007 の valid を確認する

In [207]:
# Load the training data and keep only the rows that have a selected_text
# label, then show the resulting shape and the first rows.
trn_df = (
    pd.read_csv('../inputs/origin/train.csv')
    .loc[lambda df: df['selected_text'].notnull()]
)
display(trn_df.shape, trn_df.head())

(27480, 4)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [178]:
ls ../checkpoints/e007/best_for_analysis/

epoch_2_1.85479_-1.00000_0.71094_checkpoint.pth
epoch_2_1.85522_-1.00000_0.70647_checkpoint.pth
epoch_3_1.85356_-1.00000_0.70685_checkpoint.pth
epoch_3_1.88161_-1.00000_0.70257_checkpoint.pth
epoch_3_1.89511_-1.00000_0.70562_checkpoint.pth


In [199]:
from glob import glob

# Sort the matches: glob() returns paths in filesystem-dependent order, and
# the position in this list becomes the key of each checkpoint loaded below.
pths = sorted(glob('../checkpoints/e007/best_for_analysis/*'))
# pths = sorted(glob('../checkpoints/e007/best/*'))

In [200]:
# Load every checkpoint file into a dict keyed by its position in `pths`.
# map_location='cpu' keeps the load working on machines without a GPU; this
# notebook only reads the stored validation ids / predictions / labels.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
ckpts = {}

for i, filepath in enumerate(pths):
    ckpts[i] = torch.load(filepath, map_location='cpu')

In [201]:
# Inspect which entries the first loaded checkpoint dict contains.
ckpts[0].keys()

dict_keys(['fold_num', 'current_epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'val_textIDs', 'val_input_ids', 'val_preds', 'val_labels', 'histories'])

In [202]:
def _get_predicted_texts(input_ids, y_preds_head, y_preds_tail, tokenizer):
    predicted_texts = []
    for input_id, y_pred_head, y_pred_tail \
            in zip(input_ids, y_preds_head, y_preds_tail):
        pred_label_head = y_pred_head.argmax()
        pred_label_tail = y_pred_tail.argmax()
        predicted_text = tokenizer.decode(
            input_id[pred_label_head:pred_label_tail])
        predicted_texts.append(predicted_text)

    return predicted_texts

In [203]:
def _get_selected_texts(input_ids, labels_head, labels_tail, tokenizer):
    predicted_texts = []
    for input_id, pred_label_head, pred_label_tail \
            in zip(input_ids, labels_head, labels_tail):
        predicted_text = tokenizer.decode(
            input_id[pred_label_head:pred_label_tail])
        predicted_texts.append(predicted_text)

    return predicted_texts

In [204]:
def _get_unknown_nums(input_ids, labels_head, labels_tail, tokenizer):
    unknown_nums = []
    for input_id, pred_label_head, pred_label_tail \
            in zip(input_ids, labels_head, labels_tail):
        unknown_nums.append((np.asarray(input_id) == 3).sum())

    return unknown_nums

In [205]:
import sys
sys.path.append('../')
from tools.tokenizers import myRobertaByteLevelBPETokenizer

# Build the project's byte-level BPE tokenizer from the local RoBERTa
# vocab/merges files, configured with lowercase=True and add_prefix_space=True.
tokenizer = myRobertaByteLevelBPETokenizer(
    vocab_file='../inputs/datasets/roberta/tokenizer/vocab.json',
    merges_file='../inputs/datasets/roberta/tokenizer/merges.txt',
    lowercase=True,
    add_prefix_space=True,
)

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [208]:
# Write each checkpoint's decoded validation predictions into the rows with
# the matching textID; textID is used as a temporary index for the lookup.
trn_df = trn_df.set_index('textID')

for ckpt_key, ckpt in ckpts.items():
    predicted_texts = _get_predicted_texts(
        input_ids=ckpt['val_input_ids'],
        y_preds_head=ckpt['val_preds'][0],
        y_preds_tail=ckpt['val_preds'][1],
        tokenizer=tokenizer,
    )
    trn_df.loc[ckpt['val_textIDs'], 'predicted_texts'] = predicted_texts

trn_df = trn_df.reset_index()

trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****,"
...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth it ****.


In [209]:
# Write the text decoded from each checkpoint's validation labels into the
# rows with the matching textID; textID is a temporary index for the lookup.
trn_df = trn_df.set_index('textID')

for ckpt_key, ckpt in ckpts.items():
    manual_selected_texts = _get_selected_texts(
        input_ids=ckpt['val_input_ids'],
        labels_head=ckpt['val_labels'][0],
        labels_tail=ckpt['val_labels'][1],
        tokenizer=tokenizer,
    )
    trn_df.loc[ckpt['val_textIDs'], 'manual_selected_text'] = manual_selected_texts

trn_df = trn_df.reset_index()

trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going","i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying,bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****,","sons of ****,"
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost,husband lost
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered,", don`t force"
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good,yay good for both of you.
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth it ****.,but it was worth it ****.


In [210]:
# Write the per-sample count returned by _get_unknown_nums into the rows
# with the matching textID; textID is a temporary index for the lookup.
trn_df = trn_df.set_index('textID')

for ckpt_key, ckpt in ckpts.items():
    unknown_nums = _get_unknown_nums(
        input_ids=ckpt['val_input_ids'],
        labels_head=ckpt['val_labels'][0],
        labels_tail=ckpt['val_labels'][1],
        tokenizer=tokenizer,
    )
    trn_df.loc[ckpt['val_textIDs'], 'unknown_nums'] = unknown_nums

trn_df = trn_df.reset_index()

trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,unknown_nums
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going","i`d have responded, if i were going",0.0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad,sooo sad,0.0
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying,bullying me,0.0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,leave me alone,0.0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****,","sons of ****,",0.0
...,...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost,husband lost,0.0
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered,", don`t force",0.0
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good,yay good for both of you.,0.0
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth it ****.,but it was worth it ****.,0.0


In [211]:
# No unknown tokens: 0 is the only value that occurs.
trn_df['unknown_nums'].value_counts()

0.0    27480
Name: unknown_nums, dtype: int64

In [212]:
# Print the null count of selected_text before and after filling with ''.
print(trn_df['selected_text'].isnull().sum())
trn_df['selected_text'] = trn_df['selected_text'].fillna('')
print(trn_df['selected_text'].isnull().sum())

# Lower-cased selected_text with one leading space prepended.
trn_df['selected_text_lower'] = trn_df['selected_text'].map(
    lambda text: ' ' + text.lower())

0
0


In [213]:
# Collapse whitespace runs in the lower-cased label and re-add one leading space.
trn_df['selected_text_lower'] = trn_df['selected_text_lower'].apply(lambda x: ' ' + ' '.join(x.split()))
# Keep only rows that received a manual_selected_text value. .copy() makes
# the filtered frame independent, so the column assignments that follow do
# not operate on a slice of the previous frame (SettingWithCopyWarning).
trn_df = trn_df[trn_df['manual_selected_text'].notnull()].copy()
# Apply the same whitespace normalisation to the text decoded from the labels.
trn_df['manual_selected_text'] = trn_df['manual_selected_text'].apply(lambda x: ' ' + ' '.join(x.split()))

In [214]:
# Flag rows where the text decoded from the labels is identical to the
# normalised, lower-cased original selected_text.
trn_df['manual_equal_selected'] = (
    trn_df['manual_selected_text'].eq(trn_df['selected_text_lower']).values
)
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,unknown_nums,selected_text_lower,manual_equal_selected
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going","i`d have responded, if i were going",0.0,"i`d have responded, if i were going",True
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad,sooo sad,0.0,sooo sad,True
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying,bullying me,0.0,bullying me,True
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,leave me alone,0.0,leave me alone,True
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****,","sons of ****,",0.0,"sons of ****,",True
...,...,...,...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost,husband lost,0.0,d lost,False
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered,", don`t force",0.0,", don`t force",True
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good,yay good for both of you.,0.0,yay good for both of you.,True
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth it ****.,but it was worth it ****.,0.0,but it was worth it ****.,True


In [215]:
# Count rows where the decoded label text matches / does not match selected_text_lower.
trn_df.manual_equal_selected.value_counts()

True     24830
False     2650
Name: manual_equal_selected, dtype: int64

In [249]:
# Number of distinct whitespace-separated words shared by the decoded label
# text and the normalised original label, per row.
trn_df['num_intersection'] = trn_df.apply(
    lambda row: len(
        set(row['manual_selected_text'].split()).intersection(
            row['selected_text_lower'].split())),
    axis=1,
)

In [250]:
# Distribution of the shared-word count, ordered by the count value.
trn_df['num_intersection'].value_counts().sort_index()

0      500
1     7126
2     2331
3     2134
4     1968
5     1592
6     1367
7     1171
8     1002
9      875
10     815
11     732
12     661
13     584
14     553
15     507
16     464
17     457
18     430
19     394
20     417
21     331
22     316
23     290
24     210
25     114
26      77
27      40
28      14
29       8
Name: num_intersection, dtype: int64

In [241]:
# Rows where the decoded label text differs from the original label,
# restricted to the columns needed to compare them by eye.
(
    trn_df
    .query('not manual_equal_selected')
    .loc[:, ['text', 'sentiment', 'manual_selected_text', 'selected_text_lower', 'predicted_texts']]
)

Unnamed: 0,text,sentiment,manual_selected_text,selected_text_lower,predicted_texts
18,is back home now gonna miss every one,negative,<s> is,onna,miss
27,On the way to Malaysia...no internet access to...,negative,...no internet,.no internet,no internet access to twit
32,If it is any consolation I got my BMI tested ...,negative,ed well so much for being unhappy for about 10,well so much for being unhappy for about 10 m...,unhappy
39,A little happy for the wine jeje ok it`sm my f...,positive,<s> a little happy,a little happy fo,happy
48,"i donbt like to peel prawns, i also dont like ...",negative,also dont like,dont like go,i donbt like
...,...,...,...,...,...
27455,i wanna leave work already! Not feelin it 2day,negative,i wanna leave work,wanna leave work al,not feelin it 2day
27469,lol i know and haha..did you fall asleep?? o...,negative,get bored,t bored,bored
27473,So I get up early and I feel good about the da...,positive,and i feel good,i feel good ab,good
27475,wish we could come see u on Denver husband l...,negative,husband lost,d lost,lost


In [218]:
# Drop the reference to the loaded checkpoints and trigger a garbage-collection pass.
del ckpts
gc.collect()

40

## パターンをしぼりだす
 - 原因
     - 
 - 影響
     - 合う部分が一つもなくて、default の best_matched_i = 0 が使われている

#### <s\>, </s\>  を含む場合 (これはどうにか除去しないと...)
 - なんでこうなってる？
 - 場当たり的に対応するのは可能？

In [219]:
# Rows whose decoded label text contains the literal '<s>' marker.
trn_df[trn_df['manual_selected_text'].str.contains('<s>', na=False)][
    ['text', 'sentiment', 'manual_selected_text', 'selected_text_lower', 'predicted_texts']
]

Unnamed: 0,text,sentiment,manual_selected_text,selected_text_lower,predicted_texts
18,is back home now gonna miss every one,negative,<s> is,onna,miss
39,A little happy for the wine jeje ok it`sm my f...,positive,<s> a little happy,a little happy fo,happy
134,Nice to see you tweeting! It`s Sunday 10th M...,positive,<s> nice,e nice,nice
168,Few Bevvies 2day in twn..great on a day off!!,positive,<s>,great,great
189,lost my tooth 2day whilst i was eating gum...oww,negative,<s> lost,oww,lost
247,I`m not sleeping at all until accepts my appo...,negative,<s> i`m not sleeping at all,i`m not sleeping at all un,i`m not sleeping at all
297,Im not bannished... but I am at work till 6,negative,<s> im not bannished,im not bannished.,im not bannished
309,"I know It was worth a shot, though!",positive,<s> i know,as wort,worth
348,"90 degrees, gross skies, and thunderstorms...p...",positive,<s> 90,perfect ma,perfect
361,Please Review Sunehre Ad Placement http://tin...,positive,<s> please,please re,please review


In [238]:
# Example row (index label 4046): the labelled span.
trn_df.loc[4046].selected_text

'Unfortunately'

In [237]:
# Same example row: the full tweet text the span was taken from.
trn_df.loc[4046].text

'Well on a normal day I`d already be done with work...Unfortunately this is not a normal day. Which means I`ll be in the office till late.'

In [226]:
# Number of rows whose predicted text is the empty string.
(trn_df.predicted_texts == '').sum()

180

In [220]:
# Rows whose decoded label text contains the literal '</s>' marker.
trn_df[trn_df['manual_selected_text'].str.contains('</s>', na=False)]

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,unknown_nums,selected_text_lower,manual_equal_selected
7441,d12148ecc8,I need a wardrobe intervention,I need a wardrobe interventio,neutral,i need a wardrobe intervention,<s> i need a wardrobe intervention</s>,0.0,i need a wardrobe interventio,False
7836,b30b442df8,invite sent You can now invite more collabs...,invite sent You can now invite more collabs ...,neutral,invite sent you can now invite more collabs t...,<s> invite sent you can now invite more colla...,0.0,invite sent you can now invite more collabs t...,False
15164,feb7ba9cc1,hehe i will never thorw out these shoes i`m ...,hehe i will never thorw out these shoes i`m l...,positive,awesome,<s> hehe i will never thorw out these shoes i...,0.0,hehe i will never thorw out these shoes i`m l...,False


In [221]:
# For comparison: none of the predicted texts contain '<s>'.
trn_df[trn_df['predicted_texts'].str.contains('<s>', na=False)]

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,unknown_nums,selected_text_lower,manual_equal_selected


In [222]:
# For comparison: none of the predicted texts contain '</s>' either.
trn_df[trn_df['predicted_texts'].str.contains('</s>', na=False)]

Unnamed: 0,textID,text,selected_text,sentiment,predicted_texts,manual_selected_text,unknown_nums,selected_text_lower,manual_equal_selected


In [227]:
# Incidentally, there are quite a few empty-string predictions.
(trn_df.predicted_texts == '').sum()

180

In [223]:
# Probe: token ids the tokenizer produces for text containing a literal '<s>'.
tokenizer.encode(' <s> sorry').ids

[28696, 29, 15698, 6661]

In [91]:
# Probe: token ids for a word with its last character cut off.
tokenizer.encode(' sorr').ids

[22696, 338]

In [101]:
# Probe: token strings for the complete word.
tokenizer.encode(' sorry').tokens

['Ġsorry']

In [229]:
# Probe: how words joined by '...' (no surrounding spaces) are tokenized.
tokenizer.encode('word instead...and rsvp...sorry i cant come t').tokens

['Ġword',
 'Ġinstead',
 '...',
 'and',
 'Ġr',
 'sv',
 'p',
 '...',
 'sorry',
 'Ġi',
 'Ġcant',
 'Ġcome',
 'Ġt']

In [232]:
# Probe: tokenization of a full example tweet that contains 'work...Unfortunately'.
tokenizer.encode('Well on a normal day I`d already be done with work...Unfortunately this is not a normal day. Which means I`ll be in the office till late.').tokens

['Ġwell',
 'Ġon',
 'Ġa',
 'Ġnormal',
 'Ġday',
 'Ġi',
 '`',
 'd',
 'Ġalready',
 'Ġbe',
 'Ġdone',
 'Ġwith',
 'Ġwork',
 '...',
 'un',
 'fortunately',
 'Ġthis',
 'Ġis',
 'Ġnot',
 'Ġa',
 'Ġnormal',
 'Ġday',
 '.',
 'Ġwhich',
 'Ġmeans',
 'Ġi',
 '`',
 'll',
 'Ġbe',
 'Ġin',
 'Ġthe',
 'Ġoffice',
 'Ġtill',
 'Ġlate',
 '.']

In [234]:
# Whitespace split of the same tweet, for comparison with the tokenizer output.
'Well on a normal day I`d already be done with work...Unfortunately this is not a normal day. Which means I`ll be in the office till late.'.split()

['Well',
 'on',
 'a',
 'normal',
 'day',
 'I`d',
 'already',
 'be',
 'done',
 'with',
 'work...Unfortunately',
 'this',
 'is',
 'not',
 'a',
 'normal',
 'day.',
 'Which',
 'means',
 'I`ll',
 'be',
 'in',
 'the',
 'office',
 'till',
 'late.']

In [23]:
# Load the pretrained 'roberta-base' tokenizer from transformers as a reference.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [24]:
# Show which strings the reference tokenizer treats as special tokens.
roberta_tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [26]:
# Ids the reference tokenizer returns for the text '<unk>'.
roberta_tokenizer.encode('<unk>')

[0, 3, 2]

In [55]:
# Decode id 2 with the reference tokenizer to see which token it stands for.
roberta_tokenizer.decode([2])

'</s>'