In [1]:
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel

pd.set_option("display.max_rows", 300)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

## Noise analysis: per-character validation predictions

In [14]:
# Read the raw training data, discarding any row that has a missing field.
trn_df = pd.read_csv('../inputs/origin/train.csv')
trn_df = trn_df.dropna()
# Show the frame's dimensions followed by its first rows.
display(trn_df.shape, trn_df.head())

(27480, 4)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [7]:
# Load every checkpoint found under e150/best.
# - Paths are sorted so `ckpts` has a deterministic order (glob's order is
#   filesystem-dependent).
# - `map_location='cpu'` lets the cell run on a machine without a GPU in case
#   the checkpoints were saved from one; nothing below needs GPU tensors.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
ckpts = [
    torch.load(fckpt, map_location='cpu')
    for fckpt in sorted(glob('../checkpoints/e150/best/*.pth'))
]

In [8]:
def _mk_char_preds(offsets, preds_head, preds_tail, max_len=141, n_skip_tokens=4):
    """Spread token-level head/tail predictions over character positions.

    For every sample, each token's prediction value is written into the
    character span ``[start, end)`` given by that token's offset pair,
    yielding one fixed-length vector per sample. Positions not covered by
    any token stay at 0.

    Args:
        offsets: per-sample sequences of ``(start, end)`` character offsets,
            one pair per token.
        preds_head: per-sample sequences of per-token "head" values, aligned
            with ``offsets``.
        preds_tail: per-sample sequences of per-token "tail" values, aligned
            with ``offsets``.
        max_len: length of each per-character vector. Defaults to 141, the
            previously hard-coded value. Spans reaching past ``max_len`` are
            clipped by NumPy slice assignment without an error.
        n_skip_tokens: number of leading tokens to ignore in every sample.
            Defaults to 4, the previously hard-coded value (presumably the
            special/prefix tokens; confirm against the tokenisation code).

    Returns:
        ``(char_preds_heads, char_preds_tails)``: two float arrays of shape
        ``(n_samples, max_len)``. For empty input both have shape
        ``(0, max_len)``.
    """
    # Materialise the triples so tqdm knows the total; wrapping a bare zip
    # left the progress bar indeterminate.
    samples = list(zip(offsets, preds_head, preds_tail))
    char_preds_heads = np.zeros((len(samples), max_len))
    char_preds_tails = np.zeros((len(samples), max_len))

    for row, (offset, pred_head, pred_tail) in enumerate(tqdm(samples)):
        # 1-D views into the output rows; writes go straight into the result.
        head_row, tail_row = char_preds_heads[row], char_preds_tails[row]
        # Ignore the first `n_skip_tokens` tokens of each sample.
        tokens = zip(offset[n_skip_tokens:], pred_head[n_skip_tokens:], pred_tail[n_skip_tokens:])
        for offset_i, pred_head_i, pred_tail_i in tokens:
            # A zero-width span (start == end) selects nothing and is a no-op.
            head_row[offset_i[0]:offset_i[1]] = pred_head_i
            tail_row[offset_i[0]:offset_i[1]] = pred_tail_i

    return char_preds_heads, char_preds_tails

In [9]:
# List the fields stored in the first loaded checkpoint
# (the other checkpoints are assumed to share this layout; TODO confirm).
ckpts[0].keys()

dict_keys(['fold_num', 'current_epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'val_textIDs', 'val_input_ids', 'val_preds', 'val_labels', 'val_offsets', 'histories'])

In [15]:
# Give `trn_df` a clean 0..n-1 index once. This cell used to round-trip
# `set_index('textID')` / `reset_index()` for every checkpoint to support a
# `.loc` assignment that is no longer used; the only lasting effect was this
# index reset, so it is done a single time here.
trn_df = trn_df.reset_index(drop=True)

# Build one record per validation sample across all loaded checkpoints:
# the sample's textID plus its per-character head/tail prediction vectors.
char_preds = []
for ckpt in ckpts:
    # `val_preds[0]` holds the head predictions, `val_preds[1]` the tail ones.
    token_preds_head, token_preds_tail = ckpt['val_preds'][0], ckpt['val_preds'][1]
    val_char_preds_heads, val_char_preds_tails = _mk_char_preds(
        ckpt['val_offsets'], token_preds_head, token_preds_tail
    )
    char_preds.extend(
        {'textID': textID, 'char_pred_start': char_head, 'char_pred_end': char_tail}
        for textID, char_head, char_tail
        in zip(ckpt['val_textIDs'], val_char_preds_heads, val_char_preds_tails)
    )
trn_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [16]:
# Turn the collected records into a frame: one row per validation sample,
# with columns textID / char_pred_start / char_pred_end.
char_preds_df = pd.DataFrame(data=char_preds)
char_preds_df.head(5)

Unnamed: 0,textID,char_pred_start,char_pred_end
0,cb774db0d1,"[0.9995718598365784, 0.9995718598365784, 3.184...","[4.055531098856591e-05, 4.055531098856591e-05,..."
1,549e992a42,"[0.5854580998420715, 0.5854580998420715, 0.585...","[0.000700252887327224, 0.000700252887327224, 0..."
2,088c60f138,"[0.4091542363166809, 0.4091542363166809, 0.409...","[0.0003013552923221141, 0.0003013552923221141,..."
3,9642c003ef,"[0.09714755415916443, 0.09714755415916443, 0.0...","[0.00011253724369453266, 0.0001125372436945326..."
4,358bd9e861,"[0.44711464643478394, 0.44711464643478394, 0.4...","[0.0007916235481388867, 0.0007916235481388867,..."


In [23]:
# Attach the per-character validation predictions to the training frame.
# - Prediction columns left over from an earlier run of this cell are dropped
#   first, so re-running it does not create `_x`/`_y` suffixed duplicates.
# - `validate='many_to_one'` raises if a textID occurs more than once in
#   `char_preds_df`, which would otherwise silently duplicate training rows.
trn_df = (
    trn_df
    .drop(columns=['char_pred_start', 'char_pred_end'], errors='ignore')
    .merge(char_preds_df, on='textID', how='left', validate='many_to_one')
)
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,char_pred_start,char_pred_end
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[0.9995718598365784, 0.9995718598365784, 3.184...","[4.055531098856591e-05, 4.055531098856591e-05,..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[0.5854580998420715, 0.5854580998420715, 0.585...","[0.000700252887327224, 0.000700252887327224, 0..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[0.4091542363166809, 0.4091542363166809, 0.409...","[0.0003013552923221141, 0.0003013552923221141,..."
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[0.09714755415916443, 0.09714755415916443, 0.0...","[0.00011253724369453266, 0.0001125372436945326..."
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[0.44711464643478394, 0.44711464643478394, 0.4...","[0.0007916235481388867, 0.0007916235481388867,..."
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[0.055910319089889526, 0.055910319089889526, 0...","[0.0026183289010077715, 0.0026183289010077715,..."
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[0.2434491068124771, 0.2434491068124771, 0.000...","[0.0005515196826308966, 0.0005515196826308966,..."
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[0.35198235511779785, 0.35198235511779785, 0.0...","[0.0006619470659643412, 0.0006619470659643412,..."
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[0.2040465772151947, 0.2040465772151947, 0.204...","[0.0006623523076996207, 0.0006623523076996207,..."


In [25]:
# Fraction of missing values per column; a non-zero value in the prediction
# columns would mean some rows found no match in the left merge.
trn_df.isna().mean()

textID             0.0
text               0.0
selected_text      0.0
sentiment          0.0
char_pred_start    0.0
char_pred_end      0.0
dtype: float64

In [19]:
# Create the output directory (and any missing parents) for the exported predictions.
!mkdir -p ../inputs/datasets/char_preds/e150

In [27]:
# Persist only the sample ID and its two per-character prediction columns.
trn_df.loc[:, ['textID', 'char_pred_start', 'char_pred_end']].to_pickle(
    '../inputs/datasets/char_preds/e150/e150_char_preds.pkl'
)