In [1]:
import re
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel

pd.set_option("display.max_rows", 300)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

## noise analysis

In [None]:
# Read the original training set, then discard every row that has a missing value.
trn_df = pd.read_csv('../inputs/origin/train.csv')
trn_df = trn_df.dropna()
# Show the resulting shape followed by a preview of the first rows.
display(trn_df.shape)
display(trn_df.head())

In [None]:
ckpts = []
!mkdir -p ../checkpoints/e152/best_model_only/
for fckpt in glob('../checkpoints/e152/best/*.pth'):
    ckpt = torch.load(fckpt)
    ckpts.append(ckpt)
    fout_ckpt = {}
    fout_ckpt['model_state_dict'] = ckpt['model_state_dict']
    torch.save(fout_ckpt, re.sub('best', 'best_model_only', fckpt))

In [None]:
def _mk_char_preds(offsets, preds_head, preds_tail):
    char_preds_heads, char_preds_tails = [], []
    # 最初の４つは無視
    for offset, pred_head, pred_tail in tqdm(zip(offsets, preds_head, preds_tail)):
        char_preds_head, char_preds_tail = np.zeros(141), np.zeros(141)
        for offset_i, pred_head_i, pred_tail_i in zip(offset[4:], pred_head[4:], pred_tail[4:]):
            char_preds_head[offset_i[0]:offset_i[1]] = pred_head_i
            char_preds_tail[offset_i[0]:offset_i[1]] = pred_tail_i
        char_preds_heads.append(char_preds_head)
        char_preds_tails.append(char_preds_tail)
    char_preds_heads, char_preds_tails = np.asarray(char_preds_heads), np.asarray(char_preds_tails)
    return char_preds_heads, char_preds_tails

In [None]:
# Inspect which entries the first loaded checkpoint dictionary provides.
ckpts[0].keys()

In [None]:
# Collect character-level start/end predictions for the validation samples of
# every loaded checkpoint, one record per textID.
char_preds = []

for ckpt in ckpts:
    # val_preds[0] / val_preds[1] are passed as the start / end token scores.
    val_char_preds_heads, val_char_preds_tails = _mk_char_preds(ckpt['val_offsets'], ckpt['val_preds'][0],  ckpt['val_preds'][1])
    for textID, val_char_preds_head, val_char_preds_tail in zip(ckpt['val_textIDs'], val_char_preds_heads, val_char_preds_tails):
        char_preds.append({'textID': textID, 'char_pred_start': val_char_preds_head, 'char_pred_end': val_char_preds_tail})
# trn_df is not modified in this cell; the records gathered above are joined
# onto it by textID in the merge step.
trn_df

In [28]:
# Turn the list of per-sample records into a frame
# (columns: textID, char_pred_start, char_pred_end) and preview it.
char_preds_df = pd.DataFrame(data=char_preds)
char_preds_df.head(n=5)

Unnamed: 0,textID,char_pred_start,char_pred_end
0,cb774db0d1,"[0.9993091821670532, 0.9993091821670532, 0.000...","[0.00010164044942939654, 0.0001016404494293965..."
1,549e992a42,"[0.5705369114875793, 0.5705369114875793, 0.570...","[0.00019916410383302718, 0.0001991641038330271..."
2,088c60f138,"[0.44564589858055115, 0.44564589858055115, 0.4...","[0.00031484506325796247, 0.0003148450632579624..."
3,9642c003ef,"[0.1700505018234253, 0.1700505018234253, 0.170...","[0.00022137271298561245, 0.0002213727129856124..."
4,358bd9e861,"[0.7032945156097412, 0.7032945156097412, 0.703...","[0.0006509069353342056, 0.0006509069353342056,..."


In [29]:
# Attach the character-level predictions to the training rows by textID.
# validate='many_to_one' makes pandas raise a MergeError if a textID occurs
# more than once in char_preds_df (e.g. the same sample predicted by two
# checkpoints); without it the left join would silently duplicate those rows.
trn_df = trn_df.merge(char_preds_df, on='textID', how='left', validate='many_to_one')
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,char_pred_start,char_pred_end
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[0.9993091821670532, 0.9993091821670532, 0.000...","[0.00010164044942939654, 0.0001016404494293965..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[0.5705369114875793, 0.5705369114875793, 0.570...","[0.00019916410383302718, 0.0001991641038330271..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[0.44564589858055115, 0.44564589858055115, 0.4...","[0.00031484506325796247, 0.0003148450632579624..."
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[0.1700505018234253, 0.1700505018234253, 0.170...","[0.00022137271298561245, 0.0002213727129856124..."
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[0.7032945156097412, 0.7032945156097412, 0.703...","[0.0006509069353342056, 0.0006509069353342056,..."
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[0.03742819279432297, 0.03742819279432297, 0.0...","[0.00019678355602081865, 0.0001967835560208186..."
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[0.30980750918388367, 0.30980750918388367, 0.0...","[0.00034203799441456795, 0.0003420379944145679..."
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[0.5343130826950073, 0.5343130826950073, 0.003...","[0.0015588045353069901, 0.0015588045353069901,..."
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[0.19598905742168427, 0.19598905742168427, 0.1...","[0.0006552892737090588, 0.0006552892737090588,..."


In [30]:
# Per-column fraction of missing values. After the left merge, a non-zero
# value in a prediction column means some rows received no prediction.
trn_df.isna().mean()

textID             0.0
text               0.0
selected_text      0.0
sentiment          0.0
char_pred_start    0.0
char_pred_end      0.0
dtype: float64

In [31]:
# Make sure the output directory for the e152 character-level predictions exists.
!mkdir -p ../inputs/datasets/char_preds/e152

In [32]:
# Persist only the ID and the two character-level prediction columns for e152.
trn_df.loc[:, ['textID', 'char_pred_start', 'char_pred_end']].to_pickle(
    '../inputs/datasets/char_preds/e152/e152_char_preds.pkl'
)

## e153 (same procedure as for e152)

## noise analysis

In [33]:
# Read the original training set, then discard every row that has a missing value.
trn_df = pd.read_csv('../inputs/origin/train.csv')
trn_df = trn_df.dropna()
# Show the resulting shape followed by a preview of the first rows.
display(trn_df.shape)
display(trn_df.head())

(27480, 4)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [34]:
ckpts = []
!mkdir -p ../checkpoints/e153/best_model_only/
for fckpt in glob('../checkpoints/e153/best/*.pth'):
    ckpt = torch.load(fckpt)
    ckpts.append(ckpt)
    fout_ckpt = {}
    fout_ckpt['model_state_dict'] = ckpt['model_state_dict']
    torch.save(fout_ckpt, re.sub('best', 'best_model_only', fckpt))

In [35]:
def _mk_char_preds(offsets, preds_head, preds_tail):
    char_preds_heads, char_preds_tails = [], []
    # 最初の４つは無視
    for offset, pred_head, pred_tail in tqdm(zip(offsets, preds_head, preds_tail)):
        char_preds_head, char_preds_tail = np.zeros(141), np.zeros(141)
        for offset_i, pred_head_i, pred_tail_i in zip(offset[4:], pred_head[4:], pred_tail[4:]):
            char_preds_head[offset_i[0]:offset_i[1]] = pred_head_i
            char_preds_tail[offset_i[0]:offset_i[1]] = pred_tail_i
        char_preds_heads.append(char_preds_head)
        char_preds_tails.append(char_preds_tail)
    char_preds_heads, char_preds_tails = np.asarray(char_preds_heads), np.asarray(char_preds_tails)
    return char_preds_heads, char_preds_tails

In [36]:
# Inspect which entries the first loaded checkpoint dictionary provides.
ckpts[0].keys()

dict_keys(['fold_num', 'current_epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'val_textIDs', 'val_input_ids', 'val_preds', 'val_labels', 'val_offsets', 'histories'])

In [37]:
# Collect character-level start/end predictions for the validation samples of
# every loaded checkpoint, one record per textID.
char_preds = []

for ckpt in ckpts:
    # val_preds[0] / val_preds[1] are passed as the start / end token scores.
    val_char_preds_heads, val_char_preds_tails = _mk_char_preds(ckpt['val_offsets'], ckpt['val_preds'][0],  ckpt['val_preds'][1])
    for textID, val_char_preds_head, val_char_preds_tail in zip(ckpt['val_textIDs'], val_char_preds_heads, val_char_preds_tails):
        char_preds.append({'textID': textID, 'char_pred_start': val_char_preds_head, 'char_pred_end': val_char_preds_tail})
# trn_df is not modified in this cell; the records gathered above are joined
# onto it by textID in the merge step.
trn_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [38]:
# Turn the list of per-sample records into a frame
# (columns: textID, char_pred_start, char_pred_end) and preview it.
char_preds_df = pd.DataFrame(data=char_preds)
char_preds_df.head(n=5)

Unnamed: 0,textID,char_pred_start,char_pred_end
0,863edfea8b,"[0.3847193419933319, 0.3847193419933319, 0.384...","[0.0691322460770607, 0.0691322460770607, 0.069..."
1,a2e1121aa7,"[0.011290576308965683, 0.011290576308965683, 0...","[0.00036885947338305414, 0.0003688594733830541..."
2,4d3aca8926,"[0.9986562728881836, 0.9986562728881836, 0.998...","[0.00012245692778378725, 0.0001224569277837872..."
3,88a400760a,"[0.2519451975822449, 0.2519451975822449, 0.251...","[0.0004826546064577997, 0.0004826546064577997,..."
4,bbbd337b0b,"[0.4297378957271576, 0.4297378957271576, 0.429...","[0.056389953941106796, 0.056389953941106796, 0..."


In [39]:
# Attach the character-level predictions to the training rows by textID.
# validate='many_to_one' makes pandas raise a MergeError if a textID occurs
# more than once in char_preds_df (e.g. the same sample predicted by two
# checkpoints); without it the left join would silently duplicate those rows.
trn_df = trn_df.merge(char_preds_df, on='textID', how='left', validate='many_to_one')
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,char_pred_start,char_pred_end
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[0.9987541437149048, 0.9987541437149048, 0.000...","[3.688297510962002e-05, 3.688297510962002e-05,..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[0.615781307220459, 0.615781307220459, 0.61578...","[0.00027660999330691993, 0.0002766099933069199..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[0.3109663724899292, 0.3109663724899292, 0.310...","[0.0013144471449777484, 0.0013144471449777484,..."
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[0.2529185712337494, 0.2529185712337494, 0.252...","[0.0008793757879175246, 0.0008793757879175246,..."
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[0.5312957763671875, 0.5312957763671875, 0.531...","[0.0011636412236839533, 0.0011636412236839533,..."
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[0.07795494794845581, 0.07795494794845581, 0.0...","[0.0013425374636426568, 0.0013425374636426568,..."
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[0.24684298038482666, 0.24684298038482666, 0.0...","[0.00044112512841820717, 0.0004411251284182071..."
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[0.29858100414276123, 0.29858100414276123, 0.0...","[0.0009772243211045861, 0.0009772243211045861,..."
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[0.11491180956363678, 0.11491180956363678, 0.1...","[0.0008585219620727003, 0.0008585219620727003,..."


In [40]:
# Per-column fraction of missing values. After the left merge, a non-zero
# value in a prediction column means some rows received no prediction.
trn_df.isna().mean()

textID             0.0
text               0.0
selected_text      0.0
sentiment          0.0
char_pred_start    0.0
char_pred_end      0.0
dtype: float64

In [41]:
# Make sure the output directory for the e153 character-level predictions exists.
!mkdir -p ../inputs/datasets/char_preds/e153

In [42]:
# Persist only the ID and the two character-level prediction columns for e153.
trn_df.loc[:, ['textID', 'char_pred_start', 'char_pred_end']].to_pickle(
    '../inputs/datasets/char_preds/e153/e153_char_preds.pkl'
)

## e154

## noise analysis

In [2]:
# Read the original training set, then discard every row that has a missing value.
trn_df = pd.read_csv('../inputs/origin/train.csv')
trn_df = trn_df.dropna()
# Show the resulting shape followed by a preview of the first rows.
display(trn_df.shape)
display(trn_df.head())

(27480, 4)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
ckpts = []
!mkdir -p ../checkpoints/e154/best_model_only/
for fckpt in glob('../checkpoints/e154/best/*.pth'):
    ckpt = torch.load(fckpt)
    ckpts.append(ckpt)
    fout_ckpt = {}
    fout_ckpt['model_state_dict'] = ckpt['model_state_dict']
    torch.save(fout_ckpt, re.sub('best', 'best_model_only', fckpt))

In [4]:
def _mk_char_preds(offsets, preds_head, preds_tail):
    char_preds_heads, char_preds_tails = [], []
    # 最初の４つは無視
    for offset, pred_head, pred_tail in tqdm(zip(offsets, preds_head, preds_tail)):
        char_preds_head, char_preds_tail = np.zeros(141), np.zeros(141)
        for offset_i, pred_head_i, pred_tail_i in zip(offset[4:], pred_head[4:], pred_tail[4:]):
            char_preds_head[offset_i[0]:offset_i[1]] = pred_head_i
            char_preds_tail[offset_i[0]:offset_i[1]] = pred_tail_i
        char_preds_heads.append(char_preds_head)
        char_preds_tails.append(char_preds_tail)
    char_preds_heads, char_preds_tails = np.asarray(char_preds_heads), np.asarray(char_preds_tails)
    return char_preds_heads, char_preds_tails

In [5]:
# Inspect which entries the first loaded checkpoint dictionary provides.
ckpts[0].keys()

dict_keys(['fold_num', 'current_epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'val_textIDs', 'val_input_ids', 'val_preds', 'val_labels', 'val_offsets', 'histories'])

In [6]:
# Collect character-level start/end predictions for the validation samples of
# every loaded checkpoint, one record per textID.
char_preds = []

for ckpt in ckpts:
    # val_preds[0] / val_preds[1] are passed as the start / end token scores.
    val_char_preds_heads, val_char_preds_tails = _mk_char_preds(ckpt['val_offsets'], ckpt['val_preds'][0],  ckpt['val_preds'][1])
    for textID, val_char_preds_head, val_char_preds_tail in zip(ckpt['val_textIDs'], val_char_preds_heads, val_char_preds_tails):
        char_preds.append({'textID': textID, 'char_pred_start': val_char_preds_head, 'char_pred_end': val_char_preds_tail})
# trn_df is not modified in this cell; the records gathered above are joined
# onto it by textID in the merge step.
trn_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [7]:
# Turn the list of per-sample records into a frame
# (columns: textID, char_pred_start, char_pred_end) and preview it.
char_preds_df = pd.DataFrame(data=char_preds)
char_preds_df.head(n=5)

Unnamed: 0,textID,char_pred_start,char_pred_end
0,cb774db0d1,"[0.9990531802177429, 0.9990531802177429, 0.000...","[8.84408873389475e-05, 8.84408873389475e-05, 2..."
1,549e992a42,"[0.4969521462917328, 0.4969521462917328, 0.496...","[0.00028423359617590904, 0.0002842335961759090..."
2,088c60f138,"[0.3698231279850006, 0.3698231279850006, 0.369...","[0.00030866972520016134, 0.0003086697252001613..."
3,9642c003ef,"[0.12465132027864456, 0.12465132027864456, 0.1...","[0.00037099936162121594, 0.0003709993616212159..."
4,358bd9e861,"[0.5271499752998352, 0.5271499752998352, 0.527...","[0.0005688770324923098, 0.0005688770324923098,..."


In [8]:
# Attach the character-level predictions to the training rows by textID.
# validate='many_to_one' makes pandas raise a MergeError if a textID occurs
# more than once in char_preds_df (e.g. the same sample predicted by two
# checkpoints); without it the left join would silently duplicate those rows.
trn_df = trn_df.merge(char_preds_df, on='textID', how='left', validate='many_to_one')
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,char_pred_start,char_pred_end
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[0.9990531802177429, 0.9990531802177429, 0.000...","[8.84408873389475e-05, 8.84408873389475e-05, 2..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[0.4969521462917328, 0.4969521462917328, 0.496...","[0.00028423359617590904, 0.0002842335961759090..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[0.3698231279850006, 0.3698231279850006, 0.369...","[0.00030866972520016134, 0.0003086697252001613..."
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[0.12465132027864456, 0.12465132027864456, 0.1...","[0.00037099936162121594, 0.0003709993616212159..."
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[0.5271499752998352, 0.5271499752998352, 0.527...","[0.0005688770324923098, 0.0005688770324923098,..."
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[0.0331970751285553, 0.0331970751285553, 0.033...","[0.0008952946518547833, 0.0008952946518547833,..."
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[0.1762423813343048, 0.1762423813343048, 0.000...","[0.0006348831811919808, 0.0006348831811919808,..."
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[0.37417903542518616, 0.37417903542518616, 0.0...","[0.0008316028397530317, 0.0008316028397530317,..."
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[0.1985953152179718, 0.1985953152179718, 0.198...","[0.001234533847309649, 0.001234533847309649, 0..."


In [9]:
# Per-column fraction of missing values. After the left merge, a non-zero
# value in a prediction column means some rows received no prediction.
trn_df.isna().mean()

textID             0.0
text               0.0
selected_text      0.0
sentiment          0.0
char_pred_start    0.0
char_pred_end      0.0
dtype: float64

In [10]:
# Make sure the output directory for the e154 character-level predictions exists.
!mkdir -p ../inputs/datasets/char_preds/e154

In [11]:
# Persist only the ID and the two character-level prediction columns for e154.
trn_df.loc[:, ['textID', 'char_pred_start', 'char_pred_end']].to_pickle(
    '../inputs/datasets/char_preds/e154/e154_char_preds.pkl'
)

## e155

## noise analysis

In [12]:
# Read the original training set, then discard every row that has a missing value.
trn_df = pd.read_csv('../inputs/origin/train.csv')
trn_df = trn_df.dropna()
# Show the resulting shape followed by a preview of the first rows.
display(trn_df.shape)
display(trn_df.head())

(27480, 4)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [13]:
ckpts = []
!mkdir -p ../checkpoints/e155/best_model_only/
for fckpt in glob('../checkpoints/e155/best/*.pth'):
    ckpt = torch.load(fckpt)
    ckpts.append(ckpt)
    fout_ckpt = {}
    fout_ckpt['model_state_dict'] = ckpt['model_state_dict']
    torch.save(fout_ckpt, re.sub('best', 'best_model_only', fckpt))

In [14]:
def _mk_char_preds(offsets, preds_head, preds_tail):
    char_preds_heads, char_preds_tails = [], []
    # 最初の４つは無視
    for offset, pred_head, pred_tail in tqdm(zip(offsets, preds_head, preds_tail)):
        char_preds_head, char_preds_tail = np.zeros(141), np.zeros(141)
        for offset_i, pred_head_i, pred_tail_i in zip(offset[4:], pred_head[4:], pred_tail[4:]):
            char_preds_head[offset_i[0]:offset_i[1]] = pred_head_i
            char_preds_tail[offset_i[0]:offset_i[1]] = pred_tail_i
        char_preds_heads.append(char_preds_head)
        char_preds_tails.append(char_preds_tail)
    char_preds_heads, char_preds_tails = np.asarray(char_preds_heads), np.asarray(char_preds_tails)
    return char_preds_heads, char_preds_tails

In [15]:
# Inspect which entries the first loaded checkpoint dictionary provides.
ckpts[0].keys()

dict_keys(['fold_num', 'current_epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'val_textIDs', 'val_input_ids', 'val_preds', 'val_labels', 'val_offsets', 'histories'])

In [16]:
# Collect character-level start/end predictions for the validation samples of
# every loaded checkpoint, one record per textID.
char_preds = []

for ckpt in ckpts:
    # val_preds[0] / val_preds[1] are passed as the start / end token scores.
    val_char_preds_heads, val_char_preds_tails = _mk_char_preds(ckpt['val_offsets'], ckpt['val_preds'][0],  ckpt['val_preds'][1])
    for textID, val_char_preds_head, val_char_preds_tail in zip(ckpt['val_textIDs'], val_char_preds_heads, val_char_preds_tails):
        char_preds.append({'textID': textID, 'char_pred_start': val_char_preds_head, 'char_pred_end': val_char_preds_tail})
# trn_df is not modified in this cell; the records gathered above are joined
# onto it by textID in the merge step.
trn_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [17]:
# Turn the list of per-sample records into a frame
# (columns: textID, char_pred_start, char_pred_end) and preview it.
char_preds_df = pd.DataFrame(data=char_preds)
char_preds_df.head(n=5)

Unnamed: 0,textID,char_pred_start,char_pred_end
0,cb774db0d1,"[0.9997105002403259, 0.9997105002403259, 3.479...","[1.4039649613550864e-05, 1.4039649613550864e-0..."
1,549e992a42,"[0.5332648754119873, 0.5332648754119873, 0.533...","[0.0002854808117263019, 0.0002854808117263019,..."
2,088c60f138,"[0.3835342526435852, 0.3835342526435852, 0.383...","[0.00037456306745298207, 0.0003745630674529820..."
3,9642c003ef,"[0.11737676709890366, 0.11737676709890366, 0.1...","[0.0005230818642303348, 0.0005230818642303348,..."
4,358bd9e861,"[0.5843451023101807, 0.5843451023101807, 0.584...","[0.0002049447357421741, 0.0002049447357421741,..."


In [18]:
# Attach the character-level predictions to the training rows by textID.
# validate='many_to_one' makes pandas raise a MergeError if a textID occurs
# more than once in char_preds_df (e.g. the same sample predicted by two
# checkpoints); without it the left join would silently duplicate those rows.
trn_df = trn_df.merge(char_preds_df, on='textID', how='left', validate='many_to_one')
trn_df

Unnamed: 0,textID,text,selected_text,sentiment,char_pred_start,char_pred_end
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[0.9997105002403259, 0.9997105002403259, 3.479...","[1.4039649613550864e-05, 1.4039649613550864e-0..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[0.5332648754119873, 0.5332648754119873, 0.533...","[0.0002854808117263019, 0.0002854808117263019,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[0.3835342526435852, 0.3835342526435852, 0.383...","[0.00037456306745298207, 0.0003745630674529820..."
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[0.11737676709890366, 0.11737676709890366, 0.1...","[0.0005230818642303348, 0.0005230818642303348,..."
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[0.5843451023101807, 0.5843451023101807, 0.584...","[0.0002049447357421741, 0.0002049447357421741,..."
...,...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[0.053992561995983124, 0.053992561995983124, 0...","[0.0022145104594528675, 0.0022145104594528675,..."
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[0.2607862651348114, 0.2607862651348114, 0.000...","[0.00017676873540040106, 0.0001767687354004010..."
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[0.3742068409919739, 0.3742068409919739, 0.001...","[0.0006870274082757533, 0.0006870274082757533,..."
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[0.20651544630527496, 0.20651544630527496, 0.2...","[0.00034741268609650433, 0.0003474126860965043..."


In [19]:
# Per-column fraction of missing values. After the left merge, a non-zero
# value in a prediction column means some rows received no prediction.
trn_df.isna().mean()

textID             0.0
text               0.0
selected_text      0.0
sentiment          0.0
char_pred_start    0.0
char_pred_end      0.0
dtype: float64

In [20]:
# Make sure the output directory for the e155 character-level predictions exists.
!mkdir -p ../inputs/datasets/char_preds/e155

In [21]:
# Persist only the ID and the two character-level prediction columns for e155.
trn_df.loc[:, ['textID', 'char_pred_start', 'char_pred_end']].to_pickle(
    '../inputs/datasets/char_preds/e155/e155_char_preds.pkl'
)