In [119]:
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel

pd.set_option("display.max_rows", 300)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

## noise analysis

In [120]:
trn_df = pd.read_csv('../inputs/nes_info/e080_dataset_trn_df.csv').dropna()
display(trn_df.shape, trn_df.head())

(27423, 12)

Unnamed: 0,textID,text,selected_text,sentiment,my_text,my_selected_text,my_text_eq_my_selected_text,predicted_texts,manual_selected_text,selected_text_lower,manual_and_selected_intersection_len,manual_and_selected_tokenized_intersection_len
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going","I`d have responded, if I were going",True,"i`d have responded, if i were going","i`d have responded, if i were going","i`d have responded, if i were going",7,9
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,False,sooo sad,sooo sad,sooo sad,2,3
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...,bullying me,False,bullying,bullying me,bullying me,2,2
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone,leave me alone,False,leave me alone,leave me alone,leave me alone,3,3
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"Sons of ****, why couldn`t they put them on th...","Sons of ****,",False,"sons of ****,","sons of ****,","sons of ****,",3,4


In [121]:
import sys
sys.path.append('../')
from tools.metrics import jaccard

In [122]:
trn_df['jaccard'] = trn_df.apply(lambda row: jaccard(row['predicted_texts'], row['selected_text_lower']), axis=1)

In [123]:
trn_df['manual_equal_select'] = trn_df.manual_selected_text.apply(lambda x: x.split()) == trn_df.selected_text_lower.apply(lambda x: x.split())
trn_df['manual_equal_select'].mean()

0.925974546913175

In [124]:
def kernel_pospro1(predicted_text):
    if len(predicted_text.split()) == 1: 
        predicted_text = predicted_text.replace('!!!!', '!')
        predicted_text = predicted_text.replace('..', '.')
        predicted_text = predicted_text.replace('...', '.')
    return predicted_text

trn_df['fixed_manual_selected_text'] = trn_df.manual_selected_text.apply(lambda x: kernel_pospro1(x))

trn_df['fixed_manual_equal_select'] = trn_df.fixed_manual_selected_text.apply(lambda x: x.split()) == trn_df.selected_text_lower.apply(lambda x: x.split())
trn_df['fixed_manual_equal_select'].mean()

0.9277613681945812

In [125]:
pd.crosstab(trn_df['fixed_manual_equal_select'], trn_df['manual_equal_select'])

manual_equal_select,False,True
fixed_manual_equal_select,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1926,55
True,104,25338


#### とりあえず
 - `先頭・お尻` について `[.!]・[alphabet][それ以外]` で分類してみる

In [131]:
# start offset != 0 の場合はお尻がズレている
trn_df['start_offset'] = trn_df.apply(lambda row: " ".join(row['manual_selected_text'].split()).find(" ".join(row['selected_text_lower'].split())), axis=1)
trn_df['end_offset'] = trn_df.apply(lambda row: " ".join(row['manual_selected_text'].split())[::-1].find(" ".join(row['selected_text_lower'].split())[::-1]), axis=1)
trn_df.query('sentiment != "neutral" and not manual_equal_select')['start_offset'].value_counts()

0     909
2     303
1     299
3     145
4      60
5      46
6      27
8       9
7       8
9       2
11      1
10      1
Name: start_offset, dtype: int64

In [128]:
# start offset != 0 の場合はお尻がズレている
trn_df['fixed_start_offset'] = trn_df.apply(lambda row: " ".join(row['fixed_manual_selected_text'].split()).find(" ".join(row['selected_text_lower'].split())), axis=1)
trn_df['fixed_end_offset'] = trn_df.apply(lambda row: " ".join(row['fixed_manual_selected_text'].split())[::-1].find(" ".join(row['selected_text_lower'].split())[::-1]), axis=1)
trn_df.query('sentiment != "neutral" and not fixed_manual_equal_select')['fixed_start_offset'].value_counts()

 0     799
 1     309
 2     288
 3     142
-1      62
 4      59
 5      46
 6      27
 8       9
 7       8
 9       2
 11      1
 10      1
Name: fixed_start_offset, dtype: int64

In [10]:
pd.crosstab((trn_df.query('not manual_equal_select').start_offset != 0).rename('start offset is not 0'), (trn_df.query('not manual_equal_select').end_offset != 0).rename('end offset is not 0'), margins=True)

end offset is not 0,False,True,All
start offset is not 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0,1091,1091
True,822,117,939
All,822,1208,2030


In [11]:
trn_df.query('sentiment != "neutral" and not manual_equal_select and start_offset != 0 and end_offset != 0')[['sentiment', 'text', 'selected_text_lower', 'manual_selected_text', 'predicted_texts']]

Unnamed: 0,sentiment,text,selected_text_lower,manual_selected_text,predicted_texts
309,positive,"I know It was worth a shot, though!",as wort,was worth,worth
756,positive,Thanks Amy! That video is so awesome! Did y...,e`s amazi,he`s amazing,thanks amy! that video is so awesome!
808,positive,"Yes, you should go see Star Trek! It`s sooo...",ch fu,much fun,it`s sooooo much fun!
1076,positive,Up is out? I didn`t get the memo It looks ...,o it looks amazi,memo it looks amazing,it looks amazing.
1362,positive,"hey i loved ACS but i had to see it online, i...","y i loved acs but i had to see it online, is ...","hey i loved acs but i had to see it online, i...",you look amazing
1446,positive,oh ok. Well good for you...can I get some wea...,l good for you.,well good for you...,good for you...
1517,negative,haha IE...it`s expensive!!!! Hm...let me mess...,.it`s expensive!,...it`s expensive!!!!,expensive!!!!
1547,positive,had a little family party tonight hope it ...,t hope it rock,tonight hope it rocked,hope it rocked
2007,negative,school and the guy i like was talking to the g...,.ughhhh!,...ughhhh!!,ppl are just sooo...ughhhh!!
2224,positive,Thank you so much. That was so nice of you a...,d i was happy to hear you voice you`ve really...,and i was happy to hear you voice you`ve real...,thank you so much. that was so nice of you an...


In [126]:
trn_df['fixed_start_with_[.!]'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[.!]')
trn_df['fixed_start_with_.'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[.]')
trn_df['fixed_start_with_,'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[,]')
trn_df['fixed_start_with_!'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[!]')
trn_df['fixed_start_with_?'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[?]')
trn_df['fixed_end_by_[.!]'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[.!]$')
trn_df['fixed_end_by_.'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[.]$')
trn_df['fixed_end_by_,'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[,]$')
trn_df['fixed_end_by_!'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[!]$')
trn_df['fixed_end_by_?'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[?]$')
trn_df['fixed_start_with_alphabet'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[a-zA-Z]')
trn_df['fixed_end_by_alphabet'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[a-zA-Z]$')
trn_df['fixed_start_with_number'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[0-9]')
trn_df['fixed_end_by_number'] = trn_df.fixed_manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[0-9]$')

In [127]:
trn_df['start_with_[.!]'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[.!]')
trn_df['start_with_.'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[.]')
trn_df['start_with_,'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[,]')
trn_df['start_with_!'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[!]')
trn_df['start_with_?'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[?]')
trn_df['end_by_[.!]'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[.!]$')
trn_df['end_by_.'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[.]$')
trn_df['end_by_,'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[,]$')
trn_df['end_by_!'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[!]$')
trn_df['end_by_?'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[?]$')
trn_df['start_with_alphabet'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[a-zA-Z]')
trn_df['end_by_alphabet'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[a-zA-Z]$')
trn_df['start_with_number'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('^[0-9]')
trn_df['end_by_number'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.contains('[0-9]$')

In [129]:
start_offset_nonzero_trn_df = target_df.query('start_offset != 0')
pd.crosstab(
    (start_offset_nonzero_trn_df.start_offset != 0).rename('start_offset is not 0'),
    [start_offset_nonzero_trn_df['start_with_.'], start_offset_nonzero_trn_df['start_with_,'], start_offset_nonzero_trn_df['start_with_!'], start_offset_nonzero_trn_df['start_with_?'], start_offset_nonzero_trn_df['start_with_alphabet'], start_offset_nonzero_trn_df['start_with_number']],
    margins=True
)

start_with_.,False,False,False,False,False,True,All
"start_with_,",False,False,False,False,False,False,Unnamed: 7_level_1
start_with_!,False,False,False,False,True,False,Unnamed: 7_level_2
start_with_?,False,False,False,True,False,False,Unnamed: 7_level_3
start_with_alphabet,False,False,True,False,False,False,Unnamed: 7_level_4
start_with_number,False,True,False,False,False,False,Unnamed: 7_level_5
start_offset is not 0,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6
True,8,2,762,8,18,141,939
All,8,2,762,8,18,141,939


In [133]:
pd.crosstab(
    (trn_df.start_offset != 0).rename('start_offset is not 0'),
    [trn_df['start_with_.'], trn_df['start_with_,'], trn_df['start_with_!'], trn_df['start_with_?'], trn_df['start_with_alphabet'], trn_df['start_with_number']],
    margins=True
)

start_with_.,False,False,False,False,False,False,True,All
"start_with_,",False,False,False,False,False,True,False,Unnamed: 8_level_1
start_with_!,False,False,False,False,True,False,False,Unnamed: 8_level_2
start_with_?,False,False,False,True,False,False,False,Unnamed: 8_level_3
start_with_alphabet,False,False,True,False,False,False,False,Unnamed: 8_level_4
start_with_number,False,True,False,False,False,False,False,Unnamed: 8_level_5
start_offset is not 0,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6
False,923,99,25108,51,54,71,178,26484
True,8,2,762,8,18,0,141,939
All,931,101,25870,59,72,71,319,27423


In [134]:
pd.crosstab(
    (trn_df.start_offset != 0).rename('fixed_start_offset is not 0'),
    [trn_df['fixed_start_with_.'], trn_df['fixed_start_with_,'], trn_df['fixed_start_with_!'], trn_df['fixed_start_with_?'], trn_df['fixed_start_with_alphabet'], trn_df['fixed_start_with_number']],
    margins=True
)

fixed_start_with_.,False,False,False,False,False,False,True,All
"fixed_start_with_,",False,False,False,False,False,True,False,Unnamed: 8_level_1
fixed_start_with_!,False,False,False,False,True,False,False,Unnamed: 8_level_2
fixed_start_with_?,False,False,False,True,False,False,False,Unnamed: 8_level_3
fixed_start_with_alphabet,False,False,True,False,False,False,False,Unnamed: 8_level_4
fixed_start_with_number,False,True,False,False,False,False,False,Unnamed: 8_level_5
fixed_start_offset is not 0,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6
False,923,99,25108,51,54,71,178,26484
True,8,2,762,8,18,0,141,939
All,931,101,25870,59,72,71,319,27423


In [135]:
pd.crosstab([trn_df['fixed_manual_equal_select'], trn_df['manual_equal_select']], [trn_df['fixed_start_with_.'], trn_df['fixed_start_with_!'], trn_df['fixed_end_by_.'], trn_df['fixed_end_by_!']])

Unnamed: 0_level_0,fixed_start_with_.,False,False,False,False,False,False,True,True,True
Unnamed: 0_level_1,fixed_start_with_!,False,False,False,True,True,True,False,False,False
Unnamed: 0_level_2,fixed_end_by_.,False,False,True,False,False,True,False,False,True
Unnamed: 0_level_3,fixed_end_by_!,False,True,False,False,True,False,False,True,False
fixed_manual_equal_select,manual_equal_select,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
False,False,1243,189,312,23,4,1,108,16,30
False,True,3,4,48,0,0,0,0,0,0
True,False,0,12,83,0,0,0,7,0,2
True,True,18057,2999,4082,26,10,8,110,13,33


In [137]:
trn_df['split_len'] = trn_df.manual_selected_text.apply(lambda x: len(x.split()))

In [138]:
single_trn_df = trn_df.query('split_len == 1')
pd.crosstab([single_trn_df['fixed_manual_equal_select'], single_trn_df['manual_equal_select']], [single_trn_df['fixed_start_with_.'], single_trn_df['fixed_start_with_!'], single_trn_df['fixed_end_by_.'], single_trn_df['fixed_end_by_!']])

Unnamed: 0_level_0,fixed_start_with_.,False,False,False,False,True,True,True
Unnamed: 0_level_1,fixed_start_with_!,False,False,False,True,False,False,False
Unnamed: 0_level_2,fixed_end_by_.,False,False,True,False,False,False,True
Unnamed: 0_level_3,fixed_end_by_!,False,True,False,False,False,True,False
fixed_manual_equal_select,manual_equal_select,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4
False,False,49,62,42,1,16,4,0
False,True,3,4,48,0,0,0,0
True,False,0,12,83,0,7,0,2
True,True,5534,479,505,0,4,0,0


## いったん . をターゲットにやってみる

In [139]:
trn_df['manual_start_period_num'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.extract('^(\.+)', expand=False).str.len()
trn_df['manual_end_period_num'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.extract('(\.+)$', expand=False).str.len()
trn_df['manual_start_exclamation_num'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.extract('^(!+)', expand=False).str.len()
trn_df['manual_end_exclamation_num'] = trn_df.manual_selected_text.apply(lambda x: ' '.join(x.split())).str.extract('(!+)$', expand=False).str.len()

trn_df['selected_start_period_num'] = trn_df.selected_text_lower.apply(lambda x: ' '.join(x.split())).str.extract('^(\.+)', expand=False).str.len()
trn_df['selected_end_period_num'] = trn_df.selected_text_lower.apply(lambda x: ' '.join(x.split())).str.extract('(\.+)$', expand=False).str.len()
trn_df['selected_start_exclamation_num'] = trn_df.selected_text_lower.apply(lambda x: ' '.join(x.split())).str.extract('^(!+)', expand=False).str.len()
trn_df['selected_end_exclamation_num'] = trn_df.selected_text_lower.apply(lambda x: ' '.join(x.split())).str.extract('(!+)$', expand=False).str.len()

In [141]:
pd.crosstab(trn_df['manual_start_period_num'].fillna(0.).astype(int), trn_df['selected_start_period_num'].fillna(0.).astype(int))

selected_start_period_num,0,1,2,3,4,6
manual_start_period_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27102,2,0,0,0,0
1,2,128,0,0,0,0
2,0,23,12,0,0,0
3,0,74,19,30,0,0
4,0,18,1,0,7,0
5,0,1,0,0,0,0
6,0,0,0,0,0,1
7,0,1,1,0,0,0
8,0,0,0,1,0,0


In [142]:
pd.crosstab(trn_df['manual_end_period_num'].fillna(0.).astype(int), trn_df['selected_end_period_num'].fillna(0.).astype(int))

selected_end_period_num,0,1,2,3,4,5,6,7,8,15,16,29
manual_end_period_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,22822,2,0,0,0,0,0,0,0,0,0,0
1,1,3504,0,0,0,0,0,0,0,0,0,0
2,0,42,223,0,0,0,0,0,0,0,0,0
3,0,95,133,421,0,0,0,0,0,0,0,0
4,0,17,26,7,61,0,0,0,0,0,0,0
5,0,10,4,0,2,25,0,0,0,0,0,0
6,0,1,4,1,0,0,6,0,0,0,0,0
7,0,0,3,0,1,0,0,5,0,0,0,0
8,0,0,0,1,0,0,0,0,1,0,0,0
15,0,0,0,0,0,0,0,0,0,1,0,0


In [144]:
single_trn_df = trn_df.query('split_len == 1')
display(pd.crosstab(single_trn_df['manual_start_period_num'].fillna(0.).astype(int), single_trn_df['selected_start_period_num'].fillna(0.).astype(int)))
display(pd.crosstab(single_trn_df['manual_end_period_num'].fillna(0.).astype(int), single_trn_df['selected_end_period_num'].fillna(0.).astype(int)))

selected_start_period_num,0,1,2,3
manual_start_period_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6822,0,0,0
1,0,5,0,0
2,0,3,0,0
3,0,14,4,3
4,0,3,0,0
5,0,1,0,0


selected_end_period_num,0,1,2,3,4,16
manual_end_period_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6174,1,0,0,0,0
1,0,506,0,0,0,0
2,0,20,34,0,0,0
3,0,30,47,10,0,0
4,0,7,10,1,2,0
5,0,5,3,0,0,0
6,0,1,1,0,0,0
7,0,0,2,0,0,0
16,0,0,0,0,0,1


In [146]:
single_trn_df = trn_df.query('split_len == 1')
display(pd.crosstab(single_trn_df['manual_start_exclamation_num'].fillna(0.).astype(int), single_trn_df['selected_start_exclamation_num'].fillna(0.).astype(int)))
display(pd.crosstab(single_trn_df['manual_end_exclamation_num'].fillna(0.).astype(int), single_trn_df['selected_end_exclamation_num'].fillna(0.).astype(int)))

selected_start_exclamation_num,0,1
manual_start_exclamation_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6854,0
2,0,1


selected_end_exclamation_num,0,1,2,3,5,9
manual_end_exclamation_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6294,0,0,0,0,0
1,2,443,0,0,0,0
2,0,21,29,0,0,0
3,0,13,20,11,0,0
4,0,10,6,0,0,0
5,0,1,1,0,1,0
8,0,1,1,0,0,0
9,0,0,0,0,0,1


#### とりあえずの regex 作成
 - 長い方から

In [None]:
re.sub('^(\.+)', '.', '...apple')
re.sub('\.\.\.\.\.\.\.$', '..', '...apple')
re.sub('\.\.\.\.\.\.$', '..', '...apple')
re.sub('\.\.\.\.\.$', '.', '...apple')
re.sub('\.\.\.\.$', '..', '...apple')
re.sub('\.\.\.$', '..', '...apple')
re.sub('\.\.\.$', '..', '...apple')
re.sub('!!!!!!!!$', '!', '...apple!')
re.sub('!!!!!$', '!', '...apple!')
re.sub('!!!!$', '!', '...apple!')
re.sub('!!!$', '!!', '...apple!')

In [168]:
single_trn_df = trn_df.query('split_len == 12')
display(pd.crosstab(single_trn_df['manual_start_period_num'].fillna(0.).astype(int), single_trn_df['selected_start_period_num'].fillna(0.).astype(int)))
display(pd.crosstab(single_trn_df['manual_end_period_num'].fillna(0.).astype(int), single_trn_df['selected_end_period_num'].fillna(0.).astype(int)))

selected_start_period_num,0,1,3
manual_start_period_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,643,0,0
1,0,1,0
3,0,1,1


selected_end_period_num,0,1,2,3,4
manual_end_period_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,504,0,0,0,0
1,0,108,0,0,0
2,0,0,5,0,0
3,0,0,2,24,0
4,0,0,0,0,3


In [193]:
single_trn_df = trn_df.query('split_len == 10')
display(pd.crosstab(single_trn_df['manual_start_exclamation_num'].fillna(0.).astype(int), single_trn_df['selected_start_exclamation_num'].fillna(0.).astype(int)))
display(pd.crosstab(single_trn_df['manual_end_exclamation_num'].fillna(0.).astype(int), single_trn_df['selected_end_exclamation_num'].fillna(0.).astype(int)))

selected_start_exclamation_num,0,1,3
manual_start_exclamation_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,777,0,0
1,0,1,0
3,0,1,0
4,0,0,1


selected_end_exclamation_num,0,1,2,3
manual_end_exclamation_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,685,0,0,0
1,0,75,0,0
2,0,0,11,0
3,0,1,1,7


#### alphabet 見てみる

In [194]:
single_trn_df = trn_df.query('split_len == 1')

In [195]:
single_trn_df.query('fixed_start_with_alphabet and start_offset != 0')

Unnamed: 0,textID,text,selected_text,sentiment,my_text,my_selected_text,my_text_eq_my_selected_text,predicted_texts,manual_selected_text,selected_text_lower,...,end_offset,split_len,manual_start_period_num,manual_end_period_num,manual_start_exclamation_num,manual_end_exclamation_num,selected_start_period_num,selected_end_period_num,selected_start_exclamation_num,selected_end_exclamation_num
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,is back home now gonna miss every one,onna,False,miss,gonna,onna,...,0,1,,,,,,,,
1695,6dbdb64223,_face and nana.wish i was there last night,a.wish,positive,_face and nana.wish i was there last night,a.wish,False,wish,ana.wish,a.wish,...,0,1,,,,,,,,
5188,9df7f02404,http://twitpic.com/4wc1y - Cracking myself mor...,sFunF,positive,http://twitpic.com/4wc1y - Cracking myself mor...,sFunF,False,cracking,isfunfor,sfunf,...,2,1,,,,,,,,
5435,780c673bca,going out for the good ol` `soak` tonight for ...,ing,positive,going out for the good ol` `soak` tonight for ...,ing,False,in love,going,ing,...,0,1,,,,,,,,
8728,12f21c8f19,star wars ............ is **** BOO??? i wanna...,l,positive,star wars ............ is **** BOO??? i wanna ...,l,False,lol,school,l,...,0,1,,,,,,,,
11807,96ff964db0,"4 hours of sleep, a migraine, again? What is w...",hat,negative,"4 hours of sleep, a migraine, again? What is w...",hat,False,hate my life,what,hat,...,0,1,,,,,,,,
12404,df398a774e,"alright...I`m going to get off comp now, go ...",ave,positive,"alright...I`m going to get off comp now, go ba...",ave,False,be safe & have fun!,have,ave,...,0,1,,,,,,,,
14171,c155efab1b,Hey Honey Bunny here big bunny hugs,unny,positive,Hey Honey Bunny here big bunny hugs,unny,False,hugs,bunny,unny,...,0,1,,,,,,,,
15206,4b3fd4ec22,Last day at DMA over! a million sad faces.,ion,negative,Last day at DMA over! a million sad faces.,ion,False,sad,million,ion,...,0,1,,,,,,,,
17753,7375cfdd5b,read your status update & just wanted 2 b sur...,l~blessings,positive,read your status update & just wanted 2 b sure...,l~blessings,False,hope u r having a wondefl,ichael~blessings,l~blessings,...,0,1,,,,,,,,


In [197]:
single_trn_df.query('fixed_end_by_alphabet and end_offset != 0')[['manual_selected_text', 'selected_text_lower']]

Unnamed: 0,manual_selected_text,selected_text_lower
491,sorry,sorr
1318,shoesshoesshoes.yayyayyay.lol.iw,shoesshoesshoes.yayyayyay.lol.i
1727,lovely,lovel
1797,happy,happ
4317,"disapointed,good","disapointed,g"
4343,good,goo
5188,isfunfor,sfunf
5509,awesome!was,awesome!wa
6803,love,lov
6832,bored,bore


In [199]:
single_trn_df[single_trn_df.manual_selected_text.str.contains('sorry')]

Unnamed: 0,textID,text,selected_text,sentiment,my_text,my_selected_text,my_text_eq_my_selected_text,predicted_texts,manual_selected_text,selected_text_lower,...,end_offset,split_len,manual_start_period_num,manual_end_period_num,manual_start_exclamation_num,manual_end_exclamation_num,selected_start_period_num,selected_end_period_num,selected_start_exclamation_num,selected_end_exclamation_num
284,05a0e60f99,I`m sorry to hear that.,sorry,negative,I`m sorry to hear that.,sorry,False,i`m sorry,sorry,sorry,...,0,1,,,,,,,,
319,b5dd587dd2,Sorry to hear ur flight got cancelled that b...,Sorry,negative,Sorry to hear ur flight got cancelled that blo...,Sorry,False,sorry,sorry,sorry,...,0,1,,,,,,,,
350,1f581d48bc,Sorry RB is on PS3 for me,Sorry,negative,Sorry RB is on PS3 for me,Sorry,False,sorry,sorry,sorry,...,0,1,,,,,,,,
397,ebc8565a98,"nothing aimed at you, just joining in...sorry",.sorry,negative,"nothing aimed at you, just joining in...sorry",.sorry,False,sorry,...sorry,.sorry,...,0,1,3.0,,,,1.0,,,
491,ee9df322d1,"Sorry, we`ll try to keep it down.",Sorr,negative,"Sorry, we`ll try to keep it down.",Sorr,False,"sorry,",sorry,sorr,...,1,1,,,,,,,,
573,c3bdaf65c9,sorry guys i didn`t sign in for a while sorry...,sorry,negative,sorry guys i didn`t sign in for a while sorry ...,sorry,False,sorry,sorry,sorry,...,0,1,,,,,,,,
639,bfa9cdee68,sorry to tweet about BGT but poor wonderful cr...,sorry,negative,sorry to tweet about BGT but poor wonderful cr...,sorry,False,sorry to tweet about bgt but poor wonderful c...,sorry,sorry,...,0,1,,,,,,,,
881,a2e0a7c278,sorry we didn`t get a chance to chat at #scre...,sorry,negative,sorry we didn`t get a chance to chat at #scree...,sorry,False,sorry,sorry,sorry,...,0,1,,,,,,,,
1226,211077a9e9,OK! Oops sorry,sorry,negative,OK! Oops sorry,sorry,False,sorry,sorry,sorry,...,0,1,,,,,,,,
1643,04068059fa,: aww im sorry honey. that stinks,sorry,negative,: aww im sorry honey. that stinks,sorry,False,aww im sorry honey. that stinks,sorry,sorry,...,0,1,,,,,,,,


In [79]:
a = trn_df.query('manual_start_period_num != 1 and selected_start_period_num == 1')
a.shape

(119, 46)

In [80]:
a.manual_selected_text.apply(lambda x: len(x.split())).value_counts()

5     25
2     21
1     21
4     12
6     10
3     10
7      9
9      3
8      3
23     1
15     1
12     1
11     1
10     1
Name: manual_selected_text, dtype: int64

In [81]:
a

Unnamed: 0,textID,text,selected_text,sentiment,my_text,my_selected_text,my_text_eq_my_selected_text,predicted_texts,manual_selected_text,selected_text_lower,...,sel_start_exclamation_num,sel_end_exclamation_num,selected_start_period_num,selected_end_period_num,selected_start_exclamation_num,selected_end_exclamation_num,manual_start_period_num,manual_end_period_num,manual_start_exclamation_num,manual_end_exclamation_num
27,bdc32ea43c,On the way to Malaysia...no internet access to...,.no internet,negative,On the way to Malaysia...no internet access to...,.no internet,False,no internet access,...no internet,.no internet,...,,,1.0,,,,3.0,,,
160,e4e9b8713a,My back hurts...really bad,.really bad,negative,My back hurts...really bad,.really bad,False,my back hurts...really bad,...really bad,.really bad,...,,,1.0,,,,3.0,,,
295,59e77db781,Watching Ellen Love her!! then doing the dishe...,.it`s gorgeous out!,positive,Watching Ellen Love her!! then doing the dishe...,.it`s gorgeous out!,False,gorgeous,...it`s gorgeous out!,.it`s gorgeous out!,...,,1.0,1.0,,,1.0,3.0,,,1.0
397,ebc8565a98,"nothing aimed at you, just joining in...sorry",.sorry,negative,"nothing aimed at you, just joining in...sorry",.sorry,False,sorry,...sorry,.sorry,...,,,1.0,,,,3.0,,,
532,781d900e89,Beto`s Pizzeria is on Banksville Rd in I beli...,. Sorry,negative,Beto`s Pizzeria is on Banksville Rd in I belie...,. Sorry,False,sorry,... sorry,. sorry,...,,,1.0,,,,3.0,,,
664,3f54e1d2cb,...i`m sorry about you are still sick u know...,.i`m sorry,negative,...i`m sorry about you are still sick u know m...,.i`m sorry,False,...i`m sorry about you are still sick,...i`m sorry,.i`m sorry,...,,,1.0,,,,3.0,,,
677,b36197ce92,Crisis: forgot my fringe comb I`m with men......,.no one will help me,negative,Crisis: forgot my fringe comb I`m with men.......,.no one will help me,False,crisis: forgot my fringe comb i`m with men......,....no one will help me,.no one will help me,...,,,1.0,,,,4.0,,,
736,0bc2e46e4e,Thanks! My mom`s seed is larger and already ...,. I hope Avalina isn`t a dud!,negative,Thanks! My mom`s seed is larger and already cr...,. I hope Avalina isn`t a dud!,False,dud!,). i hope avalina isn`t a dud!,. i hope avalina isn`t a dud!,...,,1.0,1.0,,,1.0,,,,1.0
998,6d2170e60f,I`ve been unlocked for decades now...just not...,".just not lucky, never have been.",negative,I`ve been unlocked for decades now...just not ...,".just not lucky, never have been.",False,"not lucky,","...just not lucky, never have been.",".just not lucky, never have been.",...,,,1.0,1.0,,,3.0,1.0,,
1095,14c67c535b,Depends on what they want for it....I`ve beco...,.I`ve become poor again,negative,Depends on what they want for it....I`ve becom...,.I`ve become poor again,False,poor again,....i`ve become poor again,.i`ve become poor again,...,,,1.0,,,,4.0,,,


In [118]:
import re
# re.sub('.apple', '^\.', '.')
re.sub('^(\.+)', '.', '...apple')

'.apple'

In [202]:
trn_df.text.str.contains('%%').sum()

0

In [204]:
trn_df.text.str.contains('\^').sum()

22

## check tokenized length

In [205]:
import sys
sys.path.append('../')
from tools.tokenizers import myRobertaByteLevelBPETokenizer

tokenizer = myRobertaByteLevelBPETokenizer(   
    vocab_file='../inputs/datasets/roberta/tokenizer/vocab.json',
    merges_file='../inputs/datasets/roberta/tokenizer/merges.txt',
    lowercase=True,
    add_prefix_space=True)

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [207]:
def add_token(text):
    text = re.sub('\.', ' %%', text)
    text = re.sub('!', ' ##', text)
    return text

trn_df['token_added_text'] = trn_df.text.apply(add_token)

In [217]:
trn_df['token_len'] = trn_df['token_added_text'].apply(lambda x: len(tokenizer.encode(' '.join(x.split())).ids))
trn_df['token_len'].max()

92

In [218]:
tokenizer.encode(trn_df.query('token_len == 92').text.iloc[0]).ids

[1423,
 3361,
 29,
 6,
 7462,
 16,
 145,
 31095,
 3422,
 328,
 1437,
 24,
 45912,
 90,
 5556,
 42514,
 45912,
 958,
 42514,
 734,
 560,
 45912,
 12592,
 42514,
 734,
 102,
 42514,
 7586,
 10534,
 8596,
 43796]

In [219]:
trn_df.query('token_len == 92').text.iloc[0]

'YESSS, FLASH IS BEING **** TONIGHT!  It.........takes.................time...........to.........open...........a..........webpage......'

In [221]:
trn_df.query('token_len == 92').selected_text.iloc[0]

'FLASH IS BEING **** TONIGHT!'

In [220]:
trn_df.query('token_len == 92').token_added_text.iloc[0]

'YESSS, FLASH IS BEING **** TONIGHT ##  It %% %% %% %% %% %% %% %% %%takes %% %% %% %% %% %% %% %% %% %% %% %% %% %% %% %% %%time %% %% %% %% %% %% %% %% %% %% %%to %% %% %% %% %% %% %% %% %%open %% %% %% %% %% %% %% %% %% %% %%a %% %% %% %% %% %% %% %% %% %%webpage %% %% %% %% %% %%'