In [66]:
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel, RobertaModel

pd.set_option("display.max_rows", 300)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

## Noise analysis

In [2]:
# Read the training CSV, then discard every row that contains a missing value.
trn_df = pd.read_csv('../inputs/nes_info/e080_dataset_trn_df.csv')
trn_df = trn_df.dropna()

# Show the frame's shape followed by its first rows.
display(trn_df.shape, trn_df.head())

(27423, 12)

Unnamed: 0,textID,text,selected_text,sentiment,my_text,my_selected_text,my_text_eq_my_selected_text,predicted_texts,manual_selected_text,selected_text_lower,manual_and_selected_intersection_len,manual_and_selected_tokenized_intersection_len
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going","I`d have responded, if I were going",True,"i`d have responded, if i were going","i`d have responded, if i were going","i`d have responded, if i were going",7,9
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,False,sooo sad,sooo sad,sooo sad,2,3
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...,bullying me,False,bullying,bullying me,bullying me,2,2
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone,leave me alone,False,leave me alone,leave me alone,leave me alone,3,3
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"Sons of ****, why couldn`t they put them on th...","Sons of ****,",False,"sons of ****,","sons of ****,","sons of ****,",3,4


In [3]:
import sys

# Put the parent directory on the import path so the `tools` package resolves.
sys.path.append('../')
from tools.tokenizers import myRobertaByteLevelBPETokenizer

# Project tokenizer built from the local RoBERTa vocab/merges files, with
# lowercasing and a prefix space both enabled.
tokenizer = myRobertaByteLevelBPETokenizer(
    add_prefix_space=True,
    lowercase=True,
    merges_file='../inputs/datasets/roberta/tokenizer/merges.txt',
    vocab_file='../inputs/datasets/roberta/tokenizer/vocab.json',
)

# Alternative kept for reference (disabled):
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Register the four placeholder tokens with the tokenizer; the call's return
# value is the cell output.
tokenizer.add_tokens(['[S]', '[PERIOD]', '[EXCL]', '[QUES]'])

4

In [39]:
# Pick one random non-neutral row whose lowercased selection contains the
# sequence " . ." (space, period, space, period).
# The pattern is a raw string so that `\.` reaches the regex engine as written
# instead of being an invalid Python string escape.
# No random_state is passed to `sample`, so the row shown can differ between runs.
(
    trn_df[trn_df.selected_text_lower.str.contains(r' \. \.')]
    .query('sentiment != "neutral"')
    .sample(1)
)

Unnamed: 0,textID,text,selected_text,sentiment,my_text,my_selected_text,my_text_eq_my_selected_text,predicted_texts,manual_selected_text,selected_text_lower,manual_and_selected_intersection_len,manual_and_selected_tokenized_intersection_len
11986,42df3695a8,"Ppl who smoke pot, are so f . . .n stupid. An ...","Ppl who smoke pot, are so f . . .n stupid. An ...",negative,"Ppl who smoke pot, are so f . . .n stupid. An ...","Ppl who smoke pot, are so f . . .n stupid. An ...",False,stupid.,"ppl who smoke pot, are so f . . .n stupid. an...","ppl who smoke pot, are so f . . .n stupid. an...",14,17


In [40]:
import re

# Ordered (pattern, replacement) pairs. The space-prefixed variant of each
# punctuation mark must run before the bare variant, otherwise the bare pattern
# would consume the mark and the preceding space would never be encoded as [S].
_PUNCT_SUBSTITUTIONS = (
    (r' \.', '[S][PERIOD]'),
    (r'\.', '[PERIOD]'),
    (' !', '[S][EXCL]'),
    ('!', '[EXCL]'),
)


def _mask_punctuation(text):
    """Replace '.' and '!' (and a single space directly before them) with placeholders.

    Args:
        text: The string to transform.

    Returns:
        `text` with every substitution in `_PUNCT_SUBSTITUTIONS` applied in order.
    """
    for pattern, replacement in _PUNCT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)
    return text


def _normalize_spaces(text):
    """Collapse whitespace runs to single spaces and prepend one leading space."""
    return " " + " ".join(text.split())


# Row under inspection, selected by its index label.
row = trn_df.loc[11986]

# NOTE(review): '?' is left untouched here even though a '[QUES]' token is
# registered with the tokenizer — confirm whether that is intended.
tweet_base = _mask_punctuation(row['text'])
tweet = _normalize_spaces(tweet_base)

selected_text_base = _mask_punctuation(row['selected_text'])
selected_text = _normalize_spaces(selected_text_base)

In [41]:
# Print each raw string followed by its masked counterpart, one per line.
print(row['text'], tweet, row['selected_text'], selected_text, sep='\n')

Ppl who smoke pot, are so f . . .n stupid. An instant turn off. drugs in general. why? seriously! ! ! why?
 Ppl who smoke pot, are so f[S][PERIOD][S][PERIOD][S][PERIOD]n stupid[PERIOD] An instant turn off[PERIOD] drugs in general[PERIOD] why? seriously[EXCL][S][EXCL][S][EXCL] why?
Ppl who smoke pot, are so f . . .n stupid. An instant turn off.
 Ppl who smoke pot, are so f[S][PERIOD][S][PERIOD][S][PERIOD]n stupid[PERIOD] An instant turn off[PERIOD]


In [43]:
# Tokenize the masked selection and show the resulting token strings.
selected_encoding = tokenizer.encode(selected_text)
selected_encoding.tokens

['Ġp',
 'pl',
 'Ġwho',
 'Ġsmoke',
 'Ġpot',
 ',',
 'Ġare',
 'Ġso',
 'Ġf',
 '[S]',
 '[PERIOD]',
 '[S]',
 '[PERIOD]',
 '[S]',
 '[PERIOD]',
 'Ġn',
 'Ġstupid',
 '[PERIOD]',
 'Ġan',
 'Ġinstant',
 'Ġturn',
 'Ġoff',
 '[PERIOD]']

In [62]:
# Check how a lone '-' is tokenized.
dash_encoding = tokenizer.encode('-')
dash_encoding.tokens

['Ġ-']

In [65]:
# Look up the token id(s) produced for a lone '^'.
caret_encoding = tokenizer.encode('^')
caret_encoding.ids

[37249]

In [50]:
# Decode token id 0 to see which string it maps to.
probe_ids = [0]
tokenizer.decode(probe_ids)

'<s>'

In [46]:
# Reverse the placeholder masking: '[S]' back to a space, '[PERIOD]' back to '.'.
# Raw strings keep `\[` / `\]` as regex escapes rather than invalid Python
# string escapes. Only these two placeholders are reversed here; any '[EXCL]'
# in the input would be left as-is.
a = re.sub(r'\[S\]', ' ', selected_text)
re.sub(r'\[PERIOD\]', '.', a)

' Ppl who smoke pot, are so f . . .n stupid. An instant turn off.'

In [54]:
# Load the pretrained weights identified by 'roberta-base'.
pretrained_name = 'roberta-base'
model = RobertaModel.from_pretrained(pretrained_name)

In [61]:
# Take a copy of the word-embedding weight tensor and display it.
word_embedding_layer = model.embeddings.word_embeddings
word_embedding_layer.weight.data.clone()

tensor([[ 0.1476, -0.0365,  0.0753,  ..., -0.0023,  0.0172, -0.0016],
        [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        [-0.0347, -0.0873, -0.0180,  ...,  0.1174, -0.0098, -0.0355],
        ...,
        [ 0.0304,  0.0504, -0.0307,  ...,  0.0377,  0.0096,  0.0084],
        [ 0.0623, -0.0596,  0.0307,  ..., -0.0920,  0.1080, -0.0183],
        [ 0.1259, -0.0145,  0.0332,  ...,  0.0121,  0.0342,  0.0168]])