In [21]:
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, BertModel

# Let pandas render up to 300 rows (the preview cells in this notebook sample 300 rows).
pd.set_option("display.max_rows", 300)
# Import from the public IPython.display module: IPython.core.display is an
# internal path and importing `display` from it is deprecated.
from IPython.display import display, HTML
# Inject CSS that widens elements with class "container" to 80% of the page.
display(HTML("<style>.container { width:80% !important; }</style>"))

## 各種 dataset を比較

In [11]:
ls ../inputs/nes_info

invalid_labels.csv     my_dataset_trn_df.csv  v3_dataset_trn_df.csv
invalid_labels_v2.csv  v2_dataset_trn_df.csv


In [12]:
# Load the three versions of the training dataset (base, v2, v3) in one pass.
dataset_paths = (
    '../inputs/nes_info/my_dataset_trn_df.csv',
    '../inputs/nes_info/v2_dataset_trn_df.csv',
    '../inputs/nes_info/v3_dataset_trn_df.csv',
)
dataset_df, datasetv2_df, datasetv3_df = map(pd.read_csv, dataset_paths)

In [13]:
# For every dataset version, flag the rows whose manual_selected_text is
# exactly equal to selected_text_lower.
for versioned_df in (dataset_df, datasetv2_df, datasetv3_df):
    versioned_df['manual_equal_selected'] = versioned_df['manual_selected_text'].eq(
        versioned_df['selected_text_lower']
    )

In [14]:
# Copy the v2 / v3 columns into the base frame so all versions can be compared
# side by side. Each entry is (new column in dataset_df, source frame, source column).
# Assignment aligns on the row index, exactly as a direct column assignment does.
carried_columns = (
    ('manual_equal_selected_v2', datasetv2_df, 'manual_equal_selected'),
    ('manual_equal_selected_v3', datasetv3_df, 'manual_equal_selected'),
    ('manual_selected_text_v2', datasetv2_df, 'manual_selected_text'),
    ('manual_selected_text_v3', datasetv3_df, 'manual_selected_text'),
    ('manual_and_selected_tokenized_intersection_len_v2', datasetv2_df, 'manual_and_selected_tokenized_intersection_len'),
    ('manual_and_selected_tokenized_intersection_len_v3', datasetv3_df, 'manual_and_selected_tokenized_intersection_len'),
)
for target_column, source_df, source_column in carried_columns:
    dataset_df[target_column] = source_df[source_column]

In [19]:
# Rows where the base dataset disagrees with v2 or v3 on whether
# manual_selected_text equals selected_text_lower, shown side by side.
disagreeing_rows = dataset_df.query(
    'manual_equal_selected != manual_equal_selected_v2 or manual_equal_selected != manual_equal_selected_v3'
)[[
    'manual_equal_selected',
    'manual_equal_selected_v2',
    'manual_equal_selected_v3',
    'selected_text',
    'manual_selected_text',
    'manual_selected_text_v2',
    'manual_selected_text_v3',
]]
# sample() raises ValueError when n exceeds the number of rows, so cap it;
# a fixed random_state keeps the preview identical across re-runs.
disagreeing_rows.sample(min(300, len(disagreeing_rows)), random_state=0)

Unnamed: 0,manual_equal_selected,manual_equal_selected_v2,manual_equal_selected_v3,selected_text,manual_selected_text,manual_selected_text_v2,manual_selected_text_v3
23098,False,True,True,Wow! I am so proud Its great.,. wow! i am so proud its great,wow! i am so proud its great.,wow! i am so proud its great.
1976,False,True,True,Enjoy my li`l night owls!,! enjoy my li`l night owls,enjoy my li`l night owls!,enjoy my li`l night owls!
7778,False,True,True,That sounds good.,. that sounds good,that sounds good.,that sounds good.
17828,True,False,True,excited,excited,so excited,excited
25316,False,True,True,never learned how to write in French- just bas...,- never learned how to write in french- just b...,never learned how to write in french- just ba...,never learned how to write in french- just ba...
5137,False,True,True,lower than zero..,.. lower than zero,lower than zero..,lower than zero..
337,False,True,True,That sucks!,! that sucks,that sucks!,that sucks!
3006,False,True,True,Apparently not.,. apparently not,apparently not.,apparently not.
8924,False,True,True,but can i find a sitter? wtf? why does everyon...,. but can i find a sitter? wtf? why does every...,but can i find a sitter? wtf? why does everyo...,but can i find a sitter? wtf? why does everyo...
15023,False,True,True,"smile, and count your blessings everyday. Your...",", smile, and count your blessings everyday. yo...","smile, and count your blessings everyday. you...","smile, and count your blessings everyday. you..."


In [27]:
# v3 looks the best...
# Rows where none of the three versions reproduces selected_text_lower exactly
# (manual_equal_selected is False for the base dataset, v2 and v3).
all_mismatched_rows = dataset_df.query(
    'not manual_equal_selected and manual_equal_selected == manual_equal_selected_v2 and manual_equal_selected == manual_equal_selected_v3'
)[[
    'manual_equal_selected',
    'manual_equal_selected_v2',
    'manual_equal_selected_v3',
    'selected_text_lower',
    'manual_selected_text',
    'manual_selected_text_v2',
    'manual_selected_text_v3',
]]
# sample() raises ValueError when n exceeds the number of rows, so cap it;
# a fixed random_state keeps the preview identical across re-runs.
all_mismatched_rows.sample(min(300, len(all_mismatched_rows)), random_state=0)

Unnamed: 0,manual_equal_selected,manual_equal_selected_v2,manual_equal_selected_v3,selected_text_lower,manual_selected_text,manual_selected_text_v2,manual_selected_text_v3
25379,False,False,False,eels slightly better but **** i sure do miss ...,feels slightly better but **** i sure do miss...,feels slightly better but **** i sure do miss...,feels slightly better but **** i sure do miss...
5989,False,False,False,r i`m excited anyway.,whatever i`m excited anyway.,ohwwww whatever i`m excited anyway.,whatever i`m excited anyway.
5774,False,False,False,a sad,kinda sad,kenny u alive!!!...i`m here getting da hair d...,kinda sad
18341,False,False,False,nice sw,everyone nice,nice swe,nice swe
10258,False,False,False,shame ab,. shame,shame about,shame about
8275,False,False,False,is looking goo,nip is looking,is looking good,is looking good
17347,False,False,False,bad..,really bad,bad...,bad...
8150,False,False,False,h headache,gh headache,ugh headache i just wanna go home,gh headache
23970,False,False,False,we are winners!!,we are winners!!,"heeey, we won ice hockey....we are winners!!",we are winners!!
10254,False,False,False,hired all the smart monkeys for e-mail market...,hired all the smart monkeys for e-mail market...,hired all the smart monkeys for e-mail market...,hired all the smart monkeys for e-mail market...


In [26]:
# v3 seems to be the best.
# Encode each row's three equality flags as a single "<base>_<v2>_<v3>" label,
# then count how often every combination occurs.
flag_columns = ['manual_equal_selected', 'manual_equal_selected_v2', 'manual_equal_selected_v3']
base_flag, *other_flags = (dataset_df[name].astype(str) for name in flag_columns)
dataset_df['equal_shape'] = base_flag.str.cat(other_flags, sep='_')
dataset_df['equal_shape'].value_counts()

True_True_True       24291
False_False_False     2594
False_True_True        505
True_False_True         62
True_True_False         23
False_False_True         3
True_False_False         2
Name: equal_shape, dtype: int64

In [32]:
import sys

# Make ../tools importable. Skip the append when it is already present so
# re-running the cell does not pile up duplicate sys.path entries.
if '../tools' not in sys.path:
    sys.path.append('../tools')
from metrics import jaccard

# Average of jaccard(selected_text_lower, manual_selected_text) over all rows
# of the base dataset. Iterating the two columns directly avoids building a
# Series per row with iterrows(); the accumulation order is unchanged.
score = 0
for selected_text, manual_text in zip(dataset_df['selected_text_lower'], dataset_df['manual_selected_text']):
    score += jaccard(selected_text, manual_text)

score /= len(dataset_df)
score

0.9465257944869784

In [33]:
import sys

# Make ../tools importable. Skip the append when it is already present so
# re-running the cell does not pile up duplicate sys.path entries.
if '../tools' not in sys.path:
    sys.path.append('../tools')
from metrics import jaccard

# Average of jaccard(selected_text_lower, manual_selected_text_v2) over all
# rows, i.e. the same measurement against the v2 column. Iterating the two
# columns directly avoids building a Series per row with iterrows(); the
# accumulation order is unchanged.
score = 0
for selected_text, manual_text in zip(dataset_df['selected_text_lower'], dataset_df['manual_selected_text_v2']):
    score += jaccard(selected_text, manual_text)

score /= len(dataset_df)
score

0.9334743137220625

In [34]:
import sys

# Make ../tools importable. Skip the append when it is already present so
# re-running the cell does not pile up duplicate sys.path entries.
if '../tools' not in sys.path:
    sys.path.append('../tools')
from metrics import jaccard

# Average of jaccard(selected_text_lower, manual_selected_text_v3) over all
# rows, i.e. the same measurement against the v3 column. Iterating the two
# columns directly avoids building a Series per row with iterrows(); the
# accumulation order is unchanged.
score = 0
for selected_text, manual_text in zip(dataset_df['selected_text_lower'], dataset_df['manual_selected_text_v3']):
    score += jaccard(selected_text, manual_text)

score /= len(dataset_df)
score

0.9601642706359507