In [1]:
import json
import random

from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split

from utils.data2seq import Dial2seq, SequencePreprocessor

## Merging Topical Chat (TC) and Daily Dialogue (DD) and splitting them into train, val and test

In [2]:
# One sequencer per annotated corpus (topical-chat is built first, then daily-dialogue).
# NOTE(review): the second argument (3) is presumably the sequence length — confirm against Dial2seq.
topical_sequencer, daily_sequencer = (
    Dial2seq(corpus_path, 3)
    for corpus_path in (
        'data/topical_chat_annotated.json',
        'data/daily_dialogue_annotated.json',
    )
)

In [3]:
# Run both sequencers; the daily-dialogue transform is evaluated first, then topical-chat.
daily, topical = daily_sequencer.transform(), topical_sequencer.transform()

In [4]:
# Preprocessor instance used for the daily-dialogue sequences.
daily_preproc = SequencePreprocessor()

In [5]:
# Run the daily-dialogue sequences through the preprocessor and show how many
# samples come out (the transform itself is defined in utils.data2seq).
daily_dataset = daily_preproc.transform(daily)
len(daily_dataset)

29656

In [6]:
# Same steps for topical-chat, with its own SequencePreprocessor instance:
# preprocess the sequences and show the resulting sample count.
topical_preproc = SequencePreprocessor()
topical_dataset = topical_preproc.transform(topical)
len(topical_dataset)

84140

In [7]:
midas_dataset = []
midas_entity_dataset = []

# Route every sample (daily-dialogue first, then topical-chat) by whether the
# utterance to predict carries an annotated entity label:
#   no label -> midas_dataset, label present -> midas_entity_dataset.
for dataset in (daily_dataset, topical_dataset):
    for sample in dataset:
        has_entity = bool(sample['predict']['entity']['label'])
        bucket = midas_entity_dataset if has_entity else midas_dataset
        bucket.append(sample)

In [8]:
# (samples without an entity label, samples with an entity label)
len(midas_dataset), len(midas_entity_dataset)

(101327, 12469)

In [9]:
# Seeded, in-place shuffle of the entity-annotated samples.
# NOTE(review): the list is mutated in place, so re-running this cell without
# rebuilding the list first reshuffles an already-shuffled list and changes
# the train/val/test split produced downstream.
random.Random(42).shuffle(midas_entity_dataset)

In [10]:
# 80% of the entity-annotated samples go to train; the held-out 20% is then
# halved into val and test. Both splits use the same fixed seed.
train, val_test = train_test_split(
    midas_entity_dataset,
    random_state=42,
    test_size=0.2,
)
val, test = train_test_split(
    val_test,
    random_state=42,
    test_size=0.5,
)

In [62]:
%%bash
# -p: succeed when data/annotated already exists (and create missing parents),
# so this cell does not fail when the notebook is re-run.
mkdir -p data/annotated

In [63]:
# Persist each split as indented UTF-8 JSON (non-ASCII characters are written as-is).
for path, split in (
    ('data/annotated/train.json', train),
    ('data/annotated/val.json', val),
    ('data/annotated/test.json', test),
):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(split, f, ensure_ascii=False, indent=2)

## Stats for train/val/test

In [11]:
# Keep only the 'predict' part of every sample, split by split.
target_train, target_val, target_test = (
    [sample['predict'] for sample in split]
    for split in (train, val, test)
)

In [12]:
# Flatten the nested target dicts into DataFrames (nested keys become dotted
# column names such as 'entity.label').
target_train, target_val, target_test = map(
    pd.json_normalize,
    (target_train, target_val, target_test),
)

### Midas stats

In [13]:
# Distribution of midas labels in the train targets.
target_train['midas'].value_counts()

opinion                  4048
statement                4013
yes_no_question           875
pos_answer                254
comment                   176
command                   168
open_question_factual     162
open_question_opinion     152
neg_answer                 82
complaint                  24
dev_command                10
appreciation                8
other_answers               3
Name: midas, dtype: int64

In [14]:
# Distribution of midas labels in the val targets.
target_val['midas'].value_counts()

statement                503
opinion                  502
yes_no_question          106
open_question_opinion     27
pos_answer                26
command                   26
comment                   23
open_question_factual     18
neg_answer                 8
complaint                  6
dev_command                2
Name: midas, dtype: int64

In [15]:
# Distribution of midas labels in the test targets.
target_test['midas'].value_counts()

statement                519
opinion                  501
yes_no_question           88
pos_answer                36
open_question_factual     30
comment                   23
open_question_opinion     21
command                   18
neg_answer                 5
complaint                  5
dev_command                1
Name: midas, dtype: int64

**Validate that every midas label occurring in val and test is also present in train (both differences below should be empty sets).**

In [16]:
# midas labels seen in val but never in train; an empty set means train covers val.
set(target_val['midas'].value_counts().index).difference(
    target_train['midas'].value_counts().index
)

set()

In [17]:
# midas labels seen in test but never in train; an empty set means train covers test.
set(target_test['midas'].value_counts().index).difference(
    target_train['midas'].value_counts().index
)

set()

### Entity stats

In [18]:
# Distribution of entity labels in the train targets.
target_train['entity.label'].value_counts()

person                 2254
videoname              1517
location               1137
organization            825
device                  645
duration                634
genre                   541
sport                   508
number                  464
sportteam               370
softwareapplication     315
vehicle                 165
event                   143
position                130
date                    130
year                     88
gamename                 55
party                    40
bookname                 10
songname                  4
Name: entity.label, dtype: int64

In [19]:
# Distribution of entity labels in the val targets.
target_val['entity.label'].value_counts()

person                 281
videoname              206
location               140
organization            98
device                  78
duration                77
genre                   73
sport                   62
number                  56
sportteam               54
softwareapplication     34
event                   24
vehicle                 17
position                15
date                    15
year                    10
gamename                 5
party                    2
Name: entity.label, dtype: int64

In [20]:
# Distribution of entity labels in the test targets.
target_test['entity.label'].value_counts()

person                 295
videoname              176
location               145
organization            86
device                  84
genre                   77
duration                77
number                  61
sport                   61
sportteam               43
softwareapplication     42
vehicle                 25
event                   23
position                16
date                    15
year                     8
gamename                 7
party                    5
bookname                 1
Name: entity.label, dtype: int64

In [21]:
# Distribution of entity labels in the train targets.
# NOTE(review): this repeats the train entity-label query shown a few cells
# earlier and looks redundant.
target_train['entity.label'].value_counts()

person                 2254
videoname              1517
location               1137
organization            825
device                  645
duration                634
genre                   541
sport                   508
number                  464
sportteam               370
softwareapplication     315
vehicle                 165
event                   143
position                130
date                    130
year                     88
gamename                 55
party                    40
bookname                 10
songname                  4
Name: entity.label, dtype: int64

**Validate that every entity label occurring in val and test is also present in train (both differences below should be empty sets).**

In [22]:
# Entity labels seen in val but never in train; an empty set means train covers val.
set(target_val['entity.label'].value_counts().index).difference(
    target_train['entity.label'].value_counts().index
)

set()

In [23]:
# Entity labels seen in test but never in train; an empty set means train covers test.
set(target_test['entity.label'].value_counts().index).difference(
    target_train['entity.label'].value_counts().index
)

set()

## Response banks

In [73]:
import hashlib


def set_guid(row):
    """Return a SHA-1 hex digest identifying one row.

    The digest is computed over the row's ``id``, ``midas``, ``entity.label``
    and ``text`` fields joined by single spaces, so it changes whenever any of
    those fields (including ``id``) changes.

    Args:
        row: a row object (e.g. a pandas Series passed by
            ``DataFrame.apply(..., axis=1)``) exposing ``id``, ``midas`` and
            ``text`` as attributes and ``'entity.label'`` as a key.
            ``midas`` and ``text`` must be strings.

    Returns:
        str: the 40-character hexadecimal SHA-1 digest.
    """
    # id and entity.label are passed through str() because they may not be strings.
    fields = (str(row.id), row.midas, str(row['entity.label']), row.text)
    payload = " ".join(fields).encode()
    return hashlib.sha1(payload).hexdigest()

In [91]:
# Entity-annotated responses come from the training split only.
target_midas_entity = target_train[['midas', 'entity.label', 'text']]

# Entity-free responses come from every sample that had no entity label.
midas_predictions = [sample['predict'] for sample in midas_dataset]
# .copy() gives an independent frame, so the assignment below does not write
# into a column-subset of another frame (avoids SettingWithCopyWarning).
target_midas = pd.json_normalize(midas_predictions)[['midas', 'entity.label', 'text']].copy()
# Normalise the "no entity" marker to None for all entity-free rows.
target_midas.loc[:, 'entity.label'] = None

# NOTE(review): concat keeps each frame's own row labels, so after
# reset_index() the `id` column restarts at 0 for the entity-free rows and is
# NOT unique across `targets`. Left unchanged because `id` feeds the guid hash.
targets = (
    pd.concat([target_midas_entity, target_midas])
    .reset_index()
    .rename(columns={'index': 'id'})
)

In [92]:
# Keep one row per distinct (midas, entity.label, text) combination; with the
# pandas default keep='first', the first occurrence is the one retained.
targets = targets.drop_duplicates(subset=['midas', 'entity.label', 'text'])

In [93]:
# Entity-label distribution after de-duplication; dropna=False also counts the
# rows that have no entity label.
targets['entity.label'].value_counts(dropna=False)

NaN                    40561
person                  1892
videoname               1100
location                 996
organization             770
device                   602
duration                 523
sport                    446
genre                    443
number                   388
sportteam                310
softwareapplication      299
vehicle                  157
event                    135
position                 123
date                     116
year                      86
gamename                  52
party                     39
bookname                  10
songname                   4
Name: entity.label, dtype: int64

In [94]:
# midas-label distribution after de-duplication.
targets['midas'].value_counts()

opinion                  18229
statement                15099
comment                   4125
pos_answer                2449
yes_no_question           2168
open_question_factual     1737
neg_answer                1499
command                   1174
open_question_opinion      864
complaint                  664
appreciation               511
other_answers              436
dev_command                 97
Name: midas, dtype: int64

In [95]:
# Attach a SHA-1 based identifier to every row (computed row by row via set_guid).
targets['guid'] = targets.apply(set_guid, axis=1)

In [96]:
# Show the assembled frame, including the new guid column.
targets

Unnamed: 0,id,midas,entity.label,text,guid
0,0,opinion,person,PERSON may not have toured much.,ae5f412086836cf0d98a36e502f4cf35852c93d4
1,1,opinion,person,PERSON.,dc635a8e6668aed692bd01b8ac29d8b074da768f
2,2,opinion,person,I guess it was PERSON.,dcb2a58d3f169f7d61abcbc7446a4813e2b957e6
3,3,opinion,location,It would be cool to listen to LOCATION to.,7c3e3e56bad8312b6284f7163b3a874c6bf9ea49
4,4,yes_no_question,year,can you imagine you would think they were used...,5d8871319e8254c5018c59d4927774e4054ccb2e
...,...,...,...,...,...
111288,101313,complaint,,That many!,e26e190b734c9548c68be441a214b4c996e99a51
111290,101315,statement,,Banned for that?,e5366dbe82288772def43fb2d7d567cffe7afb60
111292,101317,opinion,,That would make it more fun.,39655f016def309205909ade319ceb0e600a02cb
111296,101321,statement,,It must be.,b5e0e773925f319b1102c8131a212021d8ee437a


In [97]:
# Export as a tab-separated file without the DataFrame index; only guid, midas,
# entity.label and text are written (the `id` column is not exported).
targets[['guid', 'midas', 'entity.label', 'text']].to_csv('data/targets.tsv', sep='\t', index=False)