In [1]:
import json
import re
from collections import Counter

import pandas as pd

In [2]:
# Parse the daily and the topical dataset files (UTF-8 encoded JSON) into
# the Python objects `daily` and `topical`.
with open('data/daily_dataset.json', mode='r', encoding="utf8") as daily_file:
    daily = json.load(daily_file)

with open('data/topical_dataset.json', mode='r', encoding="utf8") as topical_file:
    topical = json.load(topical_file)

## Daily

In [47]:
# Collect the 'predict' entry of every sample in the daily dataset.
target_sents = [
    record['predict']
    for record in daily
]

In [48]:
# Flatten the collected records into a table; nested keys become dotted
# column names (e.g. `entity.label`, `entity.offsets`, `entity.text`).
df = pd.json_normalize(data=target_sents)

In [49]:
# Preview the first five rows of the flattened daily table.
df.head(n=5)

Unnamed: 0,text,midas,entity.label,entity.offsets,entity.text
0,"Hmm, 250NT is... NUMBER?!!",statement,number,"[17, 26]",8 dollars
1,Are they from LOCATION?,yes_no_question,location,"[14, 20]",taiwan
2,They're imported from LOCATION.,statement,location,"[22, 33]",new zealand
3,Some woman in here when I tried to register DU...,statement,duration,"[44, 55]",three weeks
4,It's been DURATION now.,statement,duration,"[10, 17]",3 years


In [52]:
# Relative frequency of each `midas` label in the daily table.
# NOTE(review): the saved output under this cell lists integer counts
# (dtype int64), which does not match normalize=True; it presumably comes
# from an earlier version of the cell. Re-run to refresh it.
df.value_counts(subset=['midas'], normalize=True)

midas                
statement                1441
opinion                   415
yes_no_question           173
open_question_factual     104
open_question_opinion      82
command                    61
neg_answer                 25
pos_answer                 17
comment                    13
complaint                  11
dev_command                 9
other_answers               1
dtype: int64

In [53]:
# Number of rows per `entity.label` value in the daily table, most frequent first.
df.value_counts(subset=['entity.label'])

entity.label       
duration               545
location               348
number                 344
device                 231
person                 204
videoname              188
date                   129
vehicle                105
organization            74
sport                   61
genre                   60
event                   21
sportteam               16
year                    10
position                 9
gamename                 4
softwareapplication      2
songname                 1
dtype: int64

In [54]:
# Hand-picked rare labels: each one has fewer than 10 rows in the counts
# shown above, so it cannot supply a 10-row sample.
rare_midas = ['other_answers', 'dev_command']
rare_entity = ['songname', 'softwareapplication', 'gamename', 'position']

# Boolean masks marking the rows that carry a rare label.
midas_is_rare = df['midas'].isin(rare_midas)
entity_is_rare = df['entity.label'].isin(rare_entity)

# Partition by MIDAS label; for entities only the common part is kept.
df_rare_midas = df[midas_is_rare]
df_common_midas = df[~midas_is_rare]
df_common_entity = df[~entity_is_rare]

In [55]:
# Draw a reproducible random sample from each label group: 10 rows per
# common label and 1 row per rare MIDAS label.
#
# GroupBy.sample replaces groupby.apply(lambda x: x.sample(...)):
#   * the grouping column stays in the result, which the export cells rely
#     on when they select it by name (apply's handling of grouping columns
#     is deprecated in recent pandas);
#   * a fixed random_state makes a re-run pick the same rows.
# As before, n=10 raises ValueError for any group with fewer than 10 rows,
# which is presumably why the rare labels were split off beforehand.
common_sampled_by_midas = df_common_midas.groupby('midas').sample(n=10, random_state=42)
common_sampled_by_entity = df_common_entity.groupby('entity.label').sample(n=10, random_state=42)
rare_sampled_by_midas = df_rare_midas.groupby('midas').sample(n=1, random_state=42)

In [56]:
# Stack the common and rare MIDAS samples (label + text only) and write
# them out as a tab-separated file without the index column.
midas_columns = ['midas', 'text']
midas_sample = pd.concat([
    common_sampled_by_midas[midas_columns],
    rare_sampled_by_midas[midas_columns],
])
midas_sample.to_csv('data/daily_midas_sample.tsv', sep='\t', index=False)

In [57]:
# Export the per-entity sample (label + text) as TSV. Only common entity
# labels are included; no sample is drawn for the rare ones.
entity_sample = common_sampled_by_entity[['entity.label', 'text']]
entity_sample.to_csv('data/daily_entity_sample.tsv', sep='\t', index=False)

In [58]:
# Build a combined "<midas> <entity.label>" key for every row.
# Plain Series concatenation is vectorised, unlike the previous row-wise
# ' '.join over axis=1; for all-string columns the result is identical.
# A missing value in either column now yields NaN for that row instead of
# raising TypeError.
df['midas_entity'] = df['midas'] + ' ' + df['entity.label']

In [60]:
# Frequency table of the combined key: one row per distinct value, with
# columns `label` and `counts`, most frequent first.
midas_entity_counts = df['midas_entity'].value_counts()
label_count = midas_entity_counts.rename_axis('label').reset_index(name='counts')
label_count.head()

Unnamed: 0,label,counts
0,statement duration,447
1,statement number,289
2,statement location,205
3,statement person,109
4,statement device,102


In [61]:
# Combined labels that occur at most 11 times are treated as rare.
rare_midas_entity = label_count.loc[label_count['counts'] <= 11, 'label'].tolist()

In [62]:
# Keep only rows whose combined label is not rare, then draw 10 rows per
# combined label.
# GroupBy.sample keeps the `midas_entity` column in the result (the export
# cell selects it by name) and the fixed random_state makes the draw
# reproducible; the previous groupby.apply(lambda x: x.sample(10)) was
# unseeded and relied on apply passing the grouping column through.
df_common_midas_entity = df[~df['midas_entity'].isin(rare_midas_entity)]
common_sampled_by_midas_and_entity = (
    df_common_midas_entity
    .groupby('midas_entity')
    .sample(n=10, random_state=42)
)

In [63]:
# Write the sample of common combined labels (label + text) as TSV, index omitted.
midas_entity_sample = common_sampled_by_midas_and_entity[['midas_entity', 'text']]
midas_entity_sample.to_csv('data/daily_midas_entity_sample.tsv', sep='\t', index=False)

In [64]:
# Rows whose combined label is rare; take one example per such label.
# GroupBy.sample with a fixed random_state gives a reproducible pick and
# keeps the `midas_entity` column that the export cell selects by name
# (the previous unseeded groupby.apply variant relied on apply passing the
# grouping column through).
df_rare_midas_entity = df[df['midas_entity'].isin(rare_midas_entity)]
rare_sampled_by_midas_and_entity = (
    df_rare_midas_entity
    .groupby('midas_entity')
    .sample(n=1, random_state=42)
)

In [65]:
# Write the one-per-label examples of rare combined labels as TSV, index omitted.
rare_midas_entity_sample = rare_sampled_by_midas_and_entity[['midas_entity', 'text']]
rare_midas_entity_sample.to_csv('data/daily_midas_entity_rare_sampled.tsv', sep='\t', index=False)

In [66]:
# Export every target sentence with its combined label, grouped by label
# and shuffled within each label (frac=1.0 keeps all rows of a group).
# GroupBy.sample keeps the `midas_entity` column in the written file and
# the fixed random_state makes the row order reproducible; the previous
# unseeded groupby.apply variant relied on apply passing the grouping
# column through.
all_target_sentences = (
    df[['midas_entity', 'text']]
    .groupby('midas_entity')
    .sample(frac=1.0, random_state=42)
)
all_target_sentences.to_csv('data/daily_all_target_sentences.tsv', sep='\t', index=False)

## Topical

In [18]:
# Collect the 'predict' entry of every sample in the topical dataset.
# Note: this rebinds `target_sents`, replacing the daily list built earlier.
target_sents = [
    record['predict']
    for record in topical
]

In [19]:
# Flatten the topical records into a table; nested keys become dotted
# column names (e.g. `entity.label`). This rebinds `df`, so the daily
# table is no longer reachable under that name from here on.
df = pd.json_normalize(data=target_sents)

In [20]:
# Preview the first five rows of the flattened topical table.
df.head(n=5)

Unnamed: 0,text,midas,entity.label,entity.offsets,entity.text
0,Yes and he even was in the production of VIDEO...,statement,videoname,"[41, 55]",the nutcracker
1,Yes like VIDEONAME.,pos_answer,videoname,"[9, 22]",good riddance
2,Mine all time favorite was VIDEONAME.,opinion,videoname,"[27, 41]",bozo the clown
3,A lot of people think the OT was so much bette...,opinion,videoname,"[57, 65]",prequels
4,and PERSON did not have any of them,statement,person,"[4, 14]",roosevelet


In [43]:
# Share of rows per `midas` label in the topical table (fractions summing to 1).
df.value_counts(subset=['midas'], normalize=True)

midas                
opinion                  0.458239
statement                0.355244
yes_no_question          0.088564
pos_answer               0.029554
comment                  0.020658
command                  0.014925
open_question_opinion    0.011664
open_question_factual    0.010477
neg_answer               0.006919
complaint                0.002372
appreciation             0.000791
dev_command              0.000395
other_answers            0.000198
dtype: float64

In [44]:
# Share of rows per `entity.label` value in the topical table (fractions summing to 1).
df.value_counts(subset=['entity.label'], normalize=True)

entity.label       
person                 0.259563
videoname              0.169121
location               0.106158
organization           0.092419
genre                  0.062370
device                 0.056934
sport                  0.056341
sportteam              0.044578
softwareapplication    0.038450
duration               0.024019
number                 0.023426
event                  0.016705
position               0.015024
vehicle                0.010082
year                   0.009489
gamename               0.006227
party                  0.004646
date                   0.003064
bookname               0.001087
songname               0.000297
dtype: float64

In [45]:
# Preview the first five rows of the topical table again.
# NOTE(review): the saved output of this cell already contains a
# `midas_entity` column, but that column is only created in a later cell.
# The cell was evidently executed out of order; on a fresh top-to-bottom
# run the column will not be present at this point.
df.head(n=5)

Unnamed: 0,text,midas,entity.label,entity.offsets,entity.text,midas_entity
0,Yes and he even was in the production of VIDEO...,statement,videoname,"[41, 55]",the nutcracker,statement videoname
1,Yes like VIDEONAME.,pos_answer,videoname,"[9, 22]",good riddance,pos_answer videoname
2,Mine all time favorite was VIDEONAME.,opinion,videoname,"[27, 41]",bozo the clown,opinion videoname
3,A lot of people think the OT was so much bette...,opinion,videoname,"[57, 65]",prequels,opinion videoname
4,and PERSON did not have any of them,statement,person,"[4, 14]",roosevelet,statement person


In [29]:
# Hand-picked rare labels: the three least frequent `midas` values and the
# least frequent `entity.label` value in the distributions shown above.
rare_midas = ['other_answers', 'appreciation', 'dev_command']
rare_entity = ['songname']

# Boolean masks marking the rows that carry a rare label.
midas_is_rare = df['midas'].isin(rare_midas)
entity_is_rare = df['entity.label'].isin(rare_entity)

# Partition by MIDAS label; for entities only the common part is kept.
df_rare_midas = df[midas_is_rare]
df_common_midas = df[~midas_is_rare]
df_common_entity = df[~entity_is_rare]

In [30]:
# Draw a reproducible random sample from each label group: 10 rows per
# common label and 1 row per rare MIDAS label.
#
# GroupBy.sample replaces groupby.apply(lambda x: x.sample(...)):
#   * the grouping column stays in the result, which the export cells rely
#     on when they select it by name (apply's handling of grouping columns
#     is deprecated in recent pandas);
#   * a fixed random_state makes a re-run pick the same rows.
# As before, n=10 raises ValueError for any group with fewer than 10 rows.
common_sampled_by_midas = df_common_midas.groupby('midas').sample(n=10, random_state=42)
common_sampled_by_entity = df_common_entity.groupby('entity.label').sample(n=10, random_state=42)
rare_sampled_by_midas = df_rare_midas.groupby('midas').sample(n=1, random_state=42)

In [31]:
# Stack the common and rare MIDAS samples (label + text only) and write
# them out as a tab-separated file without the index column.
midas_columns = ['midas', 'text']
midas_sample = pd.concat([
    common_sampled_by_midas[midas_columns],
    rare_sampled_by_midas[midas_columns],
])
midas_sample.to_csv('data/topical_midas_sample.tsv', sep='\t', index=False)

In [33]:
# Export the per-entity sample (label + text) as TSV. Only common entity
# labels are included; no sample is drawn for the rare ones.
entity_sample = common_sampled_by_entity[['entity.label', 'text']]
entity_sample.to_csv('data/topical_entity_sample.tsv', sep='\t', index=False)

In [34]:
# Build a combined "<midas> <entity.label>" key for every row.
# Plain Series concatenation is vectorised, unlike the previous row-wise
# ' '.join over axis=1; for all-string columns the result is identical.
# A missing value in either column now yields NaN for that row instead of
# raising TypeError.
df['midas_entity'] = df['midas'] + ' ' + df['entity.label']

In [46]:
# Frequency table of the combined key: one row per distinct value, with
# columns `label` and `counts`, most frequent first.
midas_entity_counts = df['midas_entity'].value_counts()
label_count = midas_entity_counts.rename_axis('label').reset_index(name='counts')
label_count.head()

Unnamed: 0,label,counts
0,opinion person,1334
1,opinion videoname,815
2,statement person,809
3,statement videoname,568
4,statement location,497


In [36]:
# Combined labels that occur at most 11 times are treated as rare.
rare_midas_entity = label_count.loc[label_count['counts'] <= 11, 'label'].tolist()

In [37]:
# Keep only rows whose combined label is not rare, then draw 10 rows per
# combined label.
# GroupBy.sample keeps the `midas_entity` column in the result (the export
# cell selects it by name) and the fixed random_state makes the draw
# reproducible; the previous groupby.apply(lambda x: x.sample(10)) was
# unseeded and relied on apply passing the grouping column through.
df_common_midas_entity = df[~df['midas_entity'].isin(rare_midas_entity)]
common_sampled_by_midas_and_entity = (
    df_common_midas_entity
    .groupby('midas_entity')
    .sample(n=10, random_state=42)
)

In [38]:
# Write the sample of common combined labels (label + text) as TSV, index omitted.
midas_entity_sample = common_sampled_by_midas_and_entity[['midas_entity', 'text']]
midas_entity_sample.to_csv('data/topical_midas_entity_sample.tsv', sep='\t', index=False)

In [39]:
# Rows whose combined label is rare; take one example per such label.
# GroupBy.sample with a fixed random_state gives a reproducible pick and
# keeps the `midas_entity` column that the export cell selects by name
# (the previous unseeded groupby.apply variant relied on apply passing the
# grouping column through).
df_rare_midas_entity = df[df['midas_entity'].isin(rare_midas_entity)]
rare_sampled_by_midas_and_entity = (
    df_rare_midas_entity
    .groupby('midas_entity')
    .sample(n=1, random_state=42)
)

In [40]:
# Write the one-per-label examples of rare combined labels as TSV, index omitted.
rare_midas_entity_sample = rare_sampled_by_midas_and_entity[['midas_entity', 'text']]
rare_midas_entity_sample.to_csv('data/topical_midas_entity_rare_sampled.tsv', sep='\t', index=False)

In [42]:
# Export every target sentence with its combined label, grouped by label
# and shuffled within each label (frac=1.0 keeps all rows of a group).
# GroupBy.sample keeps the `midas_entity` column in the written file and
# the fixed random_state makes the row order reproducible; the previous
# unseeded groupby.apply variant relied on apply passing the grouping
# column through.
all_target_sentences = (
    df[['midas_entity', 'text']]
    .groupby('midas_entity')
    .sample(frac=1.0, random_state=42)
)
all_target_sentences.to_csv('data/topical_all_target_sentences.tsv', sep='\t', index=False)