In [1]:
import json
from collections import Counter

import pandas as pd

In [2]:
# Load the raw dataset. The encoding is pinned to UTF-8 because open() would
# otherwise decode with the platform's locale encoding, which can garble or
# reject non-ASCII text on some systems.
with open('data/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Pull the 'predict' entry out of every sample, keeping the dataset order.
target_sents = list(map(lambda record: record['predict'], data))

In [4]:
# Flatten the nested records into a table; nested keys become dotted column
# names such as 'entity.text' and 'entity.label'.
df = pd.json_normalize(target_sents, sep='.')

In [5]:
# Preview the first five flattened rows.
df.head(n=5)

Unnamed: 0,text,midas,entity.text,entity.label
0,"Yes like good riddance, in my heart of hearts ...",opinion,good riddance,videoname
1,"I used to in my childhood but not any more, I ...",opinion,inspector Gadget,videoname
2,"Yes, 20 quintillion, can you believe that?Wow",yes_no_question,20 quintillion,number
3,"Yes indeed, interesting facts that will give m...",opinion,weekend,date
4,hmm that seems dumb like its just playing with...,yes_no_question,Teddy Roosevelt,person


In [6]:
def normalize_text(row):
    """Replace the entity mention in a row's text with its upper-cased label.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'text', 'entity.text' and 'entity.label'.

    Returns:
        The text with every occurrence of the entity mention replaced by the
        upper-cased entity label. The text is returned unchanged when the
        mention or the label is not a string, or when the mention is empty.
    """
    text = row['text']
    mention = row['entity.text']
    label = row['entity.label']
    # An empty mention would make str.replace insert the label between every
    # character, and a missing value (e.g. NaN) cannot be used as a pattern or
    # upper-cased at all, so such rows are passed through untouched.
    if not isinstance(mention, str) or not mention or not isinstance(label, str):
        return text
    return text.replace(mention, label.upper())

In [7]:
# Build the entity-masked text row by row from the three columns that
# normalize_text reads.
entity_columns = ['text', 'entity.text', 'entity.label']
df['normalized_text'] = df[entity_columns].apply(normalize_text, axis=1)

In [48]:
# Labels listed here are treated as rare and kept apart from the common ones.
rare_midas = ['other_answers', 'appreciation', 'dev_command']
rare_entity = ['bookname', 'venue', 'ordinal']

# Boolean masks marking the rows that carry a rare label.
is_rare_midas = df['midas'].isin(rare_midas)
is_rare_entity = df['entity.label'].isin(rare_entity)

df_common_midas = df.loc[~is_rare_midas]
df_rare_midas = df.loc[is_rare_midas]
df_common_entity = df.loc[~is_rare_entity]

In [53]:
# Draw a fixed-size sample from every label group.
# - random_state is fixed so the exported samples are reproducible across
#   kernel restarts.
# - The requested size is capped at the group size because DataFrame.sample
#   raises ValueError when asked for more rows than the group contains.
common_sampled_by_midas = df_common_midas.groupby('midas', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 10), random_state=42))
common_sampled_by_entity = df_common_entity.groupby('entity.label', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 10), random_state=42))
rare_sampled_by_midas = df_rare_midas.groupby('midas', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 4), random_state=42))

In [63]:
# Stack the common and rare midas samples and export them as one TSV file.
midas_columns = ['midas', 'normalized_text']
midas_sample = pd.concat([
    common_sampled_by_midas[midas_columns],
    rare_sampled_by_midas[midas_columns],
])
midas_sample.to_csv('data/midas_sample.tsv', sep='\t', index=False)

In [64]:
# Export the per-entity sample (label plus masked text) as TSV.
entity_sample = common_sampled_by_entity[['entity.label', 'normalized_text']]
entity_sample.to_csv('data/entity_sample.tsv', sep='\t', index=False)

In [8]:
# Preview the first five rows, now including the normalized_text column.
df.head(n=5)

Unnamed: 0,text,midas,entity.text,entity.label,normalized_text
0,"Yes like good riddance, in my heart of hearts ...",opinion,good riddance,videoname,"Yes like VIDEONAME, in my heart of hearts and ..."
1,"I used to in my childhood but not any more, I ...",opinion,inspector Gadget,videoname,"I used to in my childhood but not any more, I ..."
2,"Yes, 20 quintillion, can you believe that?Wow",yes_no_question,20 quintillion,number,"Yes, NUMBER, can you believe that?Wow"
3,"Yes indeed, interesting facts that will give m...",opinion,weekend,date,"Yes indeed, interesting facts that will give m..."
4,hmm that seems dumb like its just playing with...,yes_no_question,Teddy Roosevelt,person,hmm that seems dumb like its just playing with...


In [9]:
# Combine the midas label and the entity label into one space-separated label.
label_separator = ' '
label_pairs = df[['midas', 'entity.label']]
df['midas_entity'] = label_pairs.agg(label_separator.join, axis=1)

In [32]:
# Frequency table of the combined labels with columns 'label' and 'counts'.
label_count = (
    df['midas_entity']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='counts')
)

In [37]:
# Combined labels that occur fewer than 15 times are treated as rare.
is_rare_label = label_count['counts'] < 15
rare_midas_entity = label_count.loc[is_rare_label, 'label'].tolist()

In [43]:
# Keep only rows whose combined label is not in the rare list, then draw ten
# rows per combined label. random_state is fixed so the exported sample is
# reproducible across kernel restarts.
df_common_midas_entity = df[~df.midas_entity.isin(rare_midas_entity)]
common_sampled_by_midas_and_entity = df_common_midas_entity.groupby(
    'midas_entity', group_keys=False).apply(lambda x: x.sample(10, random_state=42))

In [48]:
# Export the common combined-label sample (label plus masked text) as TSV.
midas_entity_sample = common_sampled_by_midas_and_entity[['midas_entity', 'normalized_text']]
midas_entity_sample.to_csv('data/midas_entity_sample.tsv', sep='\t', index=False)

In [60]:
# Keep only rows whose combined label is in the rare list and draw a single
# row per combined label. random_state is fixed so the exported sample is
# reproducible across kernel restarts.
df_rare_midas_entity = df[df.midas_entity.isin(rare_midas_entity)]
rare_sampled_by_midas_and_entity = df_rare_midas_entity.groupby(
    'midas_entity', group_keys=False).apply(lambda x: x.sample(1, random_state=42))

In [62]:
# Export the rare combined-label sample (label plus masked text) as TSV.
midas_entity_rare_sample = rare_sampled_by_midas_and_entity[['midas_entity', 'normalized_text']]
midas_entity_rare_sample.to_csv('data/midas_entity_rare_sampled.tsv', sep='\t', index=False)