In [1]:
import json
import re
from collections import Counter

import pandas as pd

In [2]:
# Parse the dataset file into `data`; later cells iterate over it sample by sample.
with open('data/dataset.json', encoding="utf8") as dataset_file:
    data = json.load(dataset_file)

In [3]:
# Keep only the 'predict' payload of every sample.
target_sents = [record['predict'] for record in data]

In [4]:
# Flatten the nested records into columns. Nested keys are joined with '.',
# which is why later cells address 'entity.text' and 'entity.label'.
df = pd.json_normalize(target_sents, sep='.')

In [5]:
# Preview the first five flattened rows.
df.head(5)

Unnamed: 0,text,midas,entity.text,entity.label
0,I like Google Chrome.,opinion,Google,softwareapplication
1,Was Inspect Gadget a comics on Paper or a cart...,yes_no_question,TV,device
2,Over 1.5 million living animal species have be...,statement,1.5 million,number
3,Someone else can fill Harrison Fords shoes?,statement,Harrison Fords,person
4,Does Chewbacca appear in this movie?,yes_no_question,Chewbacca,person


In [6]:
def normalize_text(row):
    """Replace standalone mentions of the row's entity with its upper-cased label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'text', 'entity.text' and 'entity.label'.

    Returns:
        str: row['text'] with every occurrence of row['entity.text'] that is
        not directly preceded or followed by an ASCII letter replaced by
        row['entity.label'].upper().
    """
    # Escape the entity so characters such as '.', '+' or '(' match literally
    # instead of being interpreted as regex syntax.
    entity = re.escape(row['entity.text'])
    # The look-arounds keep the entity from matching inside a longer word.
    pattern = "(?<![a-zA-Z])" + entity + "(?![a-zA-Z])"
    label = row['entity.label'].upper()
    # A callable replacement inserts the label verbatim (no backslash/group expansion).
    return re.sub(pattern, lambda _match: label, row['text'])

In [7]:
# Apply normalize_text row by row, handing it only the columns it reads.
normalize_inputs = df.loc[:, ['text', 'entity.text', 'entity.label']]
df['normalized_text'] = normalize_inputs.apply(normalize_text, axis=1)

In [8]:
# Hard-coded label values that are handled as "rare" by the sampling cells.
rare_midas = ['other_answers', 'appreciation', 'dev_command']
rare_entity = ['bookname', 'venue', 'ordinal']

# Boolean masks, computed once and reused for the common/rare splits.
is_rare_midas = df['midas'].isin(rare_midas)
is_rare_entity = df['entity.label'].isin(rare_entity)

df_common_midas = df.loc[~is_rare_midas]
df_rare_midas = df.loc[is_rare_midas]
df_common_entity = df.loc[~is_rare_entity]

In [9]:
# Draw up to 10 rows per common `midas` value / `entity.label` value and one
# row per rare `midas` value.
# - random_state makes the exported samples reproducible across kernel restarts.
# - Shuffling each group and then taking its first 10 rows caps the sample at
#   the group size instead of raising when a group holds fewer than 10 rows.
# - GroupBy.sample keeps the grouping column in the result and avoids
#   groupby.apply, whose handling of grouping columns is deprecated in recent pandas.
common_sampled_by_midas = (
    df_common_midas.groupby('midas').sample(frac=1.0, random_state=42)
    .groupby('midas').head(10)
)
common_sampled_by_entity = (
    df_common_entity.groupby('entity.label').sample(frac=1.0, random_state=42)
    .groupby('entity.label').head(10)
)
rare_sampled_by_midas = df_rare_midas.groupby('midas').sample(n=1, random_state=42)

In [10]:
# Stack the common and rare samples and export label + normalized sentence as TSV.
midas_export_columns = ['midas', 'normalized_text']
midas_sample = pd.concat([
    common_sampled_by_midas[midas_export_columns],
    rare_sampled_by_midas[midas_export_columns],
])
midas_sample.to_csv('data/midas_sample.tsv', sep='\t', index=False)

In [11]:
# Export the per-entity-label sample (label + normalized sentence) as TSV.
entity_sample = common_sampled_by_entity[['entity.label', 'normalized_text']]
entity_sample.to_csv('data/entity_sample.tsv', sep='\t', index=False)

In [12]:
# Combined label: the `midas` value and the entity label joined by a single space.
df['midas_entity'] = [
    ' '.join(label_pair)
    for label_pair in zip(df['midas'], df['entity.label'])
]

In [13]:
# Frequency table of the combined labels with columns 'label' and 'counts'.
label_count = (
    df['midas_entity']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='counts')
)

In [14]:
# Combined labels occurring at most 11 times are handled as rare.
# NOTE(review): the sampling cells draw 10 rows per common label; confirm 11 is the intended cut-off.
rare_midas_entity = label_count.loc[label_count['counts'] <= 11, 'label'].tolist()

In [15]:
# Rows whose combined label is not in the rare list.
df_common_midas_entity = df[~df.midas_entity.isin(rare_midas_entity)]
# Draw 10 rows per combined label. random_state makes the exported sample
# reproducible across kernel restarts. GroupBy.sample keeps the grouping
# column in the result and avoids groupby.apply, whose handling of grouping
# columns is deprecated in recent pandas.
common_sampled_by_midas_and_entity = df_common_midas_entity.groupby(
    'midas_entity').sample(n=10, random_state=42)

In [16]:
# Export the sample of common combined labels (label + normalized sentence) as TSV.
midas_entity_sample = common_sampled_by_midas_and_entity[['midas_entity', 'normalized_text']]
midas_entity_sample.to_csv('data/midas_entity_sample.tsv', sep='\t', index=False)

In [17]:
# Rows whose combined label is in the rare list.
df_rare_midas_entity = df[df.midas_entity.isin(rare_midas_entity)]
# Draw one row per rare combined label. random_state makes the exported sample
# reproducible across kernel restarts. GroupBy.sample keeps the grouping
# column in the result and avoids groupby.apply, whose handling of grouping
# columns is deprecated in recent pandas.
rare_sampled_by_midas_and_entity = df_rare_midas_entity.groupby(
    'midas_entity').sample(n=1, random_state=42)

In [18]:
# Export the sample of rare combined labels (label + normalized sentence) as TSV.
midas_entity_rare_sample = rare_sampled_by_midas_and_entity[['midas_entity', 'normalized_text']]
midas_entity_rare_sample.to_csv('data/midas_entity_rare_sampled.tsv', sep='\t', index=False)

In [19]:
# Export every sentence, grouped by combined label and shuffled within each group.
# random_state makes the row order of the exported file reproducible.
# GroupBy.sample keeps the 'midas_entity' column in the output and avoids
# groupby.apply, whose handling of grouping columns is deprecated in recent pandas.
all_target_sentences = df[['midas_entity', 'normalized_text']].groupby(
    'midas_entity').sample(frac=1.0, random_state=42)
all_target_sentences.to_csv('data/all_target_sentences.tsv', sep='\t', index=False)