In [1]:
import json
import pandas as pd

## Creating original dataset files from data_full.json

In [2]:
# Parse the full dataset JSON file into `data`.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('../original_repository_files/data/data_full.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

In [3]:
# Training frame: join the 'train' and 'oos_train' record lists from `data`,
# name the two columns, then shuffle the rows.
df_train_data = pd.DataFrame(data['train'] + data['oos_train'])
df_train_data.columns = ['query', 'label']
# A fixed random_state makes the shuffled row order identical on every run,
# so re-executing the notebook reproduces the same frame.
df_train_data = df_train_data.sample(frac=1, random_state=42).reset_index(drop=True)
df_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15100 entries, 0 to 15099
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   15100 non-null  object
 1   label   15100 non-null  object
dtypes: object(2)
memory usage: 236.1+ KB


In [5]:
# Validation frame: join the 'val' and 'oos_val' record lists from `data`,
# name the two columns, then shuffle the rows.
df_val_data = pd.DataFrame(data['val'] + data['oos_val'])
df_val_data.columns = ['query', 'label']
# A fixed random_state makes the shuffled row order identical on every run,
# so re-executing the notebook reproduces the same frame.
df_val_data = df_val_data.sample(frac=1, random_state=42).reset_index(drop=True)
df_val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3100 entries, 0 to 3099
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   3100 non-null   object
 1   label   3100 non-null   object
dtypes: object(2)
memory usage: 48.6+ KB


In [7]:
# Test frame: join the 'test' and 'oos_test' record lists from `data`,
# name the two columns, then shuffle the rows.
df_test_data = pd.DataFrame(data['test'] + data['oos_test'])
df_test_data.columns = ['query', 'label']
# A fixed random_state makes the shuffled row order identical on every run,
# so re-executing the notebook reproduces the same frame.
df_test_data = df_test_data.sample(frac=1, random_state=42).reset_index(drop=True)
df_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5500 entries, 0 to 5499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   5500 non-null   object
 1   label   5500 non-null   object
dtypes: object(2)
memory usage: 86.1+ KB


In [8]:
# Write each split to its CSV file; index=False keeps the row index out of
# the output.
for frame, destination in (
    (df_train_data, "original_dataset/original_train_data.csv"),
    (df_val_data, "original_dataset/original_val_data.csv"),
    (df_test_data, "original_dataset/original_test_data.csv"),
):
    frame.to_csv(destination, index=False)

## Creating augmented dataset files with generated queries

In [9]:
# Read the three original-split CSV files back in, one frame per split.
# Unpacking consumes the map, so all three files are read in this cell.
og_train_data, og_val_data, og_test_data = map(
    pd.read_csv,
    (
        'original_dataset/original_train_data.csv',
        'original_dataset/original_val_data.csv',
        'original_dataset/original_test_data.csv',
    ),
)

In [12]:
# Read the 'generated_data_v2' sheet of the generated-queries workbook and
# lower-case the text in its 'query' column.
generated_data = pd.read_excel(
    'generated_queries/generated_data.xlsx',
    sheet_name='generated_data_v2',
).assign(query=lambda frame: frame['query'].str.lower())
generated_data.head()

Unnamed: 0,query,label
0,can i make a reservation at the red lobster ne...,accept_reservations
1,is it possible to book a table at olive garden...,accept_reservations
2,i want to know if the cheesecake factory takes...,accept_reservations
3,does applebee's accept reservations?,accept_reservations
4,can i reserve a table at outback steakhouse?,accept_reservations


In [13]:
# Parse the additional dataset JSON file into `oos_data`.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('../original_repository_files/data/data_oos_plus.json', encoding='utf-8') as json_data:
    oos_data = json.load(json_data)

additional_oos_data = pd.DataFrame(oos_data['oos_train'])
additional_oos_data.columns = ['query', 'label']

# Randomly select 100 rows from the dataframe (fixed seed, so the same rows
# are chosen on every run).
random_rows = additional_oos_data.sample(n=100, random_state=42)

# Reduce the dataframe to the 100 random rows. `sample` already returns those
# rows with their original index labels, so an independent copy of the sample
# is enough; no second lookup by index is needed.
additional_oos_data = random_rows.copy()

In [14]:
# Stack the original training rows, the generated queries and the sampled
# additional rows into one frame.
# NOTE(review): despite the "test" in its name, this frame is built from
# og_train_data; the name is kept so cells that reference it keep working.
augmented_test_data = pd.concat([og_train_data, generated_data, additional_oos_data], axis=0)
# Shuffle with a fixed random_state so the row order is identical on every
# run, then renumber the index from 0.
augmented_test_data = augmented_test_data.sample(frac=1, random_state=42).reset_index(drop=True)
augmented_test_data.head()

Unnamed: 0,query,label
0,can you check my reservations for mortons unde...,confirm_reservation
1,find me a place to stay in cali november 11 to 15,book_hotel
2,i need you to increase the speaker volume,change_volume
3,can you inform me of the time?,time
4,can i set up direct deposit for my fellowship ...,direct_deposit


In [15]:
# Write the augmented frame to CSV; index=False keeps the row index out of
# the output.
augmented_test_data.to_csv(
    "augmented_dataset/augmented_train_data.csv",
    index=False,
)