In [2]:
from collections import Counter
from nltk import edit_distance
import pandas as pd
from sklearn.model_selection import train_test_split, ShuffleSplit, GroupShuffleSplit
import datasets
from datasets import Dataset, DatasetDict
from functools import reduce
import numpy as np

import json
import os 
import pandas as pd
from collections import defaultdict
import numpy as np



In [3]:
def load_split_new_scheme(splitname, data_dir="../OpenTable/mturk/dataset"):
    """Load one split of the 2022-05-09 dataset dump as a list of flat records.

    Reads ``dataset-2022-05-09-{splitname}.json`` from ``data_dir`` and
    re-serialises every dict-valued field to a JSON string; all other values
    (strings, numbers, lists, ``None``) are returned untouched.

    Args:
        splitname: Split suffix used in the file name, e.g. ``"train"``.
        data_dir: Directory holding the dump files. Defaults to the location
            the notebook has always read from.

    Returns:
        list[dict]: One dict per record in the file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    filename = os.path.join(data_dir, f"dataset-2022-05-09-{splitname}.json")
    # JSON is UTF-8 text; be explicit so decoding does not depend on the
    # platform's default locale encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    # Nested dicts are flattened to JSON strings -- presumably so each field
    # becomes a plain string column once the records are put in a DataFrame
    # (TODO confirm against downstream consumers).
    return [
        {k: json.dumps(v) if isinstance(v, dict) else v for k, v in d.items()}
        for d in data
    ]

In [4]:
# Read the three dataset splits, in train / dev / test order, one DataFrame each.
train, dev, test = (
    pd.DataFrame(load_split_new_scheme(split_name))
    for split_name in ("train", "dev", "test")
)

In [5]:
def process_df(
    df_in,
    aspect_label_encode={"Negative": 0, "Positive": 1, "unknown": 2, "no majority": 2},
    sequence_label_encode={"5": 1, "4": 1, "3": -1, "2": 0, "1": 0, "no majority": -1},
):
    """Turn a raw split DataFrame into an integer-labelled classification frame.

    Steps, applied to a copy of ``df_in``:

    1. Drop rows whose ``review_majority`` is ``"no majority"``.
    2. Keep the id columns, the text, the majority-vote columns and every
       column whose name contains ``"prediction"``; rename the text column to
       ``text`` and the majority columns to ``label`` / ``<aspect>_label``.
    3. Replace every empty string in the kept columns with ``-1``, then encode
       ``label`` with ``sequence_label_encode`` and the four aspect columns
       with ``aspect_label_encode``.
    4. Drop rows whose encoded ``label`` is ``-1`` (with the default mapping:
       3-star reviews, "no majority" and empty labels).

    Args:
        df_in: Raw DataFrame for one split; it is not modified.
        aspect_label_encode: Mapping from raw aspect label to integer code.
        sequence_label_encode: Mapping from raw review rating to integer code;
            anything mapped to ``-1`` is removed from the result.

    Returns:
        The filtered, renamed and encoded DataFrame. Row index values of the
        surviving rows are those of ``df_in`` (the index is not reset).
    """
    rename_map = {
        "description": "text",
        "review_majority": "label",
        "food_aspect_majority": "food_label",
        "ambiance_aspect_majority": "ambiance_label",
        "service_aspect_majority": "service_label",
        "noise_aspect_majority": "noise_label",
    }
    aspect_columns = ("food_label", "ambiance_label", "service_label", "noise_label")

    frame = df_in.copy()
    prediction_columns = [name for name in frame.columns if "prediction" in name]
    selected = [
        "id",
        "original_id",
        "edit_id",
        "is_original",
        "description",
        "review_majority",
        "food_aspect_majority",
        "ambiance_aspect_majority",
        "service_aspect_majority",
        "noise_aspect_majority",
    ] + prediction_columns

    # Rows without a majority rating carry no usable target.
    has_majority = frame["review_majority"] != "no majority"
    frame = frame[has_majority][selected].rename(columns=rename_map)

    # Per-column value mappings: the rating column first, then each aspect.
    encoders = {"label": sequence_label_encode}
    encoders.update({column: aspect_label_encode for column in aspect_columns})
    # Empty strings become -1 in every kept column before the encoders run.
    frame = frame.replace("", -1).replace(encoders)

    # -1 is the "discard this row" marker for the sequence-level label.
    return frame[frame["label"] != -1]


In [6]:
# Encode each split with process_df and wrap the result as a Hugging Face Dataset.
# NOTE(review): the processed frames keep their pre-filter row index, which is
# presumably why the datasets below carry an `__index_level_0__` column;
# `preserve_index=False` would drop it -- confirm nothing downstream reads it first.
post_train, post_dev, post_test = (
    Dataset.from_pandas(process_df(split_df))
    for split_df in (train, dev, test)
)

In [7]:
# Bundle the processed splits under the split names used when saving;
# the dev split is stored as "validation".
opentable_seq_cls_dataset = DatasetDict(
    {
        "train": post_train,
        "validation": post_dev,
        "test": post_test,
    }
)

In [8]:
# Display the DatasetDict summary (feature names and row count per split).
opentable_seq_cls_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'original_id', 'edit_id', 'is_original', 'text', 'label', 'food_label', 'ambiance_label', 'service_label', 'noise_label', '__index_level_0__'],
        num_rows: 1072
    })
    validation: Dataset({
        features: ['id', 'original_id', 'edit_id', 'is_original', 'text', 'label', 'food_label', 'ambiance_label', 'service_label', 'noise_label', '__index_level_0__'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'original_id', 'edit_id', 'is_original', 'text', 'label', 'food_label', 'ambiance_label', 'service_label', 'noise_label', '__index_level_0__'],
        num_rows: 1228
    })
})

In [10]:
# Sanity check: list the distinct encoded food-aspect labels in the train split.
set(opentable_seq_cls_dataset["train"]["food_label"])

{-1, 0, 1, 2}

In [41]:
# Persist all three splits to disk. The path is a plain string literal:
# the original f-prefix had no placeholders to interpolate.
opentable_seq_cls_dataset.save_to_disk("./datasets/Proxy.CEBaB.sa.2-class.exclusive")