In [2]:
from collections import Counter
from nltk import edit_distance
import pandas as pd
from sklearn.model_selection import train_test_split, ShuffleSplit, GroupShuffleSplit
import datasets
from datasets import Dataset, DatasetDict
from functools import reduce
import numpy as np

In [8]:
train_df = pd.read_json("../OpenTable/mturk/dataset/dataset-2022-02-24-train.json")
dev_df = pd.read_json("../OpenTable/mturk/dataset/dataset-2022-02-24-dev.json")
test_df = pd.read_json("../OpenTable/mturk/dataset/dataset-2022-02-24-test.json")

In [11]:
def get_absa_labels_df(
    df,
    aspect_label_encode={
        "Negative":0,
        "Positive":1,
        "unknown":2,
        "no majority": 2,
    },
    sequence_label_encode={
        "5": 4,
        "4": 3,
        "3": 2,
        "2": 1,
        "1": 0,
        "no majority": -1, # will be dropped!
    },
):
    train_df = df.copy()
    train_df['group_id'] = train_df['original_description'].apply(hash)
    train_df['group_id'] = train_df['group_id'].rank(method='dense', ascending=False).astype(int)
    
    original_train_df = train_df[
        [
            'group_id',
            'original_description', 
            'type', 
            'original_majority', 
            'original_review_majority' 
        ]
    ].rename(
        columns={
            'original_description': 'text', 
            'type': 'aspect', 
            'original_majority': 'aspect_label',
            'original_review_majority': 'sentence_label'
        }
    )
    original_train_df["original"] = True
    
    edit_train_df = train_df[
        [
            'group_id',
            'edit_description', 
            'type', 
            'edit_majority', 
            'edit_review_majority' 
        ]
    ].rename(
        columns={
            'edit_description': 'text', 
            'type': 'aspect', 
            'edit_majority': 'aspect_label',
            'edit_review_majority': 'sentence_label'
        }
    )
    edit_train_df["original"] = False

    original_train_df = original_train_df.replace(
        {
            "aspect_label": aspect_label_encode, 
            "sentence_label": sequence_label_encode
        }
    ).drop_duplicates()
    edit_train_df = edit_train_df.replace(
        {
            "aspect_label": aspect_label_encode, 
            "sentence_label": sequence_label_encode
        }
    ).drop_duplicates()
    
    original_train_df = original_train_df.reset_index(drop=True)
    edit_train_df = edit_train_df.reset_index(drop=True)
    
    pivot_original_train_df = pd.pivot_table(
        original_train_df,
        values='aspect_label',
        columns='aspect',
        index=['text', 'sentence_label', 'group_id', 'original'],
    )
    pivot_original_train_df.reset_index(inplace=True)
    pivot_original_train_df.columns = [
        "text", "label", "group_id", "original",
        "ambiance_label", "food_label", "noise_label", "service_label"
    ]

    pivot_edit_train_df = pd.pivot_table(
        edit_train_df,
        values='aspect_label',
        columns='aspect',
        index=['text', 'sentence_label', 'group_id', 'original'],
    )
    pivot_edit_train_df.reset_index(inplace=True)
    pivot_edit_train_df.columns = [
        "text", "label", "group_id", "original",
        "ambiance_label", "food_label", "noise_label", "service_label"
    ]
    
#     for aspect_label in ["ambiance_label", "food_label", "noise_label", "service_label"]:
#         pivot_edit_train_df[aspect_label] = np.where(
#             pivot_edit_train_df[aspect_label].isnull(),
#             pivot_edit_train_df['group_id'].map(
#                 pivot_original_train_df.set_index('group_id')[aspect_label]
#             ),
#             pivot_edit_train_df[aspect_label]
#         )
#         pivot_edit_train_df[aspect_label] = pivot_edit_train_df[aspect_label].astype(int)
    
    pivot_train_df = pd.concat((pivot_original_train_df, pivot_edit_train_df))
    pivot_train_df = pivot_train_df[(pivot_train_df["label"]!=-1)].drop_duplicates(subset='text')
    pivot_train_df.reset_index(drop=True, inplace=True)
    pivot_train_df = pivot_train_df.fillna(-1)
    for aspect_label in ["ambiance_label", "food_label", "noise_label", "service_label"]:
        pivot_train_df[aspect_label] = pivot_train_df[aspect_label].astype(int)
    
    ood_train_df = []
    for aspect_label in ["ambiance_label", "food_label", "noise_label", "service_label"]:

        ood_0 = pivot_train_df[
            (pivot_train_df[aspect_label]==1) &
            (pivot_train_df["label"]==0)
        ]

        ood_1 = pivot_train_df[
            (pivot_train_df[aspect_label]==1) &
            (pivot_train_df["label"]==1)
        ]

        ood_3 = pivot_train_df[
            (pivot_train_df[aspect_label]==0) &
            (pivot_train_df["label"]==3)
        ]

        ood_4 = pivot_train_df[
            (pivot_train_df[aspect_label]==0) &
            (pivot_train_df["label"]==4)
        ]

        ood_train_df += [pd.concat(
            (ood_0, ood_1, ood_3, ood_4)
        )]
    ood_train_df = pd.concat(ood_train_df).drop_duplicates()
    
    iid_train_df = pivot_train_df.iloc[
        pivot_train_df.index.difference(
            ood_train_df.index
        )
    ]
    
    return iid_train_df, ood_train_df, pivot_train_df

In [12]:
k=0
seed=7
iid_train_df, ood_train_df, all_train_df = get_absa_labels_df(train_df)
if k > 0:
    if k >= len(ood_train_df):
        k = len(ood_train_df)
        k_shots_train = ood_train_df
    else:
        _, k_shots_train = train_test_split(
            ood_train_df, test_size=k, random_state=seed
        )
    iid_train_df = pd.concat((
        iid_train_df, 
        k_shots_train, 
    ))

iid_dev_df, ood_dev_df, all_dev_df = get_absa_labels_df(
    dev_df,
    sequence_label_encode={
        5: 4,
        4: 3,
        3: 2,
        2: 1,
        1: 0,
    }
)
iid_test_df, ood_test_df, all_test_df = get_absa_labels_df(
    test_df,
    sequence_label_encode={
        5: 4,
        4: 3,
        3: 2,
        2: 1,
        1: 0,
    }
)

In [13]:
iid_train_df = Dataset.from_pandas(iid_train_df)

all_dev_df = Dataset.from_pandas(all_dev_df)
ood_dev_df = Dataset.from_pandas(ood_dev_df)
iid_dev_df = Dataset.from_pandas(iid_dev_df)

all_test_df = Dataset.from_pandas(all_test_df)
ood_test_df = Dataset.from_pandas(ood_test_df)
iid_test_df = Dataset.from_pandas(iid_test_df)

opentable_seq_cls_dataset = DatasetDict()
opentable_seq_cls_dataset['train'] = iid_train_df

opentable_seq_cls_dataset['validation'] = all_dev_df
opentable_seq_cls_dataset['validation_ood'] = ood_dev_df
opentable_seq_cls_dataset['validation_iid'] = iid_dev_df

opentable_seq_cls_dataset['test'] = all_test_df
opentable_seq_cls_dataset['test_ood'] = ood_test_df
opentable_seq_cls_dataset['test_iid'] = iid_test_df

opentable_seq_cls_dataset.save_to_disk(f"./datasets/sequence_classification_ood.k_{k}.seed_{seed}/")

In [14]:
opentable_seq_cls_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'group_id', 'original', 'ambiance_label', 'food_label', 'noise_label', 'service_label', '__index_level_0__'],
        num_rows: 8469
    })
    validation: Dataset({
        features: ['text', 'label', 'group_id', 'original', 'ambiance_label', 'food_label', 'noise_label', 'service_label'],
        num_rows: 1660
    })
    validation_ood: Dataset({
        features: ['text', 'label', 'group_id', 'original', 'ambiance_label', 'food_label', 'noise_label', 'service_label', '__index_level_0__'],
        num_rows: 222
    })
    validation_iid: Dataset({
        features: ['text', 'label', 'group_id', 'original', 'ambiance_label', 'food_label', 'noise_label', 'service_label', '__index_level_0__'],
        num_rows: 1438
    })
    test: Dataset({
        features: ['text', 'label', 'group_id', 'original', 'ambiance_label', 'food_label', 'noise_label', 'service_label'],
        num_rows: 1678
    })
    test_ood: Dataset(