In [1]:
import pandas as pd
import sqlite3
import sqlalchemy 
import json
from datasets import load_dataset
from collections import Counter
from pydantic.dataclasses import dataclass
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DB_PATH = "./subset_selection/sst_results.db"

# `with sqlite3.connect(...)` only wraps a transaction (commit/rollback); it does
# not close the connection, so close it explicitly once the frame is loaded.
conn = sqlite3.connect(DB_PATH)
try:
    # pandas needs a table name, so look it up in the SQLite schema catalogue.
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_rows = cursor.fetchall()
    if not table_rows:
        # sqlite3.connect() silently creates an empty database for a wrong path;
        # fail with a clear message instead of an IndexError.
        raise ValueError(f"No tables found in {DB_PATH}")
    # Only the first table listed in the catalogue is read.
    table_name = table_rows[0][0]
    print(f"Table Name : {table_name}")

    # Quote the identifier so table names containing spaces or keywords still parse.
    quoted_table_name = '"' + table_name.replace('"', '""') + '"'
    df = pd.read_sql_query(f"SELECT * FROM {quoted_table_name}", conn)
finally:
    conn.close()

Table Name : states


In [3]:
df

Unnamed: 0,indexes,objective
0,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.000000
1,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.690498
2,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.697738
3,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.690950
4,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.709502
...,...,...
8566,"[517, 882, 82, 341, 477, 381, 611, 377, 837, 2...",0.747964
8567,"[48, 882, 82, 341, 477, 381, 611, 377, 837, 21...",0.752941
8568,"[517, 882, 82, 341, 477, 381, 611, 377, 837, 2...",0.742081
8569,"[517, 882, 82, 341, 477, 381, 611, 377, 837, 2...",0.751131


In [4]:
# idxmax() returns an index *label*, so the row is fetched with .loc (label-based)
# rather than .iloc (position-based); the two only agree on a default RangeIndex.
optimal_subset_idx = df["objective"].idxmax()

# The `indexes` column stores each subset as a JSON-encoded list; keep the
# best-scoring subset as a set for fast membership tests.
optimal_subset_data_indices = set(json.loads(df.loc[optimal_subset_idx, "indexes"]))

In [5]:
# Parse every JSON-encoded subset exactly once and reuse the result below.
indexes = [json.loads(raw) for raw in df["indexes"].to_list()]

# All data indices that appear in any subset (flattened), and their de-duplicated form.
indexes_list = [item for sublist in indexes for item in sublist]
indexes_set = set(indexes_list)

# count the unique number of data points in each subset, then tally how often each size occurs
unique_index_counts = [len(set(subset)) for subset in indexes]
subset_sizes = Counter(unique_index_counts)

In [6]:
# Load the "sst" dataset; no config is given, so the library falls back to its default.
sst2 = load_dataset("sst")

# Working pool: the first 1000 rows of the train split after a seed-0 shuffle.
data_pool = (
    sst2["train"]
    .shuffle(seed=0)
    .select(range(1000))
)

No config specified, defaulting to: sst/default
Reusing dataset sst (/Users/garylai/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
100%|██████████| 3/3 [00:00<00:00, 309.03it/s]
Loading cached shuffled indices for dataset at /Users/garylai/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-db8800a2b70639e2.arrow


In [26]:
@dataclass(frozen=True)
class OptimalSubsetClassifierConfig:
    """Immutable settings for the optimal-subset classifier experiment."""
    # Sequence length handed to the tokenizer for padding and truncation.
    max_length: int = 66
    # Not read in the visible cells — presumably a debug-mode switch; confirm before relying on it.
    debug: bool = False
    # Hugging Face checkpoint name used to load the tokenizer.
    model_name: str = "google/electra-small-discriminator"
    # Not read in the visible cells — presumably the training batch size; confirm.
    batch_size: int = 8

# Instantiate with the defaults above and load the tokenizer for that checkpoint.
config = OptimalSubsetClassifierConfig()
tokenizer = AutoTokenizer.from_pretrained(config.model_name)



In [32]:
def preprocess(data):
    """Tokenize a dataset and attach binary subset-membership labels.

    Each row's ``sentence`` is tokenized with the notebook-level ``tokenizer``,
    padded/truncated to ``config.max_length``. A row gets ``labels`` = 1 when
    its position in ``data`` is in ``optimal_subset_data_indices`` and 0
    otherwise. All original columns are dropped and the result is switched to
    torch format.

    Args:
        data: Dataset exposing ``map`` and ``column_names`` whose rows carry a
            ``sentence`` field.

    Returns:
        The mapped dataset after ``set_format(type="torch")``.
    """

    def _encode_row(row, position):
        # `position` is the row's index within `data`, supplied by with_indices=True.
        encoded = tokenizer(
            row["sentence"],
            padding="max_length",
            max_length=config.max_length,
            truncation=True,
        )
        encoded["labels"] = int(position in optimal_subset_data_indices)
        return encoded

    encoded_ds = data.map(
        _encode_row,
        batched=False,
        with_indices=True,
        remove_columns=data.column_names,
    )
    encoded_ds.set_format(type="torch")
    return encoded_ds

In [44]:
# Tokenize and label the whole pool; rows whose position is in the best-scoring subset get label 1.
ds = preprocess(data_pool)

100%|██████████| 1000/1000 [00:00<00:00, 2016.21ex/s]


In [45]:
# Spot check: row 3 is expected to be in the optimal subset for this results DB
# (run-specific; the exhaustive check over every row follows in the next cell).
assert int(ds["labels"][3]) == 1

In [46]:
# Verify every stored label equals membership of that row position in the optimal subset.
for idx, label in enumerate(ds["labels"]):
    expected_label = 1 if idx in optimal_subset_data_indices else 0
    assert int(label) == expected_label

In [47]:
ds[3]

{'input_ids': tensor([  101,  2023,  2003,  1996,  2785,  1997,  3395,  3043,  2008,  2071,
          2061,  4089,  2031,  2042, 20054,  2011,  1037,  8276, 12127,  1010,
          2021,  1037, 16363,  2015,  3084,  1996,  2157,  9804,  2012,  2296,
          2735,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0,

In [53]:
# Whole number of negatives per positive. `//` floors the ratio, so the fractional part is dropped.
# Counts negatives as pool size minus subset size — assumes every optimal index lies inside the pool.
neg_to_pos_ratio = (len(data_pool) - len(optimal_subset_data_indices)) // len(optimal_subset_data_indices)
neg_to_pos_ratio

9

In [55]:
type(optimal_subset_data_indices), type(data_pool)

(set, datasets.arrow_dataset.Dataset)

In [56]:
# Every valid row position in the pool, derived from the pool itself so it
# cannot drift from the size chosen when the pool was selected.
data_pool_indices = set(range(len(data_pool)))

In [57]:
# Negative class: pool positions that are not part of the optimal subset.
non_optimal_subset_data_indices = data_pool_indices - optimal_subset_data_indices

In [59]:
len(non_optimal_subset_data_indices)

904

In [61]:
def _split_by_fractions(items, train_frac=0.8, train_valid_frac=0.9):
    """Cut ``items`` into consecutive train/valid/test slices.

    Args:
        items: Ordered list to slice.
        train_frac: Fraction of ``items`` that ends the train slice.
        train_valid_frac: Fraction of ``items`` that ends the valid slice.

    Returns:
        ``(train, valid, test, split_points)`` where ``split_points`` is the
        pair of rounded cut positions.
    """
    split_points = (round(len(items) * train_frac), round(len(items) * train_valid_frac))
    first_cut, second_cut = split_points
    return items[:first_cut], items[first_cut:second_cut], items[second_cut:], split_points


# Iteration order of a set is an implementation detail, so sort before slicing
# to make the split reproducible across runs and interpreters.
optimal_subset_data_indices_ls = sorted(optimal_subset_data_indices)
non_optimal_subset_data_indices_ls = sorted(non_optimal_subset_data_indices)

# Split each class separately so all three splits keep the pool's class balance.
# get the positive examples
train_pos_indices, valid_pos_indices, test_pos_indices, split_points = _split_by_fractions(
    optimal_subset_data_indices_ls
)
print(f"Split points positive: {split_points}")

# get the negative examples
train_neg_indices, valid_neg_indices, test_neg_indices, split_points = _split_by_fractions(
    non_optimal_subset_data_indices_ls
)
print(f"Split points negative: {split_points}")

# combine (positives first, then negatives, within each split)
train_indices = train_pos_indices + train_neg_indices
valid_indices = valid_pos_indices + valid_neg_indices
test_indices = test_pos_indices + test_neg_indices

Split points positive: (77, 86)
Split points negative: (723, 814)


In [62]:
len(train_indices), len(valid_indices), len(test_indices)


(800, 100, 100)

In [66]:
# The three splits must be pairwise disjoint: de-duplicating the combined list must not shrink it.
combined_indices = train_indices + valid_indices + test_indices
assert len(set(combined_indices)) == len(combined_indices)

In [63]:
# Materialise each split by selecting its row positions from the labelled dataset.
train_ds, valid_ds, test_ds = (
    ds.select(split_indices)
    for split_indices in (train_indices, valid_indices, test_indices)
)

In [64]:
len(train_ds), len(valid_ds), len(test_ds)

(800, 100, 100)

In [65]:
type(train_ds), type(valid_ds), type(test_ds)

(datasets.arrow_dataset.Dataset,
 datasets.arrow_dataset.Dataset,
 datasets.arrow_dataset.Dataset)

In [48]:
# 80% train, 20% test + validation (test_size=0.2).
# NOTE(review): no seed is passed here, so this split is presumably not reproducible between runs — confirm.
train_testvalid = ds.train_test_split(test_size=0.2)

# Draft of the second-stage split, kept commented out. The earlier attempt failed with
# "TypeError: train_test_split() got an unexpected keyword argument 'test'": the keyword is
# `test_size`, and the first-stage result is named `train_testvalid`. DatasetDict is not
# imported in the visible cells and would need importing before enabling this.
# # Split the 20% test + valid in half test, half valid
# test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# # gather everyone if you want to have a single DatasetDict
# train_test_valid_dataset = DatasetDict({
#     'train': train_testvalid['train'],
#     'test': test_valid['test'],
#     'valid': test_valid['train']})

TypeError: train_test_split() got an unexpected keyword argument 'test'

In [None]:
# train binary classifier 