In [1]:
import pandas as pd
import sqlite3
import sqlalchemy 
import json
from datasets import load_dataset
from collections import Counter
from pydantic.dataclasses import dataclass
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the SQLite file holding the subset-selection search results.
DB_PATH = "./subset_selection/sst_results.db"

# NOTE: `with sqlite3.connect(...)` only wraps a transaction (commit/rollback); it does
# NOT close the connection. Close it explicitly so the file handle is not leaked.
conn = sqlite3.connect(DB_PATH)
try:
    # To read into a pandas DataFrame we first need to discover the table name.
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    if not tables:
        # sqlite3.connect() silently creates an empty DB when the file is missing,
        # so fail with a clear message instead of an opaque IndexError.
        raise ValueError(f"No tables found in SQLite database at {DB_PATH!r}")
    table_name = tables[0][0]
    print(f"Table Name : {table_name}")

    # Quote the identifier (doubling any embedded quotes) so unusual table names
    # cannot break or alter the query.
    quoted_table_name = '"' + table_name.replace('"', '""') + '"'
    df = pd.read_sql_query(f"SELECT * FROM {quoted_table_name}", conn)
finally:
    conn.close()

Table Name : states


In [3]:
# Display the loaded table; the rendered output elides the middle rows of a long frame.
df

Unnamed: 0,indexes,objective
0,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.000000
1,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.690498
2,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.697738
3,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.690950
4,"[787, 882, 40, 341, 477, 454, 227, 334, 837, 2...",0.709502
...,...,...
8566,"[517, 882, 82, 341, 477, 381, 611, 377, 837, 2...",0.747964
8567,"[48, 882, 82, 341, 477, 381, 611, 377, 837, 21...",0.752941
8568,"[517, 882, 82, 341, 477, 381, 611, 377, 837, 2...",0.742081
8569,"[517, 882, 82, 341, 477, 381, 611, 377, 837, 2...",0.751131


In [4]:
# Index label of the row (search state) with the highest objective value.
optimal_subset_idx = df["objective"].idxmax()

# idxmax() returns an index *label*, so fetch the row with label-based .loc rather than
# position-based .iloc. The two only coincide while the frame keeps its default
# 0..n-1 index. The "indexes" cell is a JSON-encoded list, decoded here into a set
# for fast membership tests.
optimal_subset_data_indices = set(json.loads(df.loc[optimal_subset_idx, "indexes"]))

In [5]:
# count the unique number of data points in each subset
indexes_list = list(map(json.loads, df["indexes"].to_list()))
indexes_list = [item for sublist in indexes_list for item in sublist]
indexes_set = set(indexes_list)
indexes = list(map(json.loads, df["indexes"].to_list()))
unique_index_counts = list(map(lambda x: len(set(x)), indexes))
subset_sizes = Counter(unique_index_counts)

In [6]:
# Load the "sst" dataset (note: the variable is called `sst2`, but the dataset
# requested here is "sst").
sst2 = load_dataset("sst")
# Shuffle the train split with a fixed seed, then keep the first 1000 rows as the pool.
shuffled_train = sst2["train"].shuffle(seed=0)
data_pool = shuffled_train.select(range(1000))

No config specified, defaulting to: sst/default
Reusing dataset sst (/Users/garylai/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
100%|██████████| 3/3 [00:00<00:00, 309.03it/s]
Loading cached shuffled indices for dataset at /Users/garylai/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-db8800a2b70639e2.arrow


In [26]:
@dataclass(frozen=True)
class OptimalSubsetClassifierConfig:
    """Frozen (immutable) settings for the optimal-subset classifier cells.

    All fields have defaults, so ``OptimalSubsetClassifierConfig()`` yields the
    configuration used by this notebook.
    """
    # Fixed sequence length passed to the tokenizer as `max_length` (padding/truncation).
    max_length: int = 66
    # Not read in the visible cells — presumably a debug toggle; TODO confirm.
    debug: bool = False
    # Checkpoint name handed to AutoTokenizer.from_pretrained.
    model_name: str = "google/electra-small-discriminator"
    # Not read in the visible cells — presumably the training batch size; TODO confirm.
    batch_size: int = 8

# Notebook-wide configuration instance (all defaults).
config = OptimalSubsetClassifierConfig()
# Tokenizer for the configured checkpoint; used by the preprocessing step.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)



In [32]:
def preprocess(data):
    """Tokenize ``data`` and label each example by optimal-subset membership.

    Relies on the notebook-level ``tokenizer``, ``config`` and
    ``optimal_subset_data_indices`` defined in earlier cells.

    Args:
        data: Dataset-like object exposing ``map`` and ``column_names`` whose rows
            carry a ``"sentence"`` field.

    Returns:
        The mapped dataset with the original columns removed. Each row holds the
        tokenizer output plus ``labels`` (1 when the row's position in ``data`` is in
        ``optimal_subset_data_indices``, otherwise 0). Its format is set to ``"torch"``.
    """

    def _encode_example(example, position):
        # Pad/truncate each sentence to the configured fixed length.
        encoding = tokenizer(
            example["sentence"],
            padding="max_length",
            max_length=config.max_length,
            truncation=True,
        )
        is_selected = position in optimal_subset_data_indices
        encoding["labels"] = 1 if is_selected else 0
        return encoding

    # Map one example at a time so `position` is a single row index.
    encoded = data.map(
        _encode_example,
        with_indices=True,
        batched=False,
        remove_columns=data.column_names,
    )
    encoded.set_format(type="torch")
    return encoded

In [33]:
# Tokenize the data pool and attach the optimal-subset membership labels.
train_ds = preprocess(data_pool)

100%|██████████| 1000/1000 [00:00<00:00, 1613.53ex/s]


In [36]:
# Spot check one example. NOTE(review): assumes pool position 3 belongs to the optimal
# subset in the current results DB — confirm if the DB or the shuffle seed changes.
assert int(train_ds["labels"][3]) == 1

In [37]:
# test the labels are correct
for idx, label in enumerate(train_ds["labels"]):
    if idx in optimal_subset_data_indices:
        assert int(label) == 1
    else:
        assert int(label) == 0

In [None]:
# TODO: preprocess the data in the data pool

In [None]:
# TODO: train a binary classifier