In [1]:
import datasets
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
task_text_field_map = {
     'cola': ['sentence'],
     'sst2': ['sentence'],
     'mrpc': ['sentence1', 'sentence2'],
     'qqp': ['question1', 'question2'],
     'stsb': ['sentence1', 'sentence2'],
     'mnli': ['premise', 'hypothesis'],
     'qnli': ['question', 'sentence'],
     'rte': ['sentence1', 'sentence2'],
     'wnli': ['sentence1', 'sentence2']}

glue_task_num_labels = {
     'cola': 2, 'sst2': 2,
     'mrpc': 2, 'qqp': 2,
     'stsb': 1, 'mnli': 3,
     'qnli': 2, 'rte': 2,
     'wnli': 2}

loader_columns = [
     'input_ids',
     'token_type_ids',
     'attention_mask',
     'label']

In [None]:
dataframes = dict()
for name in list(task_text_field_map.keys()):
    dataframes[name] = datasets.load_dataset('glue', name)

In [5]:
dataframes

{'cola': DatasetDict({
     train: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 8551
     })
     validation: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 1043
     })
     test: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 1063
     })
 }),
 'sst2': DatasetDict({
     train: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 67349
     })
     validation: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 872
     })
     test: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 1821
     })
 }),
 'mrpc': DatasetDict({
     train: Dataset({
         features: ['sentence1', 'sentence2', 'label', 'idx'],
         num_rows: 3668
     })
     validation: Dataset({
         features: ['sentence1', 'sentence2', 'label', 'idx'],
         num_rows: 408
     })
     test: Dataset({
         features: ['sentence1', 'se

In [6]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.39MB/s]


In [8]:
def convert_to_features(example_batch):
    if len(text_fields) > 1:
        texts_or_text_pairs = list(zip(example_batch[text_fields[0]], 
                                               example_batch[text_fields[1]]))
    else:
        texts_or_text_pairs = example_batch[text_fields[0]]
                
    features = tokenizer.batch_encode_plus(texts_or_text_pairs, add_special_tokens=True) 
            
    features['label'] = example_batch['label']
    return features

In [9]:
for name in dataframes.keys():
    text_fields = task_text_field_map[name]
    #num_labels = glue_task_num_labels[name]
    dataframe = dataframes[name]
    for split in dataframe.keys():
        dataframe[split] = dataframe[split].map(convert_to_features, batched=True)
        columns = [c for c in dataframe[split].column_names if c in loader_columns]
        dataframe[split].set_format(type="torch", columns=columns)
    dataframes[name] = dataframe

Map:   2%|▏         | 2000/104743 [00:00<00:11, 8894.38 examples/s]   Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors
                                                                     

In [22]:
for name in dataframes.keys():
    try:
        text_fields = task_text_field_map[name] + ['idx']
        dataframes[name] = dataframes[name].remove_columns(text_fields)
    except:
        print('already done.')

already done.
already done.
already done.
already done.
already done.
already done.
already done.
already done.
already done.


In [21]:
dataframes

{'cola': DatasetDict({
     train: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 8551
     })
     validation: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 1043
     })
     test: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 1063
     })
 }),
 'sst2': DatasetDict({
     train: Dataset({
         features: ['label', 'idx', 'input_ids', 'attention_mask'],
         num_rows: 67349
     })
     validation: Dataset({
         features: ['label', 'idx', 'input_ids', 'attention_mask'],
         num_rows: 872
     })
     test: Dataset({
         features: ['label', 'idx', 'input_ids', 'attention_mask'],
         num_rows: 1821
     })
 }),
 'mrpc': DatasetDict({
     train: Dataset({
         features: ['label', 'idx', 'input_ids', 'attention_mask'],
         num_rows: 3668
     })
     validation: Dataset({
         features: ['label', 'idx', 'input_ids', 'att

In [33]:
import pandas

def summary_data(dataframe):
    def process(sub_df):
        temp_dict = dict()
        temp_dict['label'] = list(sub_df['label'])
        temp_dict['sent_length'] = [len(x) for x in sub_df['input_ids']]
        return pandas.DataFrame(temp_dict)
    temp_dict = [process(dataframe[name]) for name in dataframe.keys()]
    return temp_dict

In [34]:
df_list = [summary_data(dataframes[name]) for name in dataframes.keys()]

In [57]:
def analyze_data(df):
    df['label'] = [int(x) for x in list(df['label'])] 
    print(int(df.describe()['sent_length']['max']))

In [58]:
for tdx, name in enumerate(dataframes.keys()):
    print(f'{name} data summary')
    for idx, type in enumerate(['train','validation','test']):
        print(f'{type} set')
        analyze_data(df_list[tdx][idx])
    print('================')

cola data summary
train set
47
validation set
36
test set
34
sst2 data summary
train set
67
validation set
63
test set
63
mrpc data summary
train set
104
validation set
84
test set
99
qqp data summary
train set
317
validation set
191
test set
471
stsb data summary
train set
125
validation set
91
test set
82
mnli data summary
train set
425
validation set
229
test set
220
qnli data summary
train set
566
validation set
259
test set
309
rte data summary
train set
292
validation set
248
test set
248
wnli data summary
train set
109
validation set
107
test set
99


In [64]:
for sub_df in df_list[6]:
    #print(sub_df.describe())
    print(sub_df[sub_df['sent_length'] > 512])
    print(f'total df length : {len(sub_df)}')
    break

       label  sent_length
2711       0          565
82398      0          566
82549      0          565
84277      0          564
88427      0          564
total df length : 104743
