# Creating `datasets.Dataset` (from Hugging Face)

In [9]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from datasets import Dataset       # About dataset: https://huggingface.co/docs/datasets/master/en/package_reference/main_classes#datasets.Dataset

#### Model inputs
https://huggingface.co/transformers/v3.2.0/glossary.html#:~:text=Token%20Type%20IDs,-Some%20models'%20purpose&text=They%20are%20represented%20as%20a,%2C%201%2C%201%2C%201%5D

1. **Input IDs** - tokens IDs
2. **Token type IDs** - segment IDs (only meaningful for QA and sentence-pair classification - kept all 0 here)
3. **Attention mask** - whether the model should attend to the token (1) or ignore it (0)
    - **[PAD] with attention mask = 0**

### Convert data into `datasets.Dataset` format

In [2]:
def creatingDatasetPd(tokenDataset):
    """Build a Hugging Face ``Dataset`` of model inputs from a table of token IDs.

    Parameters
    ----------
    tokenDataset
        Table whose ``.values.tolist()`` yields one list of token IDs per
        sample (the notebook passes a pandas DataFrame read from CSV).
        Token ID 0 is treated as [PAD].

    Returns
    -------
    datasets.Dataset
        Dataset with three columns of equal shape:
        ``input_ids`` (the token IDs as given), ``token_type_ids`` (all 0),
        and ``attention_mask`` (0 on [PAD] positions, 1 everywhere else).
    """
    pad_token_id = 0

    # 1. input_ids: one plain Python list per row of the table
    input_ids = tokenDataset.values.tolist()

    # 2. token_type_ids: segment IDs are not used here, so every position is 0
    token_type_ids = [[0] * len(sample) for sample in input_ids]

    # 3. attention_mask: compare by value, not identity (`is not`), so a pad
    #    token is recognised even when it is not the cached int 0 (for
    #    example 0.0 when the CSV was parsed as float).
    attention_mask = [
        [0 if token == pad_token_id else 1 for token in sample]
        for sample in input_ids
    ]

    dictforDataset = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
    }
    return Dataset.from_dict(dictforDataset)     # Convert it from dict

#### 1. Try with `chunked_trainDataset.csv`
- From `chunked_trainDataset.csv`: with 154696 training samples, each of them is 128 tokens long

In [4]:
# Load the chunked token table and convert it into a Dataset.
df_c = pd.read_csv("chunked_trainDataset.csv")   # name of the .csv file you want to convert into Dataset
# 'Unnamed: 0' is presumably the index column written when the CSV was saved; it is not a token position.
# Use the `columns=` keyword: passing axis positionally (`drop(label, 1)`) raises TypeError on pandas >= 2.0.
df_c = df_c.drop(columns='Unnamed: 0')
trainDataset_c = creatingDatasetPd(df_c)

In [5]:
### EXAMPLE - CHECKING ITS DATATYPE AND LENGTH
print(trainDataset_c)

# Fetch row 0 once. `trainDataset_c[column][0]` would first build the whole
# column (every sample) just to read a single entry.
first_sample_c = trainDataset_c[0]
for column_name in ('input_ids', 'token_type_ids', 'attention_mask'):
    print()
    print(f'Example {column_name}: ', first_sample_c[column_name])
    print('Length: ', len(first_sample_c[column_name]))

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 154696
})

Example input_ids:  [101, 1509, 2592, 8179, 2229, 185, 1161, 2495, 1204, 12754, 175, 1207, 15415, 14375, 1116, 174, 7501, 8974, 5531, 2229, 185, 1161, 11937, 7577, 3839, 9505, 17811, 20994, 185, 1513, 12602, 174, 3101, 17268, 185, 1673, 1818, 12858, 25632, 20557, 6873, 5552, 11769, 7409, 4233, 2620, 4248, 15070, 6719, 3621, 2660, 16418, 2050, 14196, 27316, 2999, 16973, 1933, 1286, 13093, 9046, 1439, 7209, 3077, 1181, 3105, 14701, 8362, 16996, 23822, 1895, 13306, 19353, 24211, 1785, 16530, 1286, 3971, 5001, 10346, 2382, 8351, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 1965, 102, 101, 1509, 2592, 8179, 2229, 185, 1161, 2495, 1204, 12754, 1607, 175, 1603, 1757, 2184, 5531, 2229, 185, 1161, 11937, 7577, 9505, 17688, 2394, 2050, 14196, 20844, 5815, 14255, 18834, 1116, 2999, 26600, 191, 2225, 21608, 5332]
Length:  128

Example token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [34]:
# Can use add_column to add a test set into the same dictionary but not really necessary 

#### 2. Try with `padded_trainDataset.csv`
- From `padded_trainDataset.csv`: with 222062 training samples, each of them is 300 tokens long
- No attention to [PAD] tokens 

In [6]:
# Load the padded token table and convert it into a Dataset.
df_p = pd.read_csv("padded_trainDataset.csv")   # name of the .csv file you want to convert into Dataset
# 'Unnamed: 0' is presumably the index column written when the CSV was saved; it is not a token position.
# Use the `columns=` keyword: passing axis positionally (`drop(label, 1)`) raises TypeError on pandas >= 2.0.
df_p = df_p.drop(columns='Unnamed: 0')
trainDataset_p = creatingDatasetPd(df_p)

In [7]:
### EXAMPLE - CHECKING ITS DATATYPE AND LENGTH 
print(trainDataset_p)

# Fetch row 0 once. `trainDataset_p[column][0]` would first build the whole
# column (every sample) just to read a single entry.
first_sample_p = trainDataset_p[0]
for column_name in ('input_ids', 'token_type_ids', 'attention_mask'):
    print()
    print(f'Example {column_name}: ', first_sample_p[column_name])
    print('Length: ', len(first_sample_p[column_name]))

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 222062
})

Example input_ids:  [101, 1509, 2592, 8179, 2229, 185, 1161, 2495, 1204, 12754, 175, 1207, 15415, 14375, 1116, 174, 7501, 8974, 5531, 2229, 185, 1161, 11937, 7577, 3839, 9505, 17811, 20994, 185, 1513, 12602, 174, 3101, 17268, 185, 1673, 1818, 12858, 25632, 20557, 6873, 5552, 11769, 7409, 4233, 2620, 4248, 15070, 6719, 3621, 2660, 16418, 2050, 14196, 27316, 2999, 16973, 1933, 1286, 13093, 9046, 1439, 7209, 3077, 1181, 3105, 14701, 8362, 16996, 23822, 1895, 13306, 19353, 24211, 1785, 16530, 1286, 3971, 5001, 10346, 2382, 8351, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 1965, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Next steps: replace the files above with the masked versions, then carry on with training

==================================================================================================

## Trying the DataLoader

In [13]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64

# Each sample is a dict of plain Python lists. PyTorch's default collate would
# treat those lists as sequences and return, per feature, a list of seq_len
# tensors of shape (batch,). default_data_collator instead stacks the samples
# into one (batch, seq_len) tensor per feature, which is what the model expects.
train_dataloader = DataLoader(
    trainDataset_c,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=default_data_collator,
)