<a href="https://colab.research.google.com/github/datafyresearcher/datafy-finetuning-university/blob/main/notebooks/Basic/02_Custom_Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#===> Run this block, when using the Google Colab. Otherwise, do not run it.

if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
  # Install the package
  ! pip install transformers==4.31.0 datasets==2.14.4 accelerate==0.22.0 -q
else:
  print('Not running on CoLab')

Running on CoLab
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Data preparation

In [2]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

### Tokenizing text

In [3]:
# Load the tokenizer that belongs to the Pythia-70M checkpoint.
pretrained_checkpoint = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [4]:
# Sample sentence used to demonstrate encoding and decoding.
text = "Hi, how are you?"

In [5]:
# Tokenize the sample sentence and keep only the token ids.
encoding = tokenizer(text)
encoded_text = encoding["input_ids"]

In [6]:
# Show the token ids; a bare last expression is rendered as the cell output.
encoded_text

[12764, 13, 849, 403, 368, 32]

In [7]:
# Round trip: turn the token ids back into a string and print it.
decoded_text = tokenizer.decode(encoded_text)
print(
    "Decoded tokens back into text: ",
    decoded_text,
)

Decoded tokens back into text:  Hi, how are you?


### Tokenize multiple texts at once

In [8]:
# Passing a list of strings tokenizes every entry in a single call.
list_texts = [
    "Hi, how are you?",
    "I'm good",
    "Yes",
]
encoded_texts = tokenizer(list_texts)
batch_input_ids = encoded_texts["input_ids"]
print("Encoded several texts: ", batch_input_ids)

Encoded several texts:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]


### Padding and truncation

In [9]:
# Use the end-of-sequence token as the padding token, then pad the batch so
# that all sequences have the same length.
tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(
    list_texts,
    padding=True,
)
padded_input_ids = encoded_texts_longest["input_ids"]
print("Using padding: ", padded_input_ids)

Using padding:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]


In [10]:
# Cap every sequence at three tokens; shorter sequences are left unchanged.
truncation_options = {"max_length": 3, "truncation": True}
encoded_texts_truncation = tokenizer(list_texts, **truncation_options)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

Using truncation:  [[12764, 13, 849], [42, 1353, 1175], [4374]]


In [11]:
# Truncate from the left so the final tokens of each text are kept.
# This attribute stays set on the tokenizer for every later call.
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(
    list_texts,
    truncation=True,
    max_length=3,
)
left_truncated_ids = encoded_texts_truncation_left["input_ids"]
print("Using left-side truncation: ", left_truncated_ids)

Using left-side truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]


In [12]:
# Combine both options: cut long sequences to three tokens and pad short ones.
encoded_texts_both = tokenizer(
    list_texts,
    padding=True,
    truncation=True,
    max_length=3,
)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Using both padding and truncation:  [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]


### Prepare instruction dataset

In [13]:
# Download the Allied Bank question/answer dataset from Google Drive by file id.
# The later cells read it as `allied_bank.json` from the working directory.
!gdown 1FXHHKceNzRDDtC53rqKpckMfB1nLptdv

Downloading...
From: https://drive.google.com/uc?id=1FXHHKceNzRDDtC53rqKpckMfB1nLptdv
To: /content/allied_bank.json
  0% 0.00/13.1k [00:00<?, ?B/s]100% 13.1k/13.1k [00:00<00:00, 48.6MB/s]


In [17]:
# Build a prompt-formatted fine-tuning dataset from the downloaded JSON file.
# `pd` and `pprint` are provided by the imports cell at the top of the notebook.
filename = "allied_bank.json"
instruction_dataset_df = pd.read_json(filename, lines=False)
# Column-oriented view of the frame: {column name: {row label: value}}.
examples = instruction_dataset_df.to_dict()

# Template wrapped around every question. The answer is stored separately so
# it can be appended directly after "### Answer:" when building training text.
prompt_template = """### Question:
{question}

### Answer:"""

# Pair questions with answers in row order. Iterating over the column values
# (rather than indexing with 0..n-1) does not depend on the row labels.
finetuning_dataset = [
  {"question": prompt_template.format(question=question), "answer": answer}
  for question, answer in zip(examples["question"].values(), examples["answer"].values())
]
num_examples = len(finetuning_dataset)

print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'Islamic banking is defined as banking system which is in '
           'consonance with the spirit, ethos and value system of Islam and '
           'governed by the principles laid down by Islamic Shariah. Interest '
           'free banking is a narrow concept denoting a number of banking '
           'instruments or operations which avoid interest. Islamic banking, '
           'the more general term, is based not only to avoid interest-based '
           'transactions prohibited in Islamic Shariah but also to avoid '
           'unethical and un-social practices. In practical sense, Islamic '
           'Banking is the transformation of conventional money lending into '
           'transactions based on tangible assets and real services. The model '
           'of Islamic banking system leads towards the achievement of a '
           'system which helps achieve economic prosperity.',
 'question': '### Question:\nWhat is Islamic Ba

### Tokenize a single example

In [18]:
# Concatenate the prompt-formatted question with its answer and tokenize the
# result into NumPy arrays.
first_example = finetuning_dataset[0]
text = first_example["question"] + first_example["answer"]
tokenized_inputs = tokenizer(text, return_tensors="np", padding=True)
print(tokenized_inputs["input_ids"])

[[ 4118 19782    27   187  1276   310 13281  6022   272    32   187   187
   4118 37741    27 34330   280 19714   310  2931   347 19714   985   534
    310   275   772 47872   342   253  5968    13  5105   375   285  1318
    985   273  8033   285 17886   407   253  9241 10090  1066   407 13281
   1608  8125    73    15  9535  1959 19714   310   247  6891  4473  1850
   5341   247  1180   273 19714 13225   390  5871   534  3693  1600    15
  13281 19714    13   253   625  2087  1307    13   310  1754   417   760
    281  3693  1600    14  3169 13122 19772   275 13281  1608  8125    73
    533   671   281  3693   440 49628   285   440    14 21637  8333    15
    496  8542  3282    13 13281  6022   272   310   253  9261   273  6041
   2583 32497   715 13122  1754   327 33631 10434   285  1524  3238    15
    380  1566   273 13281 19714   985  5644  4404   253 19797   273   247
    985   534  7729  5115  5054 30391    15]]


In [19]:
# Use the actual number of tokens as the limit, but never more than 2048.
token_count = tokenized_inputs["input_ids"].shape[1]
max_length = min(token_count, 2048)

In [20]:
# Tokenize the same text again, this time truncating to `max_length` tokens.
tokenized_inputs = tokenizer(text, max_length=max_length, truncation=True, return_tensors="np")

In [21]:
# Inspect the token ids produced by the truncating tokenizer call.
tokenized_inputs["input_ids"]

array([[ 4118, 19782,    27,   187,  1276,   310, 13281,  6022,   272,
           32,   187,   187,  4118, 37741,    27, 34330,   280, 19714,
          310,  2931,   347, 19714,   985,   534,   310,   275,   772,
        47872,   342,   253,  5968,    13,  5105,   375,   285,  1318,
          985,   273,  8033,   285, 17886,   407,   253,  9241, 10090,
         1066,   407, 13281,  1608,  8125,    73,    15,  9535,  1959,
        19714,   310,   247,  6891,  4473,  1850,  5341,   247,  1180,
          273, 19714, 13225,   390,  5871,   534,  3693,  1600,    15,
        13281, 19714,    13,   253,   625,  2087,  1307,    13,   310,
         1754,   417,   760,   281,  3693,  1600,    14,  3169, 13122,
        19772,   275, 13281,  1608,  8125,    73,   533,   671,   281,
         3693,   440, 49628,   285,   440,    14, 21637,  8333,    15,
          496,  8542,  3282,    13, 13281,  6022,   272,   310,   253,
         9261,   273,  6041,  2583, 32497,   715, 13122,  1754,   327,
      

### Tokenize the instruction dataset

In [22]:
def tokenize_function(examples, max_length=2048):
    """Tokenize the first example of a batch, keeping at most ``max_length`` tokens.

    Only index 0 of each column is read, so this is meant to be used with
    batches that contain a single example (e.g. ``batched=True, batch_size=1``).

    Args:
        examples: Mapping from column name to a list of values. The text is
            ``question`` + ``answer`` when both columns exist, otherwise
            ``input`` + ``output`` when both exist, otherwise ``text``.
        max_length: Maximum number of tokens to keep. Defaults to 2048, the
            limit that used to be hard-coded.

    Returns:
        The tokenizer output for that one example, requested as NumPy arrays.

    Side effects:
        Sets ``pad_token`` to ``eos_token`` and ``truncation_side`` to
        ``"left"`` on the notebook-level ``tokenizer``.
    """
    if "question" in examples and "answer" in examples:
        text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
        text = examples["input"][0] + examples["output"][0]
    else:
        text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    # Left-side truncation keeps the end of the text when it is too long.
    tokenizer.truncation_side = "left"

    # One truncating pass is enough: texts within the limit come back
    # untouched, longer ones are cut to `max_length`. A separate pass just to
    # measure the length and compute min(length, limit) gives the same result.
    return tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

In [23]:
# Load the JSON file as a dataset (single "train" split).
finetuning_dataset_loaded = datasets.load_dataset(
    "json",
    data_files=filename,
    split="train",
)

# batch_size=1 because tokenize_function reads only the first item of each
# batch it receives.
map_options = dict(batched=True, batch_size=1, drop_last_batch=True)
tokenized_dataset = finetuning_dataset_loaded.map(tokenize_function, **map_options)

print(tokenized_dataset)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Dataset({
    features: ['answer', 'question', 'input_ids', 'attention_mask'],
    num_rows: 24
})


In [24]:
# Copy the input ids into a "labels" column. The guard keeps this cell
# re-runnable: the column is only added when it is not already present.
if "labels" not in tokenized_dataset.column_names:
  tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

### Prepare test/train splits

In [25]:
# Hold out 10% of the rows as a test set; shuffling uses a fixed seed.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=123,
)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 21
    })
    test: Dataset({
        features: ['answer', 'question', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})


### Some datasets for you to try

In [26]:
# Load a ready-made dataset by its repository path. This rebinds
# `finetuning_dataset`, which earlier held a plain list of dicts.
finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = datasets.load_dataset(path=finetuning_dataset_path)
print(finetuning_dataset)

Downloading readme:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/615k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/140 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [27]:
# Repository paths of further example datasets to experiment with.
taylor_swift_dataset, bts_dataset, open_llms = (
    "lamini/taylor_swift",
    "lamini/bts",
    "lamini/open_llms",
)

In [28]:
# Load the Taylor Swift dataset and print the second training example.
dataset_swiftie = datasets.load_dataset(taylor_swift_dataset)
swiftie_train_split = dataset_swiftie["train"]
print(swiftie_train_split[1])

Downloading readme:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/87 [00:00<?, ? examples/s]

{'question': 'What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?', 'answer': 'Taylor Swift\'s "Shake It Off" is the most popular song among millennials. This song relates to the millennial generation as it is an anthem of self-acceptance and embracing one\'s individuality. The song\'s message of not letting others bring you down and to just dance it off resonates with the millennial culture, which is often characterized by a strong sense of individuality and a rejection of societal norms. Additionally, the song\'s upbeat and catchy melody makes it a perfect fit for the millennial generation, which is known for its love of pop music.', 'input_ids': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24

# How to push your own dataset to the Hugging Face Hub

```python
!pip install huggingface_hub
!huggingface-cli login
dataset_path_hf = "<your-username>/<your-dataset-name>"  # repository id to create on the Hub
split_dataset.push_to_hub(dataset_path_hf)
```