# Install All the Required Packages

In [1]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets

In [14]:
# pip install tensorboardX

In [2]:
import os
import random
from collections import Counter

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the instruction-backtranslation checkpoint and its matching tokenizer
# from the Hugging Face Hub.
model_name = "Arnab13/Llama2-7b-BackInstruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # request half-precision weights
    device_map="auto",  # leave device placement to the library
)

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:41<00:00, 20.62s/it]


## Data Pre-processing
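* Filter out any multi-turn examples.
* Keep only the responses whose token length lies between the 25th and 50th percentiles of all single-turn responses.
* Randomly sample a subset of 150 completions from the remaining LIMA examples.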

In [77]:
# Pull the LIMA dataset from the Hugging Face Hub.
dataset = load_dataset("GAIR/lima")

# Drop every row tagged as a multi-turn conversation.
train_data = list(filter(lambda row: row["source"] != "multi_turn", dataset['train']))

# For the remaining rows, the first conversation entry is used as the
# instruction and the second as the response.
conversations = [row['conversations'] for row in train_data]
instructions = [turns[0] for turns in conversations]
responses = [turns[1] for turns in conversations]
  
# Calculate the number of tokens  
def num_tokens(text):
    """Return how many token ids the notebook-level `tokenizer` produces for `text`.

    The count is the length of the first (only) sequence in the encoded
    batch, so it includes any special tokens the tokenizer inserts.
    """
    encoded = tokenizer(text, return_tensors="pt")
    first_sequence_ids = encoded["input_ids"][0]
    return len(first_sequence_ids)
  
# Token counts, position-aligned with `responses` and `instructions` respectively.
response_token_lengths = list(map(num_tokens, responses))
instruction_token_lengths = list(map(num_tokens, instructions))

In [80]:
# Token-length cut-offs: the values found one quarter and one half of the way
# through the sorted response lengths (first quartile and median positions).
sorted_token_lengths = list(response_token_lengths)
sorted_token_lengths.sort()
total_lengths = len(sorted_token_lengths)
lower_bound = sorted_token_lengths[total_lengths // 4]
upper_bound = sorted_token_lengths[total_lengths // 2]

In [81]:
# Keep the positions whose response length falls inside
# [lower_bound, upper_bound] (both ends inclusive), then draw 150 of them
# with a fixed seed so the sample is repeatable.
filtered_indices = [
    position
    for position in range(len(response_token_lengths))
    if not (
        response_token_lengths[position] < lower_bound
        or response_token_lengths[position] > upper_bound
    )
]
random.seed(42)
sampled_indices = random.sample(filtered_indices, 150)

In [87]:
# Generate one instruction per sampled response.
#
# The pipeline is built once, outside the loop: the model and tokenizer are
# the same for every sample, and only the length budget changes, so that is
# passed per call instead of re-creating the pipeline on each iteration.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

generated_instructions = []
for i in tqdm(sampled_indices):
    prompt = responses[i]
    # Total length budget: the response's own token count plus 80 extra
    # tokens for the [INST] wrapper and the generated instruction.
    max_length = response_token_lengths[i] + 80
    result = pipe(f"<s>[INST] {prompt} [/INST]", max_length=max_length)
    # The generated text echoes the prompt; keep only what follows the last [/INST].
    instruction = result[0]['generated_text'].split('[/INST]')[-1].strip()
    generated_instructions.append(instruction)

100%|██████████| 150/150 [04:21<00:00,  1.74s/it]


In [88]:
# Collect results into a list of dictionaries, one per sampled response.
#
# `generated_instructions` was filled in the same order as `sampled_indices`,
# so the two are paired positionally. The list is read, not drained with
# pop(0), so this cell can be re-run without emptying its input.
assert len(generated_instructions) == len(sampled_indices), (
    "expected exactly one generated instruction per sampled index"
)
filtered_data = [
    {
        'generated_instruction': instruction,
        'response': responses[idx],
    }
    for idx, instruction in zip(sampled_indices, generated_instructions)
]

In [94]:
# Tabular view of the collected records: one row per sampled response.
df = pd.DataFrame(data=filtered_data)

## Let's print out 5 examples of Generated Instructions

In [105]:
# Print the first five generated-instruction / response pairs as plain text.
instruction_column = df['generated_instruction']
response_column = df['response']
for example_no in range(1, 6):
    position = example_no - 1
    print('Example no. ' + str(example_no) + ' \n')
    print('Generated Instruction: ' + instruction_column.iloc[position])
    print('Response:' + response_column.iloc[position])
    print('\n')

Example no. 1 

Generated Instruction: I want to create a plot using Python. What is the best library to use?
Response:[Matplotlib](https://matplotlib.org/) is arguably the most popular graphing and data visualization library for Python. Below is a working example of using Matplotlib to draw a plot.

First of all, you’ll need to install the Matplotlib package. The easiest way to install matplotlib is to use pip. Type following command in terminal: 
```
pip install matplotlib
```

Now we are ready to draw a line graph, and here is an example of how we can achieve this.

```
# Importing the required module
import matplotlib.pyplot as plt

# Define x-axis and y-axis values
x_values = [1, 2, 3]
y_values = [2, 4, 1]

# Create a figure and a set of subplots
fig, ax = plt.subplots()

# Plot the points
ax.plot(x_values, y_values)

# Set x-axis label
ax.set_xlabel('X-Axis')

# Set y-axis label
ax.set_ylabel('Y-Axis')

# Set title
ax.set_title('My First Graph')

# Display the plot
plt.show()

``

In [89]:
# Wrap the collected records in a Hugging Face Dataset, going through an
# intermediate pandas DataFrame.
generated_frame = pd.DataFrame(filtered_data)
dataset = Dataset.from_pandas(generated_frame)

In [91]:
# Publish the generated-instruction dataset to the Hugging Face Hub.
hub_repo_id = "Arnab13/LIMA-Generated-Instruct"
dataset.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 480.28ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Arnab13/LIMA-Generated-Instruct/commit/7af0d1c3369da114f8a8b70798730a33fc5b3de2', commit_message='Upload dataset', commit_description='', oid='7af0d1c3369da114f8a8b70798730a33fc5b3de2', pr_url=None, pr_revision=None, pr_num=None)