# Data preparation

In [1]:
# Mount Google Drive into the Colab filesystem so that files under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### imports

In [2]:
import pandas as pd
import numpy as np
from datasets import DatasetDict, Dataset

### load data

In [3]:
# Load the raw preference pairs. The preview below shows one row per video idea with
# two candidate titles (title_a, title_b) and a title_b_preferred flag.
df = pd.read_csv('/content/drive/MyDrive/LLM Practice/DPO/idea-title_pairs-preferences.csv')

In [4]:
# Preview the first rows to check that the expected columns were loaded
df.head()

Unnamed: 0,idea,title_a,title_b,title_b_preferred
0,text embedding models,Top 3 Text Embedding Models Every Data Scienti...,Text Embedding Models: From Theory to Practice,1
1,multimodal RAG,Master Multimodal RAG in Just 10 Minutes,Why Multimodal RAG Is the Next Big Thing in AI,0
2,model pruning,The Surprising Benefits of Model Pruning for B...,Reduce Model Size & Boost Performance with Pru...,1
3,"Fractals, self similarity, and crinkliness",The Beauty of Fractals: Exploring Infinite Cri...,The Math Behind Fractals: Crinkliness and Beyond,1
4,Fine-tuning LLMs for computer use,The Ultimate Guide to LLM Fine-Tuning,Boost Your LLM Skills: Fine-Tuning for Compute...,0


### Prompt to create titles

In [5]:
def template(idea: str) -> str:
    """Render the title-generation instruction for one video idea.

    Args:
        idea: Short description of the video idea; inserted verbatim after
            the ``**Video Idea**:`` label.

    Returns:
        The full instruction text asking for a single engaging title.
    """
    # The continuation lines stay flush-left on purpose: any indentation inside the
    # triple-quoted string would become part of the prompt text.
    return f"""Given the YouTube video idea write an engaging title.

**Video Idea**: {idea}

**Additional Guidance**:
- Title should be between 30 and 75 characters long
- Only return the title idea, nothing else!"""

In [6]:
def idea_to_prompt(idea):
    """Wrap a video idea as a single-turn chat prompt.

    The idea is lower-cased, rendered through ``template`` and returned as a
    one-element list holding a ``user`` message dict.
    """
    user_turn = {"role": "user", "content": template(idea.lower())}
    return [user_turn]

In [7]:
# build the chat-formatted prompt for every idea
df['prompt'] = df['idea'].map(idea_to_prompt)

### create chosen and rejected responses

In [8]:
def title_to_completion(title):
    """Wrap a title as a one-message chat completion.

    Returns a one-element list holding an ``assistant`` message dict whose
    content is the given title, unchanged.
    """
    assistant_turn = dict(role="assistant", content=title)
    return [assistant_turn]

In [9]:
# Wrap both candidate titles as assistant completions, then route each one to
# `chosen` or `rejected` according to the preference flag.
prefers_b = df['title_b_preferred'] == 1
completion_a = df['title_a'].apply(title_to_completion)
completion_b = df['title_b'].apply(title_to_completion)

df['chosen'] = np.where(prefers_b, completion_b, completion_a)
df['rejected'] = np.where(prefers_b, completion_a, completion_b)

In [11]:
# Preview the three columns that make up the preference dataset. They are selected
# by name so the preview does not depend on where the columns sit in the frame.
df[['prompt', 'chosen', 'rejected']].head()

Unnamed: 0,prompt,chosen,rejected
0,"[{'role': 'user', 'content': 'Given the YouTub...","[{'role': 'assistant', 'content': 'Text Embedd...","[{'role': 'assistant', 'content': 'Top 3 Text ..."
1,"[{'role': 'user', 'content': 'Given the YouTub...","[{'role': 'assistant', 'content': 'Master Mult...","[{'role': 'assistant', 'content': 'Why Multimo..."
2,"[{'role': 'user', 'content': 'Given the YouTub...","[{'role': 'assistant', 'content': 'Reduce Mode...","[{'role': 'assistant', 'content': 'The Surpris..."
3,"[{'role': 'user', 'content': 'Given the YouTub...","[{'role': 'assistant', 'content': 'The Math Be...","[{'role': 'assistant', 'content': 'The Beauty ..."
4,"[{'role': 'user', 'content': 'Given the YouTub...","[{'role': 'assistant', 'content': 'The Ultimat...","[{'role': 'assistant', 'content': 'Boost Your ..."


In [12]:
# write data to file
# NOTE(review): to_csv writes the row index by default; pass index=False if readers of
# this file should not get an extra unnamed leading column — confirm with consumers.
# NOTE(review): prompt/chosen/rejected hold Python lists of dicts, so they are presumably
# stored as their string representation — verify before reloading this CSV.
df.to_csv('/content/drive/MyDrive/LLM Practice/DPO/preferences.csv')

### create train-valid split

In [13]:
# Shuffle the rows reproducibly (fixed seed) and drop the original row order.
# The dataset columns are selected by name rather than by trailing position, so the
# split stays correct even if other columns are added to `df` before this cell runs.
df_shuffled = (
    df[['prompt', 'chosen', 'rejected']]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 90-10 split: the first 90% of the shuffled rows (rounded down) form the train set
train_size = int(0.9 * len(df_shuffled))

# slice accordingly: everything after the train rows is held out for validation
df_train = df_shuffled.iloc[:train_size]
df_valid = df_shuffled.iloc[train_size:]

In [14]:
# Convert each pandas split into a Hugging Face Dataset
train_ds, valid_ds = (Dataset.from_pandas(frame) for frame in (df_train, df_valid))

# Bundle the two splits under their split names
dataset_dict = DatasetDict({'train': train_ds, 'valid': valid_ds})

# Push dataset to the Hugging Face Hub

In [15]:
# Install the libraries used to build and upload the dataset.
# NOTE(review): `datasets` is already imported in the imports cell near the top of this
# notebook, so on a fresh runtime this install would have to run before that import —
# consider moving it to the top.
! pip install datasets huggingface_hub



In [17]:
# Authenticate with the Hugging Face Hub; the CLI prompts interactively for an access token.
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `topicmodeling` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `topi

In [18]:
# Push the 'train' and 'valid' splits held in dataset_dict to the Hugging Face Hub
# under the repo id "EliasHossain/youtube-titles-dpo".
dataset_dict.push_to_hub("EliasHossain/youtube-titles-dpo")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]