In [71]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import json
from tqdm import tqdm
from huggingface_hub import snapshot_download
from pathlib import Path
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [4]:
# Step 1: Load the tokenizer and the causal-LM weights.
# NOTE(review): no quantization argument is passed to from_pretrained in this
# cell, so nothing here requests quantized weights despite the earlier wording.
model_name = "mistralai/Mistral-7B-v0.3"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The whole model is placed on the single device selected above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    trust_remote_code=True
)

In [69]:
# Step 2: Create the new directories
def setup_directories():
    """Create the ``data/lima`` and ``data/oasst1`` output folders.

    Folders that already exist are left untouched, so the function can be
    called repeatedly.
    """
    for folder in ("data/lima", "data/oasst1"):
        os.makedirs(folder, exist_ok=True)


setup_directories()

In [72]:
# Step 2: Load and process LIMA dataset

# Load the "GAIR/lima" dataset and keep only its "train" split; any other
# split returned by load_dataset is discarded here.
dataset_lima = load_dataset("GAIR/lima")["train"]

# Process LIMA dataset
def process_lima(dataset):
    """Turn LIMA records into a two-column prompt/response DataFrame.

    Only the first two entries of each record's ``"conversations"`` sequence
    are used: index 0 becomes ``prompt`` and index 1 becomes ``response``.
    Any later turns are ignored.

    Args:
        dataset: Iterable of records, each exposing a ``"conversations"``
            sequence with at least two entries.

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """
    rows = [
        {"prompt": turns[0], "response": turns[1]}
        for turns in (record["conversations"] for record in dataset)
    ]
    return pd.DataFrame(rows)

# Split and save the dataset
def split_and_save_lima(df, output_dir):
    """Split ``df`` into train/val/test parts and write each as JSON Lines.

    20% of the rows are held out first, then that hold-out is halved into the
    validation and test parts. Both splits use ``random_state=42``.

    Args:
        df: DataFrame to split.
        output_dir: Directory that receives the three output files. It is not
            created here.
    """
    train_part, holdout = train_test_split(df, test_size=0.2, random_state=42)
    val_part, test_part = train_test_split(holdout, test_size=0.5, random_state=42)

    outputs = (
        ("lima_processed_train.json", train_part),
        ("lima_processed_val.json", val_part),
        ("lima_processed_test.json", test_part),
    )
    for file_name, part in outputs:
        target = os.path.join(output_dir, file_name)
        # One JSON object per line, non-ASCII characters written as-is.
        part.to_json(target, orient="records", lines=True, force_ascii=False)

# Flatten LIMA into prompt/response rows, then write the train/val/test files
# into data/lima (the directory is created by setup_directories).
lima_df = process_lima(dataset_lima)
split_and_save_lima(lima_df, "data/lima")

In [73]:
# Step 3: Load and process OpenAssistant Conversations Dataset
# Parquet file paths, keyed by split name. process_and_save_oasst1 appends
# them to the "hf://datasets/OpenAssistant/oasst1/" prefix.
# NOTE(review): the file names embed a hash; presumably they must match the
# files currently published in the dataset repository -- verify.
splits = {
    'train': 'data/train-00000-of-00001-b42a775f407cee45.parquet', 
    'validation': 'data/validation-00000-of-00001-134b8fd0c89408b6.parquet'
}

# Function to process and save the dataset
def process_and_save_oasst1(split, output_file):
    """Build prompt/response pairs from one OASST1 split and save them as JSON.

    Every assistant message is paired with the text of the message that its
    ``parent_id`` points to. Assistant messages whose parent is not present in
    the split, or whose parent has no text, are dropped.

    Args:
        split: Path of the parquet file, appended to the
            ``hf://datasets/OpenAssistant/oasst1/`` prefix.
        output_file: Destination path. The output is a single indented JSON
            array of ``{"response": ..., "prompt": ...}`` records with
            non-ASCII characters written as-is.
    """
    messages = pd.read_parquet("hf://datasets/OpenAssistant/oasst1/" + split)

    # Build one id -> text lookup instead of re-scanning the whole frame for
    # every assistant row. Keeping the first occurrence of an id mirrors
    # "take the first matching row".
    text_by_id = (
        messages.drop_duplicates(subset="message_id")
        .set_index("message_id")["text"]
    )

    replies = messages.loc[messages["role"] == "assistant", ["parent_id", "text"]]
    pairs = (
        replies.rename(columns={"text": "response"})
        # Parents missing from the lookup map to NaN and are removed below.
        .assign(prompt=lambda frame: frame["parent_id"].map(text_by_id))
        .dropna(subset=["prompt"])
        .drop(columns=["parent_id"])
        .reset_index(drop=True)
    )

    pairs.to_json(output_file, orient="records", lines=False, force_ascii=False, indent=4)

# Export both OASST1 splits as prompt/response JSON files under data/oasst1.
# NOTE(review): the "validation" split is written to a file named "*_test.json"
# -- confirm that the validation data is meant to serve as the test set.
process_and_save_oasst1(splits["train"], "data/oasst1/oasst1_processed_train.json")
process_and_save_oasst1(splits["validation"], "data/oasst1/oasst1_processed_test.json")

In [17]:
# Generate a model response for every prompt and store the results as JSON Lines.

# `dataset` was not defined anywhere in this notebook, so the cell only worked
# with leftover kernel state. Rebuild it from the JSON Lines files written by
# split_and_save_lima.
# NOTE(review): confirm that LIMA (rather than the OASST1 export) is the
# intended source, and which split should be scored.
dataset = load_dataset(
    "json",
    data_files={
        "train": "data/lima/lima_processed_train.json",
        "validation": "data/lima/lima_processed_val.json",
        "test": "data/lima/lima_processed_test.json",
    },
)

output_file = "model_responses.json"
# Budget for *generated* tokens only. The previous max_length=256 also counted
# the prompt, leaving long prompts with little or no room for a response.
max_new_tokens = 256

with open(output_file, 'w', encoding='utf-8') as f_out:
    for sample in tqdm(dataset['train']):
        prompt = sample['prompt']

        # Calling the tokenizer (instead of .encode) also yields the attention
        # mask, which is forwarded to generate() below.
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[-1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
            # Explicit padding id for generate(); reuses the end-of-sequence token.
            pad_token_id=tokenizer.eos_token_id,
        )

        # The output sequence starts with the prompt tokens. Drop them at the
        # token level, because slicing the decoded string by len(prompt) breaks
        # whenever decoding does not reproduce the prompt character-for-character.
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        json_obj = {
            "prompt": prompt,
            "response": response
        }

        # ensure_ascii=False keeps non-ASCII text readable, matching the
        # force_ascii=False used by the other writers in this notebook.
        f_out.write(json.dumps(json_obj, ensure_ascii=False) + '\n')

In [None]:


def convert_to_common_format(dataset, source):
    """Convert records from a supported dataset into prompt/response dicts.

    Args:
        dataset: Iterable of records. For ``"OASST1"`` each record is read
            through ``record["turns"]``, a sequence of dicts with ``"role"``
            and ``"text"``; all turns but the last are joined into the prompt
            as ``"Role: text"`` lines and the last turn's text is the response.
            For ``"LIMA"`` the first two entries of
            ``record["conversations"]`` are the prompt and the response.
        source: Either ``"OASST1"`` or ``"LIMA"``.

    Returns:
        A list with one ``{"prompt": ..., "response": ...}`` dict per record.

    Raises:
        ValueError: If a record is processed while ``source`` is not one of
            the supported names. An empty ``dataset`` still returns ``[]``.
    """
    formatted_data = []
    for record in dataset:
        if source == "OASST1":
            turns = record["turns"]
            prompt = "\n".join(
                f"{turn['role'].capitalize()}: {turn['text']}" for turn in turns[:-1]
            )
            response = turns[-1]["text"]
        elif source == "LIMA":
            conversation = record["conversations"]
            prompt, response = conversation[0], conversation[1]
        else:
            # Without this branch the append below failed with an
            # UnboundLocalError that gave no hint about the real cause.
            raise ValueError(
                f"Unsupported source {source!r}; expected 'OASST1' or 'LIMA'"
            )
        formatted_data.append({"prompt": prompt, "response": response})
    return formatted_data

def process_oasst1(record):
    """Placeholder for per-record OASST1 processing; not implemented yet.

    The definition had no body, which made the whole cell a syntax error.
    Raising keeps the cell runnable while making accidental calls fail loudly.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("process_oasst1 has not been implemented")








In [None]:
import pandas as pd

# Load the file (assuming it is in CSV or JSON format)
# Replace with the correct path to the structured dataset
data = pd.read_csv("oasst_dataset.csv")  # Switch to `pd.read_json` if the file is JSON

# Keep only the rows of interest
filtered_data = data[data['role'].isin(['prompter', 'assistant'])]

# Build prompt-response pairs
# NOTE(review): pairing relies purely on row order -- an assistant row is
# matched with the most recent prompter row above it. Verify that the input
# file is ordered that way.
pairs = []
current_prompt = None

for _, row in filtered_data.iterrows():
    if row['role'] == 'prompter':
        # Remember the current prompt
        current_prompt = row['text']
    # The truthiness check skips assistant rows when no prompt is pending or
    # the pending prompt is an empty string.
    elif row['role'] == 'assistant' and current_prompt:
        # Create a prompt/response pair
        pairs.append({
            "prompt": current_prompt,
            "response": row['text']
        })
        current_prompt = None  # Reset the current prompt once its response has been found

# Convert the pairs to a DataFrame
pairs_df = pd.DataFrame(pairs)

# Save the processed dataset
pairs_df.to_csv("processed_oasst_dataset.csv", index=False)
