# Scraping from sunnah.com

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch (or reuse) one book page and extract every hadith it contains
def get_hadiths_from_book(url, html=None, timeout=30):
    """Scrape all hadiths from a single sunnah.com book page.

    Args:
        url: Book URL such as ``https://sunnah.com/bukhari/1``. The path
            segment before the book number becomes the 'Rawi' value.
        html: Optional page body that was already downloaded. When given,
            no HTTP request is made, so a caller that has just fetched the
            page does not download it a second time.
        timeout: Seconds to wait for the server when the page has to be
            downloaded here.

    Returns:
        A list of dicts with the keys 'Rawi', 'Chapter', 'Reference',
        'Narator' and 'Hadiths Text', one per hadith container on the page.
        Elements missing from the page are recorded as "N/A".
    """
    print(f"Mengakses URL buku: {url}")
    if html is None:
        # Bound the wait so a stalled connection cannot hang the scrape.
        html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, 'html.parser')

    def text_of(parent, css_class, default="N/A"):
        # Stripped text of the first matching <div>, or `default` if absent.
        node = parent.find('div', class_=css_class)
        return node.text.strip() if node else default

    # Book title
    book_title = text_of(soup, 'book_page_english_name', "Judul Tidak Ditemukan")
    print(f"Judul buku ditemukan: {book_title}")

    # Every hadith on the page lives in its own container
    hadith_containers = soup.find_all('div', class_='actualHadithContainer')
    print(f"Menemukan {len(hadith_containers)} hadis dalam buku ini.")

    # The collection name is the same for every hadith of the page, so it is
    # derived once; rstrip keeps a trailing slash from shifting the segments.
    rawi = url.rstrip('/').split('/')[-2].capitalize()

    hadiths_data = [
        {
            'Rawi': rawi,  # Collection name taken from the URL
            'Chapter': book_title,  # Book title
            'Reference': text_of(container, 'hadith_reference_sticky'),
            'Narator': text_of(container, 'hadith_narrated'),
            'Hadiths Text': text_of(container, 'text_details'),
        }
        for container in hadith_containers
    ]

    print(f"Berhasil mengumpulkan {len(hadiths_data)} hadis dari buku ini.\n")
    return hadiths_data

# Walk the numbered books of one collection until a book page is missing
def get_books_from_author(base_url, timeout=30):
    """Scrape every book of one collection.

    Book pages are requested as ``{base_url}/1``, ``{base_url}/2``, ... and
    the walk stops at the first response whose status code is not 200.

    Args:
        base_url: Collection URL such as ``https://sunnah.com/bukhari``.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A flat list of hadith dicts (see ``get_hadiths_from_book``) covering
        all books that were found.
    """
    books_data = []
    book_number = 1

    print(f"\nMemulai scraping untuk periwayat: {base_url}")

    while True:
        url = f"{base_url}/{book_number}"
        print(f"Mengecek URL buku: {url}")
        response = requests.get(url, timeout=timeout)

        # A non-200 answer is taken to mean there are no more books
        if response.status_code != 200:
            print(f"Buku dengan nomor {book_number} tidak ditemukan. Menghentikan proses untuk periwayat ini.\n")
            break

        # Parse the body we already have instead of downloading it again
        books_data.extend(get_hadiths_from_book(url, html=response.content))
        book_number += 1

    print(f"Total {len(books_data)} hadis ditemukan untuk periwayat {base_url}.\n")
    return books_data

# Collection URLs to scrape
authors_urls = [
    "https://sunnah.com/bukhari",
    "https://sunnah.com/muslim",
    "https://sunnah.com/nasai",
    "https://sunnah.com/abudawud",
    "https://sunnah.com/ibnmajah",
    "https://sunnah.com/malik",
    "https://sunnah.com/ahmad",
    "https://sunnah.com/riyadussalihin",
    "https://sunnah.com/adab",
    "https://sunnah.com/shamail",
    "https://sunnah.com/mishkat",
    "https://sunnah.com/hisn"
]

# Accumulate the hadiths of every collection into one flat list
all_hadiths = []
for author_url in authors_urls:
    print(f"\n=== Memulai scraping untuk periwayat: {author_url} ===")
    all_hadiths += get_books_from_author(author_url)

# Tabulate the records with a fixed column order
output_columns = ['Rawi', 'Chapter', 'Reference', 'Narator', 'Hadiths Text']
df = pd.DataFrame(all_hadiths, columns=output_columns)

# Write the table to disk and report how many rows were stored
df.to_csv('hadiths_data.csv', index=False, encoding='utf-8')
print("\n=== Proses scraping selesai ===")
print(f"Total {len(df)} hadis berhasil dikumpulkan dan disimpan ke dalam file hadiths_data.csv.")

# Data pre-processing

The data is limited to roughly 1,000 rows (85 sampled hadiths per collection) because of Gemini API call limits for synthetic data generation.

In [None]:
import pandas as pd

# Upper bound on the number of hadiths kept per collection ('Rawi')
SAMPLE_PER_RAWI = 85

df = pd.read_csv('hadiths_data.csv')

# Collapse every run of whitespace (newlines, tabs, carriage returns, spaces)
# into a single space; str.split() with no argument already splits on all of
# them, so no separate replace() calls are needed.
df["Hadiths Text"] = df["Hadiths Text"].astype(str).apply(lambda x: " ".join(x.split()))

# Per-collection row counts (only shown when run on its own in a cell)
df.groupby('Rawi').count()

# Draw a reproducible sample from each collection. The sample size is capped
# at the group size because DataFrame.sample raises ValueError when asked for
# more rows than the group contains.
df = df.groupby('Rawi').apply(
    lambda x: x.sample(n=min(len(x), SAMPLE_PER_RAWI), random_state=1)
).reset_index(drop=True)
df

# Generate ID, Extract Hadith's Number

In [None]:
import hashlib

# Build a deterministic identifier from a row's content
def generate_id(row):
    """Return the MD5 hex digest of the row's five content fields.

    The values of 'Rawi', 'Chapter', 'Reference', 'Narator' and
    'Hadiths Text' are rendered as text, joined with underscores and hashed,
    so rows with equal content always receive the same ID.
    """
    field_names = ('Rawi', 'Chapter', 'Reference', 'Narator', 'Hadiths Text')
    payload = "_".join(f"{row[name]}" for name in field_names)
    digest = hashlib.md5(payload.encode())
    return digest.hexdigest()

# Compute the content-derived 'ID' for every row
df['ID'] = df.apply(generate_id, axis=1)

# Put 'ID' first and leave the remaining columns in their current order
remaining_columns = [name for name in df.columns if name != 'ID']
columns = ['ID'] + remaining_columns
df = df[columns]

# Write the table to disk, then preview its first rows
df.to_csv('hadiths_data_with_questions_final.csv', index=False)
df.head()

In [None]:
import numpy as np
import pandas as pd

# Extract the trailing hadith number from a 'Reference' value
def extract_hadith_number(reference):
    """Return the last whitespace-separated token of a reference.

    Args:
        reference: Reference text such as ``"Sahih al-Bukhari 1"``. Missing
            values (NaN/None) are accepted; other non-string values are
            converted to text first.

    Returns:
        The final token as a string, or ``np.nan`` when the reference is
        missing, empty or contains only whitespace.
    """
    if pd.isna(reference):
        return np.nan
    tokens = str(reference).split()
    # A blank reference yields no tokens; report it as missing rather than
    # failing with IndexError on tokens[-1].
    return tokens[-1] if tokens else np.nan

# Derive 'Hadith Number' from each row's reference text
df['Hadith Number'] = df['Reference'].apply(extract_hadith_number)

# Re-insert 'Hadith Number' so that it becomes the 5th column (index 4)
columns = [name for name in df.columns.tolist() if name != 'Hadith Number']
columns.insert(4, 'Hadith Number')
df = df[columns]

# Show the reordered DataFrame
df

# Generate Question and Answer using GEMINI API

In [None]:
import numpy as np
import google.generativeai as genai
import pandas as pd
import os
import time
import logging

# --- Configuration ---

# Send log records to a file with a timestamp and level on every line
logging.basicConfig(
    filename='generation_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Upper bound on the number of hadiths sampled per collection ('Rawi')
SAMPLE_PER_RAWI = 85

# --- Data Loading and Preprocessing ---

# Load the dataset. Fatal problems are logged and then abort through
# SystemExit with a message (non-zero status); a bare exit() would report
# success and is intended for interactive shells only.
try:
    df = pd.read_csv('hadiths_data.csv')
    logging.info("Successfully loaded 'hadiths_data.csv'")
except FileNotFoundError:
    logging.error("Error: 'hadiths_data.csv' not found.  Please ensure the file exists.")
    raise SystemExit("'hadiths_data.csv' not found; details in generation_log.txt")
except Exception as e:
    logging.error(f"Error loading 'hadiths_data.csv': {e}")
    raise SystemExit(f"Could not load 'hadiths_data.csv': {e}")


# Collapse all whitespace runs in 'Hadiths Text' to single spaces
# (str.split() with no argument already splits on newlines, tabs and CRs)
try:
    df["Hadiths Text"] = df["Hadiths Text"].astype(str).apply(
        lambda x: " ".join(x.split()))
    logging.info("Successfully preprocessed 'Hadiths Text' column.")
except KeyError as e:
    logging.error(f"Error: Missing column '{e}' in the dataset. Please check the column names.")
    raise SystemExit(f"Missing column {e} in 'hadiths_data.csv'")
except Exception as e:
    logging.error(f"Error preprocessing 'Hadiths Text' column: {e}")
    raise SystemExit(f"Could not preprocess 'Hadiths Text': {e}")

# Sample a bounded number of rows per collection to limit API usage.
# The sample size is capped at the group size: asking for more rows than a
# group holds raises ValueError, which the handler below would swallow and
# thereby leave the full, unsampled dataset in `df`.
try:
    df = df.groupby('Rawi').apply(
        lambda x: x.sample(n=min(len(x), SAMPLE_PER_RAWI), random_state=1)
    ).reset_index(drop=True)
    logging.info("Successfully sampled data by 'Rawi'.")
except KeyError:
    logging.warning("Column 'Rawi' not found. Sampling by 'Rawi' will be skipped.")
except Exception as e:
    # Best effort: continue with the unsampled data
    logging.warning(f"Error sampling data by 'Rawi': {e}. Sampling step will be skipped.")

# --- Gemini API Setup ---

# The API key is read from the environment; without it nothing can be generated
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
if not GOOGLE_API_KEY:
    logging.error("Error: GEMINI_API_KEY environment variable not set.")
    raise SystemExit("GEMINI_API_KEY environment variable not set")

genai.configure(api_key=GOOGLE_API_KEY)

# Create the Gemini model handle used by the generation function
try:
    model = genai.GenerativeModel("gemini-1.5-flash")  # Or "gemini-1.5-pro" if available
    logging.info("Successfully loaded Gemini model.")
except Exception as e:
    logging.error(f"Error loading Gemini model: {e}")
    raise SystemExit(f"Could not load Gemini model: {e}")

# --- Question Generation Function ---

def generate_questions_batch(hadiths, indices, retries=3):
    """
    Ask the Gemini model for one question per hadith in a batch.

    Each hadith is tried up to `retries` times, with a 5-second pause
    between failed attempts.

    Args:
        hadiths (list): Hadith texts to generate questions for.
        indices (list): Row indices paired with `hadiths`; used in log and
            debug output only.
        retries (int): Maximum number of attempts per hadith.

    Returns:
        list: One entry per processed hadith: the generated question, or an
        "Error after N attempts: ..." message when every attempt failed.
    """
    results = []
    for text, row_id in zip(hadiths, indices):
        for attempt in range(1, retries + 1):
            try:
                prompt = f"""Generate a question in interrogative form, based on the following hadith as the answer:\n\n{text}\n\n"""
                generation_config = genai.types.GenerationConfig(
                    max_output_tokens=128,
                    temperature=0.8
                )
                reply = model.generate_content(prompt, generation_config=generation_config)

                if reply:
                    question = reply.text.strip()
                else:
                    question = "Error: No response"
                print(f"\n[DEBUG] Row {row_id}")
                print(f"[DEBUG] Generated Question: {question}\n")
                results.append(question)
                logging.info(f"Row {row_id} - Generated question: {question[:50]}...")
                break  # Success: no further attempts for this hadith
            except Exception as e:
                error_message = str(e)
                logging.error(f"Row {row_id} - Attempt {attempt} - Error: {error_message}")
                if attempt == retries:
                    # Out of attempts: record the failure in place of a question
                    results.append(f"Error after {retries} attempts: {error_message}")
                else:
                    time.sleep(5)  # Pause before the next attempt

    return results

# --- Main Processing Loop ---

# Batch parameters and the per-row result buffer
batch_size = 7  # Rows sent to the generator per batch
checkpoint_interval = 2  # A checkpoint is written every this many batches
questions = [""] * len(df)  # One slot per DataFrame row

for batch_number, i in enumerate(range(0, len(df), batch_size)):
    # Rows [i, batch_end) form the current batch
    batch_end = min(i + batch_size, len(df))
    batch_indices = list(range(i, batch_end))
    batch_hadiths = df['Hadiths Text'][i:i + batch_size].tolist()

    # Generate the questions and store each one in its row's slot
    batch_questions = generate_questions_batch(batch_hadiths, batch_indices)
    for idx, question in zip(batch_indices, batch_questions):
        questions[idx] = question

    # Pause for a random 8-10 seconds between batches
    sleep_duration = np.random.uniform(8, 10)
    print(f"Sleeping for {sleep_duration:.2f} seconds...")
    time.sleep(sleep_duration)

    # Periodic checkpoint (never on the first batch)
    if i > 0 and batch_number % checkpoint_interval == 0:
        df['Sample Question'] = questions
        try:
            df.to_csv('hadiths_data_with_questions_checkpoint.csv', index=False)
            logging.info(f"Checkpoint saved at batch {batch_number}. Rows processed: {i}")
        except Exception as e:
            logging.error(f"Error saving checkpoint at batch {batch_number}: {e}")

# Attach the questions and write the final file
df['Sample Question'] = questions
try:
    df.to_csv('hadiths_data_with_questions_final.csv', index=False)
    logging.info("Final results saved to 'hadiths_data_with_questions_final.csv'")
except Exception as e:
    logging.error(f"Error saving final results: {e}")


print("Question generation complete.")

In [None]:
import numpy as np
import google.generativeai as genai
import pandas as pd
import os
import time
import logging

# --- Configuration ---

# Send log records to a file with a timestamp and level on every line
logging.basicConfig(
    filename='generation_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# --- API Key Setup ---

# The API key is read from the environment. Fatal problems in this cell are
# logged and then abort through SystemExit with a message (non-zero status);
# a bare exit() would report success and is meant for interactive shells.
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
if not GOOGLE_API_KEY:
    logging.error("Error: GEMINI_API_KEY environment variable not set.")
    raise SystemExit("GEMINI_API_KEY environment variable not set")

genai.configure(api_key=GOOGLE_API_KEY)

# --- Model Loading ---

# Create the Gemini model handle used by the generation function
try:
    model = genai.GenerativeModel("gemini-1.5-flash")  # Or "gemini-1.5-pro" if available
    logging.info("Successfully loaded Gemini model.")
except Exception as e:
    logging.error(f"Error loading Gemini model: {e}")
    raise SystemExit(f"Could not load Gemini model: {e}")

# --- Data Loading and Preprocessing ---

# Load the dataset.
# NOTE(review): this reads the raw scrape, which is written without a
# 'Sample Question' column, so every row ends up needing a question. If the
# intent is to resume an earlier run, the checkpoint/final CSV is presumably
# the file to load here — confirm.
try:
    df = pd.read_csv('hadiths_data.csv')
    logging.info("Successfully loaded 'hadiths_data.csv'")
except FileNotFoundError:
    logging.error("Error: 'hadiths_data.csv' not found. Please ensure the file exists.")
    raise SystemExit("'hadiths_data.csv' not found; details in generation_log.txt")
except Exception as e:
    logging.error(f"Error loading 'hadiths_data.csv': {e}")
    raise SystemExit(f"Could not load 'hadiths_data.csv': {e}")

# Make sure the 'Sample Question' column exists; new columns start empty
if 'Sample Question' not in df.columns:
    df['Sample Question'] = ""
    logging.warning("Column 'Sample Question' not found. Created and initialized with empty strings.")

# Collapse all whitespace runs in 'Hadiths Text' to single spaces
# (str.split() with no argument already splits on newlines, tabs and CRs;
# re-applying this to already-clean text leaves it unchanged)
try:
    df["Hadiths Text"] = df["Hadiths Text"].astype(str).apply(
        lambda x: " ".join(x.split()))
    logging.info("Successfully preprocessed 'Hadiths Text' column.")
except KeyError as e:
    logging.error(f"Error: Missing column '{e}' in the dataset. Please check the column names.")
    raise SystemExit(f"Missing column {e} in 'hadiths_data.csv'")
except Exception as e:
    logging.error(f"Error preprocessing 'Hadiths Text' column: {e}")
    raise SystemExit(f"Could not preprocess 'Hadiths Text': {e}")

# --- Question Generation Function ---

def generate_questions_batch(hadiths, indices, retries=3):
    """
    Produce one question per hadith by prompting the Gemini model.

    A hadith whose request raises is retried after a 5-second pause until
    `retries` attempts have been made.

    Args:
        hadiths (list): Hadith texts to generate questions for.
        indices (list): DataFrame indices paired with `hadiths`; they only
            appear in log and debug output.
        retries (int): Maximum number of attempts per hadith.

    Returns:
        list: For each processed hadith either the generated question or an
        "Error after N attempts: ..." message.
    """
    collected = []
    for hadith_text, row_index in zip(hadiths, indices):
        for attempt_no in range(1, retries + 1):
            try:
                prompt = f"""Generate a question in interrogative form, based on the following hadith as the answer:\n\n{hadith_text}\n\n"""
                config = genai.types.GenerationConfig(
                    max_output_tokens=128,
                    temperature=0.8
                )
                response = model.generate_content(prompt, generation_config=config)

                if response:
                    generated = response.text.strip()
                else:
                    generated = "Error: No response"
                print(f"\n[DEBUG] Row {row_index}")
                print(f"[DEBUG] Generated Question: {generated}\n")
                collected.append(generated)
                logging.info(f"Row {row_index} - Generated question: {generated[:50]}...")
                break  # Done with this hadith
            except Exception as e:
                error_message = str(e)
                logging.error(f"Row {row_index} - Attempt {attempt_no} - Error: {error_message}")
                if attempt_no == retries:
                    # Last attempt failed: keep the error text as the result
                    collected.append(f"Error after {retries} attempts: {error_message}")
                else:
                    time.sleep(5)  # Wait before trying again

    return collected

# --- Main Processing Loop ---

# Set batch size and other parameters
batch_size = 7  # Adjust based on API limits and performance
checkpoint_interval = 2  # Save progress every X batches

# Text that generate_questions_batch stores instead of a question when
# generation fails; such rows are retried rather than treated as done.
FAILED_QUESTION_PATTERN = r"Error after \d+ attempts:|Error: No response"

# A column read from CSV with only empty cells is float-typed (all NaN);
# make it object-typed so question strings can be assigned into it.
df['Sample Question'] = df['Sample Question'].astype(object)

# Rows that still need a question: missing, empty, or a recorded failure
question_col = df['Sample Question']
is_missing = question_col.isnull() | (question_col == "")
is_failed = question_col.astype(str).str.match(FAILED_QUESTION_PATTERN)
null_indices = df[is_missing | is_failed].index.tolist()

# Start processing only if there are rows to process
if null_indices:
    print(f"Generating questions for {len(null_indices)} hadiths...")
    for i in range(0, len(null_indices), batch_size):
        # Create a batch of hadiths and indices
        batch_indices = null_indices[i:i + batch_size]
        batch_hadiths = df.loc[batch_indices, 'Hadiths Text'].tolist()

        # Generate questions for the batch
        batch_questions = generate_questions_batch(batch_hadiths, batch_indices)

        # Write each generated question back to its row
        for idx, question in zip(batch_indices, batch_questions):
            df.loc[idx, 'Sample Question'] = question

        # Pause for a random 9-10 seconds between batches
        sleep_duration = np.random.uniform(9, 10)
        print(f"Sleeping for {sleep_duration:.2f} seconds...")
        time.sleep(sleep_duration)

        # Periodic checkpoint (never on the first batch)
        if (i // batch_size) % checkpoint_interval == 0 and i > 0:
            try:
                df.to_csv('hadiths_data_with_questions_checkpoint.csv', index=False)
                logging.info(f"Checkpoint saved at batch {i // batch_size}. Rows processed: {i}")
            except Exception as e:
                logging.error(f"Error saving checkpoint at batch {i // batch_size}: {e}")
else:
    print("No rows with missing questions found. Skipping question generation.")
    logging.info("No rows with missing questions found. Skipping question generation.")
# --- Save Results ---
try:
    df.to_csv('hadiths_data_with_questions_final.csv', index=False)
    logging.info("Final results saved to 'hadiths_data_with_questions_final.csv'")
except Exception as e:
    logging.error(f"Error saving final results: {e}")
# --- Display Result (Optional) ---
print("\n=== Final DataFrame ===")
print(df)

print("Question generation complete.")

# Document Creation from dataset using ChromaDB

In [None]:
import chromadb
from chromadb.utils import embedding_functions
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the DataFrame
df = pd.read_csv('hadiths_data_with_question_and_answers_final.csv')

# Open (or create) the on-disk ChromaDB store
client = chromadb.PersistentClient(path="hadith_synthetic_rag_source") 

# Reuse the collection when it already exists so this cell can be re-run;
# create_collection raises if the name is already taken in the store.
collection = client.get_or_create_collection(name="hadith_synthetic_rag_source_complete")

# Load the sentence-embedding model on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Render every row as one text document.
# NOTE(review): the continuation lines of this f-string carry their source
# indentation and a second newline into the stored text — confirm whether
# that whitespace is intended.
documents = [
    f"""Rawi: {row['Rawi']}\n
    Chapter: {row['Chapter']}\n
    Reference: {row['Reference']}\n
    Hadith Number: {row['Hadith Number']}\n
    Narator: {row['Narator']}\n
    Hadith Text: {row['Hadiths Text']}\n
    Sample Question: {row['Sample Question']}\n
    Answer: {row['Synthetic Answer']}\n
    """

    for _, row in df.iterrows()
]
ids = [f"row_{i}" for i in range(len(documents))]

# Compute one embedding per document on the selected device
embeddings = model.encode(documents, convert_to_tensor=True, device=device)

# Insert or overwrite by ID so a re-run refreshes the existing entries.
# Entries from an earlier, larger dataset are not removed by this.
collection.upsert(
    documents=documents,
    ids=ids,
    embeddings=embeddings.cpu().numpy()  # Convert embeddings to numpy array
)

# Debugging print
print(f"Number of documents in collection: {collection.count()}")

# Creating the chatbot pipeline with gradio interface

In [None]:
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import chromadb
import torch
from sentence_transformers import SentenceTransformer
import os
from chromadb.utils import embedding_functions
import pandas as pd


# Load the DataFrame
df = pd.read_csv('hadiths_data_with_question_and_answers_final.csv')

# Open the on-disk ChromaDB store
client = chromadb.PersistentClient(path="hadith_synthetic_rag_source") 

# Load the existing collection (it must have been built beforehand)
collection_name = "hadith_synthetic_rag_source_complete"
collection = client.get_collection(collection_name)

# Debugging print to verify the number of documents in the collection
print(f"Number of documents in collection: {collection.count()}")

# Generator model and tokenizer.
# The Hugging Face token read from the environment is forwarded to both
# downloads; it is needed when the repository requires authentication.
# With token=None the library falls back to its default credentials.
# model_name = "google/flan-t5-base"
token = os.getenv("HUGGINGFACE_TOKEN")
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    device_map="auto",
    torch_dtype=torch.float16,
    pad_token_id=tokenizer.eos_token_id,
)

# Sentence-embedding model used to embed queries for retrieval
device = 'cuda' if torch.cuda.is_available() else 'cpu'
retrieval_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Look up the stored documents closest to a query
def query_collection(query, n_results):
    """Embed `query` and return the collection's `n_results` nearest matches.

    The query is encoded with the module-level `retrieval_model` and the
    embedding is passed to `collection.query`; its result is returned as is.
    """
    encoded_query = retrieval_model.encode([query], convert_to_tensor=True, device=device)
    return collection.query(
        query_embeddings=encoded_query.cpu().numpy(),
        n_results=n_results,
    )

# Generate a response using the retrieved documents as context
def generate_response(context, question, max_new_tokens=512):
    """Answer `question` with the language model, grounded in `context`.

    Args:
        context: Retrieved text placed before the question in the prompt.
        question: The user's question.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            independent of the prompt length, so a long context does not use
            up the generation budget.

    Returns:
        Only the newly generated answer text, without the prompt.
    """
    prompt = f"Please provide a short, well-structured answer and avoids repetition from context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
    # Put the inputs on the model's own device
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    prompt_length = inputs["input_ids"].shape[-1]
    # Beam search without sampling; a temperature would be ignored here
    outputs = llm.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        num_beams=5,
        pad_token_id=tokenizer.eos_token_id,
    )
    # A causal LM returns prompt + continuation; drop the prompt tokens so the
    # caller gets the answer alone
    answer_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

# Retrieval-augmented answer: retrieve, build context, generate
def chatbot_response(user_query, top_k=2):
    """Answer `user_query` from the `top_k` most similar stored documents.

    The retrieved documents are joined with blank lines into one context
    string, which is passed to `generate_response` with the query.
    """
    retrieved = query_collection(user_query, top_k)

    # The query result holds one list of documents per query; flatten them
    passages = []
    for group in retrieved['documents']:
        passages.extend(group)

    return generate_response("\n\n".join(passages), user_query)

# Module-level flag: set by stop(), cleared at the start of every query
stop_processing = False

def chatbot(query, num_candidates):
    """Gradio handler: return the text to show for a user query.

    Blank queries get a prompt to ask a question. Otherwise the answer comes
    from `chatbot_response`; if the stop flag was raised while it was being
    produced, a stop notice is returned instead, and answers that express
    uncertainty are replaced by a fixed apology.
    """
    global stop_processing
    stop_processing = False  # Every query starts with the flag cleared

    # Nothing to answer for an empty or whitespace-only query
    if query.strip() == "":
        return "Please ask a question about hadiths."

    # Retrieve references and generate the answer
    answer = chatbot_response(query, num_candidates)

    # The flag can only have been raised while the answer was being produced
    if stop_processing:
        return "Processing was stopped by the user."

    # Replace answers that admit uncertainty with a fixed apology
    lowered = answer.lower()
    if any(marker in lowered for marker in ("don't know", "not sure")):
        return "Sorry. I don't have information about the hadiths related. It might be a dhoif, or maudhu, or I just don't have the knowledge."
    return answer

def stop():
    """Raise the module-level stop flag and return a confirmation message.

    The flag is read by `chatbot` after its answer has been generated.
    """
    global stop_processing
    stop_processing = True
    return "Processing stopped."

# Build the Gradio interface
with gr.Blocks() as demo:
    # Intro text; it names the generator model that this notebook loads
    # (Llama 3.2 3B Instruct)
    gr.Markdown(
        """
        # Burhan AI
        Assalamualaikum! I am Burhan AI, a chatbot that can help you find answers to your questions about hadiths. 
        \n 
        Please note that this is a demo version and may not be perfect.
        This chatbot is powered by ChromaDB and the Llama 3.2 3B Instruct model with a RAG architecture.
        Llama 3.2 3B Instruct is a small model and may not be as accurate as the bigger models.
        If you have any feedback or suggestions, you can contact me at frendyrachman7@gmail.com
        \n
        Jazakallah Khairan!
        """
    )
    with gr.Row():
        query_input = gr.Textbox(lines=2, placeholder="Enter your question here...")
        # Number of retrieved documents passed to the model as context
        num_candidates_input = gr.Slider(minimum=1, maximum=10, value=2, step=1, label="Number of References")
        submit_button = gr.Button("Submit")
    
    output_text = gr.Textbox(label="Response")

    submit_button.click(chatbot, inputs=[query_input, num_candidates_input], outputs=output_text)

    # Button that raises the stop flag; its status text goes to a hidden box
    stop_button = gr.Button("Stop Processing")
    stop_output = gr.Textbox(visible=False)
    stop_button.click(stop, inputs=[], outputs=stop_output)

# Start the Gradio interface
demo.launch()

# Fine Tuning Preparation using unsloth and Llama 3.2 3b Instruct

In [None]:
import pandas as pd
from datasets import Dataset

# Read the question/answer table
df = pd.read_csv('hadiths_data_with_question_and_answers_final.csv')


def _to_conversation(row):
    """Build the four-turn conversation for one row: system prompt, question,
    hadith text (as a second system turn), then the synthetic answer."""
    return [
        {"from": "system", "value": "You are an AI assistant for hadiths. Please provide a response based on the hadith."},
        {"from": "human", "value": row['Sample Question']},
        {"from": "system", "value": row['Hadiths Text']},
        {"from": "gpt", "value": row['Synthetic Answer']},
    ]


# One conversation per row, stored in the 'conversation' column
df['conversation'] = df.apply(_to_conversation, axis=1)

# Wrap as a Dataset and drop the source columns
dataset = Dataset.from_pandas(df)
source_columns = ['Rawi', 'Chapter', 'Reference', 'Narator', 'Hadiths Text', 'Sample Question', 'Synthetic Answer', 'Hadith Number']
dataset = dataset.remove_columns(source_columns)
dataset

In [None]:
import pandas as pd
from datasets import Dataset
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import standardize_sharegpt

# Rebind `tokenizer` to the result of applying the "llama-3.1" chat template.
# `tokenizer` must already exist in the session when this runs.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render every conversation of a batch to one text string.

    Reads the batch's "conversations" list, applies the tokenizer's chat
    template to each entry without tokenizing and without a generation
    prompt, and returns the strings under the "text" key.
    """
    texts = []
    for convo in examples["conversations"]:
        rendered = tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        texts.append(rendered)
    return {"text": texts}

# Read the question/answer table
df = pd.read_csv('hadiths_data_with_question_and_answers_final.csv')


def _as_sharegpt(row):
    """Build the three-turn conversation for one row: system prompt,
    question, synthetic answer."""
    return [
        {"from": "system", "value": "You are an AI assistant for hadiths. Please provide a response based on the hadith."},
        {"from": "human", "value": row['Sample Question']},
        {"from": "gpt", "value": row['Synthetic Answer']},
    ]


# One conversation per row, stored in the 'conversations' column
df['conversations'] = df.apply(_as_sharegpt, axis=1)

# Wrap as a Dataset, standardize the conversation format, then render each
# conversation to text in batches
dataset = standardize_sharegpt(Dataset.from_pandas(df))
dataset = dataset.map(formatting_prompts_func, batched=True)
dataset

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import Dataset

# This cell reads an environment variable, so it imports `os` itself instead
# of relying on an earlier cell having done so.
import os

# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(df)

max_seq_length = 2048 # Maximum sequence length for the model
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False

# Hugging Face access token (None when the variable is unset)
token = os.getenv("HUGGINGFACE_TOKEN")

# Load the base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token=token
)

In [None]:
import pandas as pd
from datasets import Dataset
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import standardize_sharegpt

# Rebind `tokenizer` to the result of applying the "llama-3.1" chat template
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Turn a batch of conversations into chat-template strings.

    Each entry of ``examples["conversations"]`` is passed through the
    tokenizer's chat template (no tokenization, no generation prompt); the
    resulting strings are returned in a dict under "text".
    """
    rendered_texts = []
    for conversation in examples["conversations"]:
        rendered_texts.append(
            tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )
    return {"text": rendered_texts}

# Read the question/answer table
df = pd.read_csv('hadiths_data_with_question_and_answers_final.csv')


def _row_to_turns(row):
    """Return the system / human / gpt turns for one row."""
    system_turn = {"from": "system", "value": "You are an AI assistant for hadiths. Please provide a response based on the hadith."}
    human_turn = {"from": "human", "value": row['Sample Question']}
    gpt_turn = {"from": "gpt", "value": row['Synthetic Answer']}
    return [system_turn, human_turn, gpt_turn]


# One conversation per row, stored in the 'conversations' column
df['conversations'] = df.apply(_row_to_turns, axis=1)

# Wrap as a Dataset, standardize the conversation format, then add the
# rendered "text" field in batches
dataset = Dataset.from_pandas(df)
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched=True)
dataset

In [None]:
# LoRA adapter settings, collected in one place
lora_settings = dict(
    r=16,  # Adapter rank
    # Projection layers that receive adapters
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Wrap the loaded model with the adapters configured above
model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Mixed-precision mode: bf16 when the hardware supports it, fp16 otherwise
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for the run
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning trainer over the dataset's rendered "text" field
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Header strings that open a user turn and an assistant turn in the
# rendered chat text
user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Rewrap the trainer, passing the two headers as instruction/response markers
trainer = train_on_responses_only(
    trainer,
    instruction_part=user_header,
    response_part=assistant_header,
)

In [None]:
# Inspect training example 5: first the full input, then its labels
sample = trainer.train_dataset[5]
tokenizer.decode(sample["input_ids"])

# Swap every -100 label for the token id of a space so the label sequence
# can be decoded
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
decodable_labels = [space if x == -100 else x for x in sample["labels"]]
tokenizer.decode(decodable_labels)

In [None]:
# Run the training loop; the value returned by train() is kept for inspection
trainer_stats = trainer.train()

inference

In [None]:
from unsloth.chat_templates import get_chat_template

from transformers import TextStreamer

# Rebind the tokenizer with the "llama-3.1" chat template and switch the
# model to inference mode
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
FastLanguageModel.for_inference(model)

# A single user turn to test generation
messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]

# Render and tokenize the chat, ending with the generation prompt, and move
# the resulting tensor to the GPU
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream generated tokens as they are produced, without echoing the prompt
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

Save the model

In [None]:
# This cell reads an environment variable, so it imports `os` itself instead
# of relying on an earlier cell having done so.
import os

# Local saving: model and tokenizer go to the same directory
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Online saving: push both to the Hugging Face Hub using the token from the
# environment (replace "your_name" with the target account)
token = os.getenv("HUGGINGFACE_TOKEN")
model.push_to_hub("your_name/burhan-ai-finetuned-llama-3.2-3b-i", token=token)
tokenizer.push_to_hub("your_name/burhan-ai-finetuned-llama-3.2-3b-i", token=token)