<div style="display:block;width:100%;margin:auto;" direction=rtl align=center>
    <br><br>
    <div style="width:100%;margin:100;display:block;background-color:#fff0;" display=block align=center>
        <table style="border-style:hidden;border-collapse:collapse;">
            <tr>
                <td style="border: none!important;">
                    <img width=130 align=right src="https://i.ibb.co/yXKQmtZ/logo1.png" style="margin:0;" />
                </td>
                <td style="text-align:center;border: none!important;">
                    <h1 align=center><font size=5 color="#045F5F"> <b> Large Language Models (LLM)</b><br><br>Final Project</font></h1>
                </td>
                <td style="border: none!important;">
                    <img width=170 align=left src="https://i.ibb.co/wLjqFkw/logo2.png" style="margin:0;" />
                </td>
            </tr>
        </table>
        <h1> Farzad Jannati - Abolfazl Asarian Nejad - Shahriar Rahimi Rad </h1>
        <h1> Prof. MJ. Dousti &amp; Yadollah Yaghoobzadeh </h1>
    </div>
</div>

# Persian News Title Generation with Fine-Tuned Llama 3

>[Persian News Title Generation with Fine-Tuned Llama 3](#scrollTo=ruIIPk6sB0Kq)

>>[Title Generation Inference](#scrollTo=iuDItTAmBio6)

>>>[Import required libraries](#scrollTo=HysyMwHNBqpW)

>>>[Hugging Face Authentication](#scrollTo=5Z-hzQghV0RT)

>>>[Initialization and Pipeline Configuration](#scrollTo=_qFVhFwPBwzq)

>>>[Loading and Preprocessing the Hamshahri Dataset](#scrollTo=uOGn5ueZCaOi)

>>>[Generating a Title for a Single Text Input](#scrollTo=kH5E1BL8Cot2)

>>>[Performing Batch Inference on the Dataset](#scrollTo=tzdYUg8nCqag)

>>>[Saving Results to Multiple Formats (Excel, CSV, JSON, TXT)](#scrollTo=tM4VxYEICxRN)

>>>[Basic Analysis of the Generated Results](#scrollTo=ALqy38WzC6KI)

>>>[Main Function: Executing the Full Inference Pipeline](#scrollTo=BYThP65ADJFn)



## Title Generation Inference

### Import required libraries

In [22]:
!pip install -q bitsandbytes accelerate

In [23]:
!pip install -q transformers peft datasets

In [24]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import json
from datetime import datetime
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

### Hugging Face Authentication

In [49]:
from huggingface_hub import login

def authenticate_huggingface():
    """
    Authenticate with Hugging Face using an access token typed by the user.

    The token is read with ``getpass`` so the secret is not echoed into the
    terminal / notebook output.

    Returns:
        str or None: The token on successful login (so it can be reused when
        loading models), or None if no token was entered or login failed.
    """
    # Local import: only this function needs it.
    from getpass import getpass

    # Same prompt as before, but the typed characters are hidden.
    HF_TOKEN = getpass("Please enter your Hugging Face access token: ").strip()

    if not HF_TOKEN:
        # Nothing usable was typed; do not hand an empty token to login().
        print("✗ Authentication failed: no token was provided")
        print("Please make sure your token is correct and has access to Llama models")
        return None

    try:
        login(token=HF_TOKEN)
        print("✓ Successfully authenticated with Hugging Face!")
        return HF_TOKEN  # Return the token to use later
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"✗ Authentication failed: {e}")
        print("Please make sure your token is correct and has access to Llama models")
        return None

### Initialization and Pipeline Configuration

In [50]:
class PersianTitleInference:
    """
    Inference pipeline that generates titles for Persian news articles using a
    fine-tuned Llama 3 model with LoRA adapters.

    Attributes set in ``__init__``:
        adapter_path: directory with the LoRA adapter files.
        device: device string the input tensors are moved to.
        base_model_name: base model id (may be filled in by ``load_model``).
        hf_token: Hugging Face token passed to ``from_pretrained`` calls.
        model / tokenizer: populated by ``load_model``.
        model_loaded: True once ``load_model`` has succeeded.
        inference_template / system_message: pieces of the prompt.
    """

    def __init__(self, adapter_path=".", base_model_name=None, device=None, hf_token=None):
        """
        Store the configuration; nothing is loaded until ``load_model`` runs.

        Args:
            adapter_path (str): Path to the directory containing LoRA adapter files
            base_model_name (str): Name of the base model (if None, will be loaded from adapter config)
            device (str): Device to run inference on ('cuda' or 'cpu'); if not
                given, 'cuda' is used when available, otherwise 'cpu'
            hf_token (str): Hugging Face authentication token
        """
        self.adapter_path = adapter_path
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.base_model_name = base_model_name
        self.hf_token = hf_token
        self.model = None
        self.tokenizer = None
        self.model_loaded = False

        # Inference template for Llama 3. It ends with the assistant header so
        # that the model's continuation is the title itself.
        self.inference_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>
{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

        self.system_message = "you are an ai assistant to generate persian title for given article"

    def load_model(self):
        """
        Load the base model (4-bit quantized), its tokenizer, and apply the
        LoRA adapters.

        Returns:
            bool: True if the model is ready (or was already loaded), False if
            any step raised; the error is printed rather than propagated.
        """
        if self.model_loaded:
            print("Model already loaded.")
            return True

        try:
            print(f"Loading PEFT configuration from {self.adapter_path}...")

            # Load the PEFT configuration to get base model name if not provided
            peft_config = PeftConfig.from_pretrained(self.adapter_path)
            if not self.base_model_name:
                self.base_model_name = peft_config.base_model_name_or_path
            print(f"Using base model: {self.base_model_name}")

            # Configure 4-bit quantization for memory efficiency
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True
            )

            print(f"Loading base model: {self.base_model_name}")
            # Load base model with quantization and token.
            # NOTE(review): trust_remote_code=True lets the model repository
            # run its own Python code - confirm it is actually required.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.base_model_name,
                quantization_config=quantization_config,
                device_map="auto",
                trust_remote_code=True,
                token=self.hf_token  # Use the stored token
            )

            # Load tokenizer
            print("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.base_model_name,
                token=self.hf_token  # Use the stored token
            )
            # Reuse EOS as the padding token and pad on the left.
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.padding_side = "left"

            # Apply LoRA adapters
            print(f"Applying LoRA adapters from {self.adapter_path}...")
            self.model = PeftModel.from_pretrained(self.model, self.adapter_path)

            # Set model to evaluation mode
            self.model.eval()

            self.model_loaded = True
            print("Model with LoRA adapters loaded successfully!")
            return True

        except Exception as e:
            print(f"Error loading model: {e}")
            return False

    def load_hamshahri_data(self, file_path):
        """
        Load and preprocess the Hamshahri news dataset.

        Missing values in 'text', 'introtext' and 'title' are replaced with
        empty strings, and a 'summary' column is added.

        Args:
            file_path (str): Path to the Excel file containing news data

        Returns:
            pd.DataFrame: Preprocessed dataframe with news articles, or None
            if reading or preprocessing raised (the error is printed).
        """
        print(f"Loading data from {file_path}...")

        try:
            # Load Excel file
            df = pd.read_excel(file_path)

            # Display dataset information
            print(f"Dataset shape: {df.shape}")
            print(f"Columns: {df.columns.tolist()}")

            # Clean and preprocess text
            df['text'] = df['text'].fillna('')
            df['introtext'] = df['introtext'].fillna('')
            df['title'] = df['title'].fillna('')

            # Summary = introtext when it is longer than 20 characters,
            # otherwise the first 500 characters of the text, otherwise a
            # fixed placeholder.
            df['summary'] = df.apply(
                lambda row: row['introtext'] if row['introtext'] and len(row['introtext']) > 20
                else row['text'][:500] if row['text'] else "متن خبر موجود نیست",
                axis=1
            )

            print(f"Loaded {len(df)} news articles.")
            return df

        except Exception as e:
            print(f"Error loading data: {e}")
            return None

    def generate_title(self, text, max_new_tokens=64, temperature=0.7, top_p=0.9,
                       max_input_tokens=512):
        """
        Generate a title for the given text using the model with LoRA adapters.

        Only the article text is shortened to fit the input budget, so the
        prompt always ends with the assistant header. The title is taken from
        the newly generated tokens only, never from the echoed prompt.

        Args:
            text (str): Input text (article or summary)
            max_new_tokens (int): Maximum number of tokens to generate
            temperature (float): Sampling temperature
            top_p (float): Nucleus sampling parameter
            max_input_tokens (int): Token budget for the prompt; the article
                text is cut so that template + instruction + text fit in it

        Returns:
            str: First line of the generated title, or a fixed Persian error
            message if the model could not be loaded or generation raised.
        """
        if not self.model_loaded:
            print("Model not loaded. Loading model...")
            if not self.load_model():
                return "خطا در بارگذاری مدل"

        try:
            instruction = "generate a proper title for this article:\n"

            # Measure the fixed part of the prompt (template, system message,
            # instruction) to know how many tokens are left for the article.
            scaffold = self.inference_template.format(
                system_message=self.system_message,
                user_message=instruction
            )
            scaffold_length = len(self.tokenizer(scaffold)["input_ids"])
            article_budget = max(max_input_tokens - scaffold_length, 1)

            # Shorten the article itself instead of truncating the whole
            # prompt: cutting the prompt from the right would remove the
            # closing <|eot_id|> and the assistant header.
            article = str(text)
            article_ids = self.tokenizer(article, add_special_tokens=False)["input_ids"]
            if len(article_ids) > article_budget:
                article = self.tokenizer.decode(
                    article_ids[:article_budget], skip_special_tokens=True
                )
                # Presumably a cut can land inside a multi-byte character and
                # decode to U+FFFD; drop such a trailing marker if present.
                article = article.rstrip("\ufffd")

            prompt = self.inference_template.format(
                system_message=self.system_message,
                user_message=instruction + article
            )

            # Tokenize input (already bounded above, so no truncation here)
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            prompt_length = inputs["input_ids"].shape[1]

            # Generate title
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # The returned sequence starts with the prompt tokens; keep only
            # what was generated after them.
            new_tokens = outputs[0][prompt_length:]
            generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

            # Clean up the title (remove any remaining tags) and keep the
            # first line only.
            title = generated_text.replace("<|eot_id|>", "").strip()
            title = title.split("\n")[0].strip()

            return title

        except Exception as e:
            print(f"Error generating title: {e}")
            return "خطا در تولید عنوان"

    def batch_inference(self, df, input_column='summary', batch_size=4):
        """
        Generate a title for every row of the dataset.

        Rows are walked in slices of ``batch_size`` (this sets the progress
        bar granularity); each row is still sent to ``generate_title``
        individually.

        Args:
            df (pd.DataFrame): DataFrame containing the articles
            input_column (str): Column to use as input ('text' or 'summary')
            batch_size (int): Number of samples to process in each batch

        Returns:
            list: One generated title (or placeholder) per row, in row order
        """
        print(f"Starting batch inference on {len(df)} articles...")
        print(f"Using '{input_column}' as input")

        generated_titles = []

        # Process in batches
        for i in tqdm(range(0, len(df), batch_size), desc="Generating titles"):
            batch = df.iloc[i:i+batch_size]

            for _, row in batch.iterrows():
                input_text = row[input_column]

                # Skip empty / very short texts but keep the output aligned
                # with the rows by appending a placeholder.
                if not input_text or len(str(input_text).strip()) < 10:
                    generated_titles.append("متن ورودی ناکافی")
                    continue

                # Truncate very long texts to prevent token overflow
                if len(str(input_text)) > 2000:
                    input_text = str(input_text)[:2000]

                # Generate title
                title = self.generate_title(input_text)
                generated_titles.append(title)

        return generated_titles

    def save_results(self, df, generated_titles, output_path):
        """
        Save the results to Excel, CSV, a plain-text title list and a JSON
        comparison file, all named with the current timestamp.

        Note: ``df`` is modified in place - a 'generated_title' column is added.

        Args:
            df (pd.DataFrame): Original dataframe
            generated_titles (list): List of generated titles
            output_path (str): Directory for the result files (created if missing)
        """
        print("Saving results...")

        # Add generated titles to dataframe
        df['generated_title'] = generated_titles

        # Create output directory if it doesn't exist
        os.makedirs(output_path, exist_ok=True)

        # Generate timestamp for filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save to Excel
        excel_path = os.path.join(output_path, f'hamshahri_with_titles_{timestamp}.xlsx')
        df.to_excel(excel_path, index=False)
        print(f"Saved Excel file: {excel_path}")

        # Save to CSV
        csv_path = os.path.join(output_path, f'hamshahri_with_titles_{timestamp}.csv')
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"Saved CSV file: {csv_path}")

        # Save only the generated titles, one per line
        titles_path = os.path.join(output_path, f'generated_titles_{timestamp}.txt')
        with open(titles_path, 'w', encoding='utf-8') as f:
            for title in generated_titles:
                f.write(title + '\n')
        print(f"Saved titles file: {titles_path}")

        # Save comparison JSON for analysis; the summary is shortened to
        # 200 characters plus an ellipsis.
        comparison_data = []
        for i, row in df.iterrows():
            comparison_data.append({
                'id': i,
                'news_id': row.get('news_id', i),
                'original_title': row['title'],
                'generated_title': row['generated_title'],
                'category': row.get('category', 'Unknown'),
                'summary': row['summary'][:200] + '...' if len(row['summary']) > 200 else row['summary']
            })

        json_path = os.path.join(output_path, f'title_comparison_{timestamp}.json')
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(comparison_data, f, ensure_ascii=False, indent=2)
        print(f"Saved comparison JSON: {json_path}")

    def analyze_results(self, df):
        """
        Print basic statistics and a few random original/generated title pairs.

        Note: ``df`` is modified in place - 'original_title_length' and
        'generated_title_length' columns are added.

        Args:
            df (pd.DataFrame): DataFrame with 'title' and 'generated_title' columns
        """
        print("\n=== Results Analysis ===")

        # Basic statistics
        print(f"Total articles processed: {len(df)}")

        # Title lengths in characters
        df['original_title_length'] = df['title'].astype(str).str.len()
        df['generated_title_length'] = df['generated_title'].astype(str).str.len()

        print(f"Average original title length: {df['original_title_length'].mean():.1f} characters")
        print(f"Average generated title length: {df['generated_title_length'].mean():.1f} characters")

        # Sample comparisons (up to five random rows)
        print("\n=== Sample Title Comparisons ===")
        samples = df.sample(min(5, len(df)))

        for _, row in samples.iterrows():
            print(f"\nNews ID: {row.get('news_id', 'N/A')}")
            print(f"Category: {row.get('category', 'Unknown')}")
            print(f"Original Title: {row['title']}")
            print(f"Generated Title: {row['generated_title']}")
            print("-" * 80)

### Loading and Preprocessing the Hamshahri Dataset

In [39]:
def load_hamshahri_data(self, file_path):
    """
    Read the Hamshahri news spreadsheet and add a 'summary' column.

    Missing values in 'text', 'introtext' and 'title' become empty strings.
    The summary is the introtext when it is longer than 20 characters,
    otherwise the first 500 characters of the text, otherwise a placeholder.

    Args:
        file_path (str): Path to the Excel file containing news data

    Returns:
        pd.DataFrame: The preprocessed dataframe, or None if anything raised
        (the error is printed).
    """

    def choose_summary(row):
        # Prefer a sufficiently long intro text.
        if row['introtext'] and len(row['introtext']) > 20:
            return row['introtext']
        # Otherwise fall back to the start of the article body.
        if row['text']:
            return row['text'][:500]
        return "متن خبر موجود نیست"

    print(f"Loading data from {file_path}...")

    try:
        df = pd.read_excel(file_path)

        # Show what was loaded
        print(f"Dataset shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")

        # Replace missing cells with empty strings
        for column in ('text', 'introtext', 'title'):
            df[column] = df[column].fillna('')

        df['summary'] = df.apply(choose_summary, axis=1)

        print(f"Loaded {len(df)} news articles.")
        return df

    except Exception as e:
        print(f"Error loading data: {e}")
        return None

### Generating a Title for a Single Text Input

In [40]:
def generate_title(self, text, max_new_tokens=64, temperature=0.7, top_p=0.9,
                   max_input_tokens=512):
    """
    Generate a title for the given text using the model with LoRA adapters.

    Only the article text is shortened to fit the input budget, so the prompt
    always ends with the assistant header. The title is taken from the newly
    generated tokens only, never from the echoed prompt.

    Args:
        text (str): Input text (article or summary)
        max_new_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling parameter
        max_input_tokens (int): Token budget for the prompt; the article text
            is cut so that template + instruction + text fit in it

    Returns:
        str: First line of the generated title, or a fixed Persian error
        message if the model could not be loaded or generation raised.
    """
    if not self.model_loaded:
        print("Model not loaded. Loading model...")
        if not self.load_model():
            return "خطا در بارگذاری مدل"

    try:
        instruction = "generate a proper title for this article:\n"

        # Measure the fixed part of the prompt (template, system message,
        # instruction) to know how many tokens are left for the article.
        scaffold = self.inference_template.format(
            system_message=self.system_message,
            user_message=instruction
        )
        scaffold_length = len(self.tokenizer(scaffold)["input_ids"])
        article_budget = max(max_input_tokens - scaffold_length, 1)

        # Shorten the article itself instead of truncating the whole prompt:
        # cutting the prompt from the right would remove the closing
        # <|eot_id|> and the assistant header.
        article = str(text)
        article_ids = self.tokenizer(article, add_special_tokens=False)["input_ids"]
        if len(article_ids) > article_budget:
            article = self.tokenizer.decode(
                article_ids[:article_budget], skip_special_tokens=True
            )
            # Presumably a cut can land inside a multi-byte character and
            # decode to U+FFFD; drop such a trailing marker if present.
            article = article.rstrip("\ufffd")

        prompt = self.inference_template.format(
            system_message=self.system_message,
            user_message=instruction + article
        )

        # Tokenize input (already bounded above, so no truncation here)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        # Generate title
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # The returned sequence starts with the prompt tokens; keep only what
        # was generated after them.
        new_tokens = outputs[0][prompt_length:]
        generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

        # Clean up the title (remove any remaining tags) and keep the first
        # line only.
        title = generated_text.replace("<|eot_id|>", "").strip()
        title = title.split("\n")[0].strip()

        return title

    except Exception as e:
        print(f"Error generating title: {e}")
        return "خطا در تولید عنوان"

### Performing Batch Inference on the Dataset

In [41]:
def batch_inference(self, df, input_column='summary', batch_size=4):
    """
    Produce one title per dataframe row.

    The rows are visited in slices of ``batch_size`` (one progress-bar step
    per slice); each row is passed to ``generate_title`` on its own.

    Args:
        df (pd.DataFrame): DataFrame containing the articles
        input_column (str): Column to use as input ('text' or 'summary')
        batch_size (int): Number of rows per slice

    Returns:
        list: Generated titles (or a placeholder for unusable rows), in row order
    """
    print(f"Starting batch inference on {len(df)} articles...")
    print(f"Using '{input_column}' as input")

    char_limit = 2000
    results = []

    slice_starts = range(0, len(df), batch_size)
    for start in tqdm(slice_starts, desc="Generating titles"):
        chunk = df.iloc[start:start + batch_size]

        for _, record in chunk.iterrows():
            value = record[input_column]

            # Empty or very short inputs get a placeholder so the output
            # stays aligned with the rows.
            if not value or len(str(value).strip()) < 10:
                results.append("متن ورودی ناکافی")
                continue

            # Over-long inputs are cut to the character limit; anything else
            # is passed through unchanged.
            as_text = str(value)
            model_input = as_text[:char_limit] if len(as_text) > char_limit else value

            results.append(self.generate_title(model_input))

    return results

### Saving Results to Multiple Formats (Excel, CSV, JSON, TXT)

In [42]:
def save_results(self, df, generated_titles, output_path):
    """
    Write the results as Excel, CSV, a plain-text title list and a JSON
    comparison file, all named with the current timestamp.

    Note: ``df`` is modified in place - a 'generated_title' column is added.

    Args:
        df (pd.DataFrame): Original dataframe
        generated_titles (list): List of generated titles
        output_path (str): Directory for the result files (created if missing)
    """

    def comparison_entry(i, row):
        # One JSON record per article; the summary is capped at 200
        # characters plus an ellipsis.
        entry = {
            'id': i,
            'news_id': row.get('news_id', i),
            'original_title': row['title'],
            'generated_title': row['generated_title'],
            'category': row.get('category', 'Unknown'),
        }
        if len(row['summary']) > 200:
            entry['summary'] = row['summary'][:200] + '...'
        else:
            entry['summary'] = row['summary']
        return entry

    print("Saving results...")

    df['generated_title'] = generated_titles
    os.makedirs(output_path, exist_ok=True)

    # Shared by every file name written below
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Excel
    excel_path = os.path.join(output_path, f'hamshahri_with_titles_{timestamp}.xlsx')
    df.to_excel(excel_path, index=False)
    print(f"Saved Excel file: {excel_path}")

    # CSV
    csv_path = os.path.join(output_path, f'hamshahri_with_titles_{timestamp}.csv')
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"Saved CSV file: {csv_path}")

    # Generated titles only, one per line
    titles_path = os.path.join(output_path, f'generated_titles_{timestamp}.txt')
    with open(titles_path, 'w', encoding='utf-8') as f:
        f.writelines(title + '\n' for title in generated_titles)
    print(f"Saved titles file: {titles_path}")

    # Original vs. generated comparison as JSON
    comparison_data = [comparison_entry(i, row) for i, row in df.iterrows()]
    json_path = os.path.join(output_path, f'title_comparison_{timestamp}.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(comparison_data, f, ensure_ascii=False, indent=2)
    print(f"Saved comparison JSON: {json_path}")

### Basic Analysis of the Generated Results

In [43]:
def analyze_results(self, df):
    """
    Print summary statistics and a few random title comparisons.

    Note: ``df`` is modified in place - 'original_title_length' and
    'generated_title_length' columns are added.

    Args:
        df (pd.DataFrame): DataFrame with 'title' and 'generated_title' columns
    """
    print("\n=== Results Analysis ===")
    print(f"Total articles processed: {len(df)}")

    # Character length of each title, original first, then generated
    length_columns = (
        ('title', 'original_title_length'),
        ('generated_title', 'generated_title_length'),
    )
    for source_column, length_column in length_columns:
        df[length_column] = df[source_column].astype(str).str.len()

    print(f"Average original title length: {df['original_title_length'].mean():.1f} characters")
    print(f"Average generated title length: {df['generated_title_length'].mean():.1f} characters")

    # Show up to five randomly chosen rows side by side
    print("\n=== Sample Title Comparisons ===")
    sample_count = min(5, len(df))
    for _, row in df.sample(sample_count).iterrows():
        print(f"\nNews ID: {row.get('news_id', 'N/A')}")
        print(f"Category: {row.get('category', 'Unknown')}")
        print(f"Original Title: {row['title']}")
        print(f"Generated Title: {row['generated_title']}")
        print("-" * 80)

In [44]:
def verify_files(required_files=None, base_dir="."):
    """
    Verify that the files needed by the pipeline exist.

    Every file is checked and reported, so one run lists all missing files
    instead of stopping at the first one.

    Args:
        required_files (iterable of str, optional): File names to look for.
            Defaults to the adapter config, the adapter weights and the
            Hamshahri Excel sheet.
        base_dir (str): Directory the names are resolved against. Defaults to
            the current directory.

    Returns:
        bool: True if every file exists, False otherwise.
    """
    if required_files is None:
        required_files = ['adapter_config.json', 'adapter_model.safetensors', 'hamshahri_online_10.xlsx']

    print("Checking for required files...")
    missing = []
    for file in required_files:
        if os.path.exists(os.path.join(base_dir, file)):
            print(f"✓ Found: {file}")
        else:
            print(f"✗ Missing: {file}")
            missing.append(file)

    if missing:
        return False

    print("All required files found!")
    return True

### Main Function: Executing the Full Inference Pipeline

In [None]:
def main():
    """
    Run the whole pipeline: authenticate, check files, load the model and the
    data, generate titles, then save and analyze the results.

    Each step that fails prints a message and ends the run early.
    """
    rule = "=" * 80

    def banner(message, blank_line_first=False):
        # A message framed by two rules, optionally preceded by an empty line.
        print("\n" + rule if blank_line_first else rule)
        print(message)
        print(rule)

    # Step 1: Hugging Face login
    print("=== Hugging Face Authentication ===")
    hf_token = authenticate_huggingface()
    if not hf_token:
        print("Authentication failed. Please check your token and try again.")
        return

    # Step 2: required files must be present
    if not verify_files():
        print("Please upload all required files to continue.")
        return

    # Settings for this run
    adapter_path = './'  # LoRA adapter directory
    base_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # base model id
    data_path = './hamshahri_online_10.xlsx'  # Hamshahri dataset
    output_path = './inference_results'  # where results are written

    banner("Persian News Title Generation using Llama 3 with LoRA Adapters")

    # Step 3: build the pipeline and load the model
    pipeline = PersianTitleInference(adapter_path, base_model_name, hf_token=hf_token)
    if not pipeline.load_model():
        print("Failed to load model. Exiting...")
        return

    # Step 4: load the dataset
    df = pipeline.load_hamshahri_data(data_path)
    if df is None:
        print("Failed to load data. Exiting...")
        return

    # Step 5: timed title generation from the summaries
    banner("Starting title generation...", blank_line_first=True)
    started_at = datetime.now()
    generated_titles = pipeline.batch_inference(df, input_column='summary', batch_size=4)
    elapsed = datetime.now() - started_at
    print(f"\nInference completed in {elapsed}")

    # Step 6: persist and inspect the results
    pipeline.save_results(df, generated_titles, output_path)
    pipeline.analyze_results(df)

    banner("Pipeline Completed Successfully", blank_line_first=True)

# Run the pipeline only when this module is executed as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()