In [None]:
#%pip install accelerate
#!pip install -r requirements.txt

In [None]:
#!pip install transformers torch accelerate datasets pandas

In [None]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import torch
import re
from typing import List, Dict
import os
import pandas as pd
from datasets import Dataset


class NumericHeadlineAnalyzer:
    """Generate news headlines with a Llama 3 instruct model and pull numerals out of text.

    The constructor loads the tokenizer and model named by ``self.model_name``
    and wraps them in a ``text-generation`` pipeline stored on
    ``self.generator``. ``extract_numbers`` is pure regex work and does not
    touch the model.
    """

    # Compiled once for reuse. Alternation order matters: the regex engine takes
    # the first alternative that matches at a position, so ordinals must be tried
    # before the plain-number branch or "1st" would be captured as just "1".
    # The fractional part requires at least one digit after the dot so that a
    # sentence-ending period ("in 2023.") is not swallowed into the number.
    _NUMBER_RE = re.compile(
        r'\d+(?:st|nd|rd|th)\b'   # ordinals: 1st, 22nd, 3rd, 10th
        r'|\$\d+(?:\.\d+)?'       # dollar amounts: $5, $5.00
        r'|\d+(?:\.\d+)?%?'       # integers / decimals, optional percent sign
    )

    def __init__(self, token: str):
        """
        Initialize the analyzer with HuggingFace token and Llama model

        Args:
            token: HuggingFace API token
        """
        self.model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
        # Keep the token on the instance and export it to the process environment
        self.token = token
        os.environ["HUGGINGFACE_TOKEN"] = token
        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Disable tokenizer parallelism

        # Tokenizer and model are both fetched with token authentication
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            token=self.token  # `token` replaces the older `use_auth_token` argument
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            token=self.token,
            device_map="auto",  # Automatically handle GPU if available
            torch_dtype=torch.float16  # Use half precision to save memory
        )

        # Generation pipeline built from the already-loaded model and tokenizer
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device_map="auto",
            max_new_tokens=50
        )

    def extract_numbers(self, text: str) -> List[str]:
        """Extract all numbers and numeric expressions from text.

        Recognizes ordinals (``1st``), dollar amounts (``$5.00``), and plain
        integers/decimals with an optional trailing ``%``.

        Args:
            text: Text to scan. ``None`` or an empty string yields ``[]``.

        Returns:
            The matched substrings, in order of appearance.
        """
        if not text:
            # Missing or empty article text: nothing to scan.
            return []
        return self._NUMBER_RE.findall(text)

    @staticmethod
    def _clean_headline(generated: str) -> str:
        """Return the first non-empty line of ``generated``, stripped; ``""`` if none."""
        for line in generated.splitlines():
            candidate = line.strip()
            if candidate:
                return candidate
        return ""

    def generate_numeric_headline(self, article_text: str) -> str:
        """Generate a headline with emphasis on including numeric content.

        Sampling is enabled (``do_sample=True``, ``temperature=0.7``), so
        repeated calls on the same article may return different headlines.

        Args:
            article_text: The article body to summarize as a headline.

        Returns:
            The first non-empty line the model produced after the
            ``Headline:`` cue, stripped of surrounding whitespace.
        """
        prompt = f"""Generate a concise news headline from this article, including relevant numbers if present:

Article: {article_text}

Headline:"""
        outputs = self.generator(
            prompt,
            max_new_tokens=50,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            # Ask for the continuation only, instead of echoing the prompt and
            # splitting on "Headline:" (which breaks if the model repeats it).
            return_full_text=False,
        )
        generated = outputs[0]['generated_text']

        # Defensive: drop the prompt if the generator echoed it anyway.
        if generated.startswith(prompt):
            generated = generated[len(prompt):]

        # The model may keep writing past the headline; keep the first line only.
        return self._clean_headline(generated)

    def analyze_article(self, article: str) -> Dict:
        """Analyze a single article.

        Args:
            article: The article text.

        Returns:
            A dict with keys ``generated_headline``, ``contains_numbers``,
            ``extracted_numbers`` and ``original_text``.
        """
        headline = self.generate_numeric_headline(article)
        numbers = self.extract_numbers(article)

        return {
            "generated_headline": headline,
            "contains_numbers": bool(numbers),
            "extracted_numbers": numbers,
            "original_text": article
        }


# Your HuggingFace token.
# Prefer supplying it through the HF_TOKEN (or HUGGINGFACE_TOKEN) environment
# variable so the secret is never saved with the notebook; the "#TOKEN"
# placeholder is kept as the fallback and must be replaced if neither is set.
HUGGINGFACE_TOKEN = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGINGFACE_TOKEN")
    or "#TOKEN"
)


def main(
    data_path: str = "NumHG-main/Dataset/fold-1/test.csv",
    output_path: str = "results.csv",
    batch_size: int = 8,
) -> None:
    """Generate headlines for every article in a CSV and save the results.

    Reads ``data_path``, runs ``NumericHeadlineAnalyzer.analyze_article`` on
    each value of its ``text`` column, and writes the columns
    ``generated_headline``, ``contains_numbers`` and ``extracted_numbers`` to
    ``output_path`` as CSV. Any exception is caught, reported on stdout
    together with troubleshooting hints, and not re-raised.

    Args:
        data_path: CSV file with a ``text`` column holding the articles.
        output_path: Destination CSV for the generated results.
        batch_size: Number of rows handed to the processing function per
            ``Dataset.map`` call.
    """
    try:
        # Read and validate the input first: loading the model is by far the
        # slowest step, so a bad path or missing column should fail before it.
        data = pd.read_csv(data_path)
        if "text" not in data.columns:
            raise ValueError(f"Column 'text' not found in '{data_path}'")

        # Create analyzer with token
        analyzer = NumericHeadlineAnalyzer(token=HUGGINGFACE_TOKEN)

        # Create a Dataset object
        dataset = Dataset.from_pandas(data)

        # Batch processing function: reuse the analyzer's per-article entry
        # point and reshape its per-row dicts into the column lists map() expects.
        def process_articles(batch):
            results = [analyzer.analyze_article(article) for article in batch["text"]]
            return {
                "generated_headline": [r["generated_headline"] for r in results],
                "contains_numbers": [r["contains_numbers"] for r in results],
                "extracted_numbers": [r["extracted_numbers"] for r in results],
            }

        # Process the dataset in batches
        processed_dataset = dataset.map(
            process_articles, batched=True, batch_size=batch_size
        )

        # Convert the processed dataset to a DataFrame
        processed_df = processed_dataset.to_pandas()

        # Save the generated columns to a CSV file
        processed_df[["generated_headline", "contains_numbers", "extracted_numbers"]].to_csv(
            output_path, index=False
        )

        print(f"Headline generation complete. Results saved to '{output_path}'.")

    except Exception as e:
        # Top-level boundary: report the failure and the usual causes.
        print(f"An error occurred: {str(e)}")
        print("\nTroubleshooting steps:")
        print("1. Verify your HuggingFace token has access to meta-llama models.")
        print("2. Check your internet connection.")
        print("3. Ensure you have enough GPU memory (or try running on CPU).")
        print("4. Make sure all required packages are installed:")
        print("   pip install transformers torch accelerate datasets")

# Invoke main() only when this code runs as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/5549 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
!git clone https://github.com/AIPHES/emnlp19-moverscore.git
cd emnlp19-moverscore/
!python setup.py install

In [None]:
!python numhg_eval.py --tgt_path=target_path --pre_path=predict_result_path --num_gt_path=numeral_ground_truth_path --num_type_path=numeral_type_path