# LlaMa summarisation

Since the task explicitly mentions LlaMa, here is a solution with Llama. The chosen model is the smallest in the 3.2 group to be able to run on my laptop. 

#### Without LangChain: 

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import requests
from bs4 import BeautifulSoup

# Step 1: Scrape Text from a Website
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
    text = ' '.join(paragraphs)
    return text

# Step 2: Generate Summary and Title using LLaMA 3.2-1B
def summarize_and_title(url):
    text = scrape_website(url)

    # Truncate and clean input text
    max_input_length = 1024  # Adjust for token limit
    text = text[:max_input_length]

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
    llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # Generate summary
    summary_prompt = (
        "Summarize the following text in clear, concise sentences (max 150 words). Focus on the main points:\n\n"
        f"{text}\n\n"
        "Summary:"
    )
    summary_output = llama_pipeline(
        summary_prompt,
        max_new_tokens=200,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    raw_summary = summary_output[0]["generated_text"]
    summary = raw_summary.split("Summary:")[-1].strip().split("\n")[0]  # Extract the first line of the summary

    # Generate title
    title_prompt = (
        f"Write a short, engaging title based on the following summary:\n\n"
        f"{summary}\n\n"
        "Title:"
    )
    title_output = llama_pipeline(
        title_prompt,
        max_new_tokens=20,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    raw_title = title_output[0]["generated_text"]
    title = raw_title.split("Title:")[-1].strip().split("\n")[0]  # Extract the first line of the title

    return summary, title

# Example Usage
if __name__ == "__main__":
    url = "https://www.wix.com/encyclopedia/definition/artificial-intelligence"
    summary, title = summarize_and_title(url)
    print(f"Title: {title}\nSummary: {summary}")

Device set to use mps:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Title: How to build a chatbot for a website
Summary: AI is a branch of computer science that develops machine systems capable of demonstrating behaviors linked to human intelligence. AI programs use data collected from different interactions to improve the way they mimic humans in order to perform tasks such as learning, planning, knowledge representation, perception and problem-solving. Artificial intelligence technology is used for a wide range of applications, including in web development, such as automated chatbots for customer service, product recommendations based on a user’s habits, speech recognition, and even to build a website from scratch.


#### With LangChain: 

In [5]:
from bs4 import BeautifulSoup
import requests
import textwrap
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_huggingface import HuggingFacePipeline

# Step 1: Scrape Text from a Website
def scrape_website(url, max_chars=4000):
    """Extracts paragraph text from a webpage and truncates it to a reasonable length."""
    response = requests.get(url)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch webpage. Status code: {response.status_code}")
    
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
    text = ' '.join(paragraphs)
    return text[:max_chars] if len(text) > max_chars else text

# Step 2: Chunking for Large Texts
def chunk_text(text, max_tokens=500):
    """Splits long text into smaller chunks while maintaining coherence."""
    return textwrap.wrap(text, width=max_tokens)

# Step 3: Setup LangChain Summarization Model
def get_summarization_chain():
    """Creates a LangChain summarization chain using Hugging Face pipeline."""
    summarization_pipeline = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        framework="pt"
    )
    summarization_llm = HuggingFacePipeline(pipeline=summarization_pipeline)

    summarization_prompt = PromptTemplate(
        input_variables=["text"],
        template="Summarize the following text:\n\n{text}\n\nSummary:"
    )

    return LLMChain(llm=summarization_llm, prompt=summarization_prompt)

# Step 4: Setup LangChain Title Generation Model
def get_title_chain():
    """Creates a LangChain title generation chain using Hugging Face pipeline."""
    title_pipeline = pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        framework="pt"
    )
    title_llm = HuggingFacePipeline(pipeline=title_pipeline)

    title_prompt = PromptTemplate(
        input_variables=["summary"],
        template="Generate a short, engaging, and clear title for the following summary:\n\n{summary}\n\nTitle:"
    )

    return LLMChain(llm=title_llm, prompt=title_prompt)

# Step 5: Generate Summary with LangChain
def summarize_large_text(text):
    """Summarizes long text in chunks and then summarizes the combined result."""
    summarization_chain = get_summarization_chain()
    chunks = chunk_text(text)
    
    summaries = [summarization_chain.run({"text": chunk}).strip() for chunk in chunks]

    # If multiple summaries, summarize them again
    if len(summaries) > 1:
        combined_text = " ".join(summaries)
        final_summary = summarization_chain.run({"text": combined_text}).strip()
    else:
        final_summary = summaries[0]

    return final_summary

# Step 6: Generate Title
def generate_title(summary):
    """Creates a short, engaging title using LangChain."""
    title_chain = get_title_chain()
    raw_title = title_chain.run({"summary": summary}).strip()

    # Clean title: limit to 10 words max
    words = raw_title.split()
    clean_title = " ".join(words[:10]) if len(words) > 10 else raw_title
    return clean_title.capitalize()

# Step 7: Main Function
def summarize_and_title(url):
    """Extracts text from a URL, summarizes it using LangChain, and generates a title."""
    try:
        text = scrape_website(url)
        summary = summarize_large_text(text)
        title = generate_title(summary)

        return title, summary
    except Exception as e:
        return f"Error: {e}", ""

# Example Usage
if __name__ == "__main__":
    url = "https://www.wix.com/encyclopedia/definition/artificial-intelligence"
    title, summary = summarize_and_title(url)
    print(f"Title: {title}\nSummary: {summary}")

Device set to use mps:0
  return LLMChain(llm=summarization_llm, prompt=summarization_prompt)
  summaries = [summarization_chain.run({"text": chunk}).strip() for chunk in chunks]
Your max_length is set to 142, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 142, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 142, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 142, but your input_length is only 114. Since

Title: Ai: a new tool for web development
Summary: Artificial intelligence is a branch of computer science that develops machine systems capable of demonstrating behaviors linked to human intelligence. AI technology is used for a wide range of applications, including inweb development. It can be used to create chatbots for customer service, product recommendations based on user’s habits, speech recognition, and even tobuild a website from scratch.
