# LangChain Library

Since the task mentions LangChain, I will do the task with LangChain too. Again, I first try 1024 tokens and a short summary to see how it works.

In [2]:
from bs4 import BeautifulSoup
import requests
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain.schema.runnable import RunnableLambda

# Step 1: Scrape Text from a Website
def scrape_website(url, timeout=10):
    """Fetch a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        The stripped text of every paragraph, joined with single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of summarizing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # Only paragraph text is kept; everything outside <p> tags is ignored.
    return " ".join(p.get_text(strip=True) for p in soup.find_all("p"))

# Step 2: Setup Summarization Model
def get_summarization_chain():
    """Build a LangChain runnable that summarizes ``inputs["text"]``.

    Loads the ``facebook/bart-large-cnn`` summarization pipeline (PyTorch
    backend), wraps it in ``HuggingFacePipeline`` and exposes it through a
    ``RunnableLambda`` so it can be called with ``.invoke({"text": ...})``.

    Returns:
        A ``RunnableLambda`` that forwards the ``"text"`` entry of its input
        dict to the wrapped pipeline and returns the pipeline's output.
    """
    # Load the summarization pipeline from Hugging Face
    summarization_pipeline = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        framework="pt",
    )

    # Wrap the Hugging Face model inside a LangChain-compatible pipeline
    summarization_llm = HuggingFacePipeline(pipeline=summarization_pipeline)

    # The raw text is handed to the model unchanged. A prompt template used
    # to be built here but was never applied to the input, so it was removed
    # as dead code.
    return RunnableLambda(lambda inputs: summarization_llm.invoke(inputs["text"]))

# Step 3: Generate Summary
def summarize_website(url, max_input_length=1024):
    """Scrape *url* and return a summary of the start of its text.

    Args:
        url: Address of the page to summarize.
        max_input_length: Maximum number of **characters** of scraped text
            passed to the model. The text is cut with a plain string slice,
            so this is a character count, not a token count.

    Returns:
        Whatever the summarization chain returns for the truncated text
        (printed as the summary by the example below).
    """
    # Keep only the leading part of the page so the model input stays short.
    text = scrape_website(url)[:max_input_length]

    summarization_chain = get_summarization_chain()

    # The chain expects a dict with the text under the "text" key.
    return summarization_chain.invoke({"text": text})

# Example Usage
if __name__ == "__main__":
    # Demo run: summarize a single encyclopedia page and print the result.
    url = "https://www.wix.com/encyclopedia/definition/artificial-intelligence"
    summary = summarize_website(url=url)
    print(f"Summary:\n{summary}")

Device set to use mps:0


Summary:
Artificial intelligence is a branch of computer science that develops machine systems capable of demonstrating behaviors linked to human intelligence. AI programs use data collected from different interactions to improve the way they mimic humans in order to perform tasks such as learning, planning, knowledge representation, perception and problem-solving.


The summary is not great, but the model is working well, so I add a title, chunk the text, and make the summary longer.

### Revised model: 

In [6]:
from bs4 import BeautifulSoup
import requests
import textwrap
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain.schema.runnable import RunnableLambda

# Step 1: Scrape Text from a Website
def scrape_website(url, max_chars=4000, timeout=10):
    """Extract paragraph text from a webpage, truncated to a reasonable length.

    Args:
        url: Address of the page to download.
        max_chars: Maximum number of characters to return. ``None`` disables
            truncation.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        The stripped text of every ``<p>`` element joined with single
        spaces, cut to at most ``max_chars`` characters.

    Raises:
        RuntimeError: If the response status code is not 200.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch webpage. Status code: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")
    text = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
    # Slicing already copes with text shorter than max_chars, so no explicit
    # length check is needed.
    return text[:max_chars]

# Step 2: Chunking for Large Texts
def chunk_text(text, max_tokens=500):
    """Break *text* into word-wrapped pieces of limited width.

    NOTE: despite its name, ``max_tokens`` is used as the ``textwrap`` line
    width, i.e. the maximum number of characters per chunk, not a count of
    model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum width of each chunk, in characters.

    Returns:
        A list of chunks; empty when *text* contains nothing to wrap.
    """
    wrapper = textwrap.TextWrapper(width=max_tokens)
    return wrapper.wrap(text)

# Step 3: Setup LangChain Summarization Model
def get_summarization_chain():
    """Return a runnable mapping ``{"text": ...}`` to a BART summary.

    The ``facebook/bart-large-cnn`` summarization pipeline is loaded with
    the PyTorch backend and wrapped in ``HuggingFacePipeline``.
    """
    llm = HuggingFacePipeline(
        pipeline=pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            framework="pt",
        )
    )
    # RunnableLambda adapts the dict-shaped chain input to the single value
    # handed to the wrapped pipeline.
    return RunnableLambda(lambda payload: llm.invoke(payload["text"]))

# Step 4: Setup LangChain Title Generation Model (Fixing Issue)
def get_title_chain():
    """Return a runnable mapping ``{"text": prompt}`` to generated text.

    The ``google/flan-t5-base`` text2text-generation pipeline is loaded with
    the PyTorch backend and wrapped in ``HuggingFacePipeline``.
    """
    generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        framework="pt",
    )
    wrapped = HuggingFacePipeline(pipeline=generator)
    # Callers pass a dict; only its "text" entry is forwarded to the model.
    return RunnableLambda(lambda payload: wrapped.invoke(payload["text"]))

# Step 5: Generate Summary with LangChain
def summarize_large_text(text):
    """Summarize long text chunk by chunk, then summarize the combined result.

    Args:
        text: The text to summarize.

    Returns:
        The summary of the single chunk, or, when the text was split into
        several chunks, a summary of the concatenated chunk summaries.

    Raises:
        ValueError: If *text* is empty or yields no chunks (for example
            whitespace only), since there is nothing to summarize.
    """
    # Validate before loading the model, so empty input fails fast with a
    # clear message instead of an IndexError on an empty summary list.
    chunks = chunk_text(text) if text else []
    if not chunks:
        raise ValueError("No text to summarize.")

    summarization_chain = get_summarization_chain()
    summaries = [
        summarization_chain.invoke({"text": chunk}).strip() for chunk in chunks
    ]

    if len(summaries) == 1:
        return summaries[0]

    # If there are multiple summaries, summarize them again.
    # NOTE(review): the combined text is not length-checked before this
    # second pass; confirm it stays within the model's input limit.
    combined_text = " ".join(summaries)
    return summarization_chain.invoke({"text": combined_text}).strip()

# Step 6: Generate Title with Fixes
def generate_title(summary, max_words=10):
    """Create a short title for *summary* using the title chain.

    Args:
        summary: The summary text the title should describe.
        max_words: Maximum number of words kept in the title.

    Returns:
        The first line of the generated text, cut to at most ``max_words``
        words, with its first character uppercased.
    """
    title_chain = get_title_chain()

    title_prompt = f"Generate a **short, engaging, and clear title** for the following summary:\n{summary}\n\nTitle:"
    raw_title = title_chain.invoke({"text": title_prompt}).strip()

    # Keep only the first generated line.
    clean_title = raw_title.split("\n")[0]

    # Cut overly long titles down to the word limit.
    words = clean_title.split()
    if len(words) > max_words:
        clean_title = " ".join(words[:max_words])

    # Uppercase only the first character. str.capitalize() would also
    # lowercase the rest of the title and mangle acronyms such as "AI".
    return clean_title[:1].upper() + clean_title[1:]

# Step 7: Main Function
def summarize_and_title(url):
    """Scrape *url*, summarize its text, and generate a title for the summary.

    Returns:
        A ``(title, summary)`` tuple. If any step raises, the error is not
        propagated; the tuple ``("Error: <message>", "")`` is returned
        instead.
    """
    try:
        summary = summarize_large_text(scrape_website(url))
        result = (generate_title(summary), summary)
    except Exception as e:
        # Errors are reported in-band through the title slot.
        result = (f"Error: {e}", "")
    return result

# Example Usage
if __name__ == "__main__":
    # Demo run: print the generated title and summary for one page.
    url = "https://www.wix.com/encyclopedia/definition/artificial-intelligence"
    title, summary = summarize_and_title(url=url)
    print(f"Title: {title}\nSummary: {summary}")

Device set to use mps:0
Your max_length is set to 142, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 142, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 142, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 142, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('

Title: Artificial intelligence: a new way to improve our systems
Summary: Artificial intelligence is a branch of computer science that develops machine systems capable of demonstrating behaviors linked to human intelligence. The purpose of AI is to improve the systems we already use by automating tasks to make them more efficient. Technology is used for a wide range of applications, including inweb development, chatbots for customer service, product recommendations based on user’s habits, speech recognition.
