## 1. Define a function to get website content

In [12]:
import requests
from bs4 import BeautifulSoup

def get_website_content(url, timeout=30):
    """Download an article page and return the text of its main content.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    article body is taken from the last ``<div>`` carrying the
    ``elementor-widget-theme-post-content`` class; the text of every ``<p>``
    inside it is concatenated, each paragraph followed by a single space.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The concatenated paragraph text, or ``''`` when the page contains no
        matching content div.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the divs containing article content. A page without one is an
    # expected case and is reported as '' rather than hidden behind a bare
    # ``except`` that would also swallow KeyboardInterrupt and real bugs.
    content_divs = soup.find_all('div', class_='elementor-widget-theme-post-content')
    if not content_divs:
        return ''

    # The last matching div is the one used for the article body.
    paragraphs = content_divs[-1].find_all('p')
    return ''.join(paragraph.get_text() + ' ' for paragraph in paragraphs)


## 2. Extract Q&A from website to create a dataset

- Website: https://thelawdictionary.org/
- Dataset structure: id, question, answer, category, ref_link

In [None]:
import bs4 as bs
import urllib.request
import pandas as pd
import random

# Initialize dataset: one list per column, all kept at the same length
data = {'id': [], 'question': [], 'answer': [], 'category': [], 'ref_link': []}

# Initialize index and page urls
idx = 0
link = "https://thelawdictionary.org/article/page/{i}/"

# Extract information from each listing page (pages 1 to 30)
for i in range(1, 31):
    # Read html content; the context manager closes the HTTP response
    link_page = link.format(i=i)
    with urllib.request.urlopen(link_page) as page_response:
        html_source = page_response.read()

    soup = bs.BeautifulSoup(html_source, 'html.parser')

    # Extract question (title), answer (website content), category and reference link from each source
    articles = soup.find_all('article')
    for article in articles:
        title_tag = article.find('h2')
        link_tag = article.find('a')
        # Skip malformed entries instead of aborting the whole scrape
        if title_tag is None or link_tag is None or not link_tag.get('href'):
            continue

        question = title_tag.text.strip()
        url_answer = link_tag['href']
        # NOTE(review): assumes the last CSS class of <article> is
        # "category-<name>" -- confirm against the site markup
        category = article['class'][-1].replace('category-', '')

        # Get answer (website content)
        website_content = get_website_content(url_answer)

        # Add extracted information to the dataset
        if website_content != '':
            data['id'].append(idx)
            data['question'].append(question)
            data['answer'].append(website_content)
            data['category'].append(category)
            # Append (not assign): assigning replaced the whole column with a
            # single string, so every row ended up with the last article's URL
            data['ref_link'].append(url_answer)
            idx += 1

# Create DataFrame
df = pd.DataFrame(data)

# Save to csv file (the 'data' directory must already exist)
df.to_csv('data/raw_legal_qa_lawdictionary.csv', index=False)

## 3. Summarize website contents using the facebook/bart-large-cnn model from Hugging Face
This part should be run on a GPU.

### 3.1. Function to split long text into smaller chunks

In [1]:
def split_text_into_chunks(text, words_per_chunk=150):
    """Break ``text`` into consecutive pieces of at most ``words_per_chunk`` words.

    Words are whatever ``str.split()`` yields (runs of whitespace are the
    separators), and each piece is rebuilt by joining its words with a single
    space. The last piece may hold fewer words than the others.

    Args:
        text: The string to divide.
        words_per_chunk: Maximum number of words placed in one piece.

    Returns:
        A list of strings in original order; empty when ``text`` has no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), words_per_chunk):
        window = tokens[start:start + words_per_chunk]
        pieces.append(" ".join(window))
    return pieces

### 3.2. Function to summarize text using a pre-trained model

In [2]:
def summarize_text(text, summarizer):
    """Summarize a long text chunk by chunk and join the partial summaries.

    The text is split with ``split_text_into_chunks``, every chunk is passed
    to ``summarizer`` in a single batched call, and the resulting
    ``summary_text`` values are joined with single spaces.

    Args:
        text: The text to summarize. Non-string values (for example the NaN
            that pandas yields for an empty CSV cell) are treated as empty.
        summarizer: Callable that takes a list of strings plus the keyword
            arguments ``min_length``, ``max_length`` and ``truncation`` and
            returns one dict with a ``'summary_text'`` key per input string.

    Returns:
        The joined summaries, or ``""`` when there is nothing to summarize.
    """
    # Nothing to summarize: avoid sending an empty batch to the model and
    # avoid crashing on NaN / None values coming from a DataFrame column.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Split into smaller texts
    text_chunks = split_text_into_chunks(text)

    # Summarize each chunk and form a new paragraph
    summarized = summarizer(text_chunks, min_length=20, max_length=30, truncation=True)
    list_summarized_texts = [element['summary_text'] for element in summarized]

    return " ".join(list_summarized_texts)

### 3.3. Summarize website content

In [9]:
from transformers import pipeline

# Build the Hugging Face summarization pipeline with the BART CNN checkpoint.
# device=0 is passed through to the pipeline (this section is meant to run on a GPU).
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    device=0,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
import pandas as pd

# Read the scraped dataset back from disk
df = pd.read_csv('data/raw_legal_qa_lawdictionary.csv')

# Run the summarizer over every answer, keep the result in a new column,
# then write the frame back to the same CSV file
df['short_answer'] = df['answer'].map(
    lambda answer_text: summarize_text(answer_text, summarizer)
)
df.to_csv('data/raw_legal_qa_lawdictionary.csv', index=False)

Your max_length is set to 30, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 30, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 30, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 30, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_len