In [1]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def get_category_links_from_website(url, timeout=30):
    """Collect the category hrefs listed on a page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    links are read from the second ``<ul>`` matched by the multi-column list
    class string used below.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, ``requests`` may wait on an unresponsive server indefinitely.

    Returns:
        A list of ``href`` values exactly as they appear in the page (they are
        not joined with ``url``). An empty list is returned when the page has
        fewer than two matching ``<ul>`` elements.

    Raises:
        Whatever ``requests.get`` raises when the page cannot be fetched
        (including a timeout); network failures are not swallowed here.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the ul containing category links
    column_lists = soup.find_all('ul', class_='list-columns list-no-styles list-columns-two list-columns-three')

    # The category list is the second match; if the layout is not what we
    # expect, report "no categories" instead of hiding every possible error
    # behind a bare except.
    if len(column_lists) < 2:
        return []

    target_ul = column_lists[1]
    return [a['href'] for a in target_ul.find_all('a', href=True)]

In [2]:
# Site root that every scraped href is resolved against.
url = 'https://answers.justia.com'

# urljoin gives the same result as plain concatenation for root-relative
# hrefs ("/questions/...") but also handles absolute hrefs and hrefs without
# a leading slash, which `url + link` would turn into broken URLs.
category_links = [urljoin(url, link) for link in get_category_links_from_website(url)]
len(category_links)

63

In [3]:
def get_questions_from_website(url, timeout=30):
    """Collect the question hrefs listed on a page.

    Each question is expected inside a ``<div>`` matched by the
    question-wrapper class string below, which contains a ``<strong>``
    heading that in turn wraps the ``<a href=...>`` pointing at the question.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, ``requests`` may wait on an unresponsive server indefinitely.

    Returns:
        A list of ``href`` values exactly as they appear in the page, in page
        order. Question blocks without the expected heading or anchor are
        skipped; the list is empty when nothing matches.

    Raises:
        Whatever ``requests.get`` raises when the page cannot be fetched
        (including a timeout).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    target_divs = soup.find_all('div', class_='question-wrapper has-padding-content-block-30 has-negative-sides-30 clearfix')
    for div in target_divs:
        heading = div.find('strong', class_='heading-3 has-no-top-margin -question text-soft-wrap')
        if heading is None:
            # Malformed block: skip it rather than discarding the whole page.
            continue

        anchor = heading.find('a', href=True)
        if anchor is None:
            continue

        links.append(anchor['href'])

    return links

In [4]:
question_links = []
for cat_link in category_links:
    questions = get_questions_from_website(cat_link)
    # urljoin resolves both root-relative and absolute hrefs against the site root.
    question_links += [urljoin(url, question) for question in questions]

# Drop duplicates while keeping first-seen order. A set would also dedupe, but
# its iteration order for strings changes between kernel sessions, which makes
# the scrape order (and the resulting CSV row order) irreproducible.
question_links = list(dict.fromkeys(question_links))
len(question_links)

932

In [6]:
def _collapse_whitespace(text):
    """Replace every run of whitespace (newlines, tabs, spaces) with one space."""
    return ' '.join(text.split())


def get_question_answer_from_website(url, timeout=30):
    """Extract the question text and the answer text from a question page.

    The question is built as ``"<h1 text>. <question paragraphs>"`` and the
    answer is the joined text of the ``<p>`` elements in the first
    answer-detailed-text ``<div>``. Both are looked up inside the
    ``primary-content-wrapper`` ``<div>``.

    Newlines and tabs are collapsed into single spaces, so words that were
    separated only by a line break are not glued together.

    Args:
        url: Address of the question page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, ``requests`` may wait on an unresponsive server indefinitely.

    Returns:
        A ``(question, answer)`` tuple of strings. ``('', '')`` is returned
        when the wrapper, the heading, the question details or the answer
        block is missing from the page.

    Raises:
        Whatever ``requests.get`` raises when the page cannot be fetched
        (including a timeout).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    target_div = soup.find('div', class_='primary-content-wrapper')
    if target_div is None:
        return '', ''

    header_tag = target_div.find('h1')
    question_div = target_div.find('div', class_='question-details to-xlarge-font text-soft-wrap')
    answer_div = target_div.find('div', class_='answer-detailed-text to-xlarge-font text-soft-wrap')

    # All-or-nothing: a page missing any of the three parts yields an empty
    # pair, which the caller can filter out.
    if header_tag is None or question_div is None or answer_div is None:
        return '', ''

    q_header = _collapse_whitespace(header_tag.get_text())
    q_content = _collapse_whitespace(
        ' '.join(paragraph.get_text() for paragraph in question_div.find_all('p'))
    )
    answer = _collapse_whitespace(
        ' '.join(paragraph.get_text() for paragraph in answer_div.find_all('p'))
    )

    question = f"{q_header}. {q_content}"
    return question, answer


In [8]:
import pandas as pd

# Destination of the scraped dataset.
output_path = 'data/legal_qa_justia.csv'

# Create the output folder BEFORE the slow scrape: to_csv does not create
# missing directories, so without this a fresh checkout would only fail after
# every page has already been downloaded.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

data = {'question': [], 'answer': []}

# One request per question page; pages that could not be parsed come back as
# empty strings and are filtered out below.
for q_link in question_links:
    question, answer = get_question_answer_from_website(q_link)
    data['question'].append(question)
    data['answer'].append(answer)

# Create DataFrame
df = pd.DataFrame(data)

# Remove empty values (rows where either the question or the answer is '')
df = df.replace('', pd.NA).dropna()

# Save to csv file
df.to_csv(output_path, index=False)