# Scraping de Questions et Réponses sur https://answers.justia.com

Ce script extrait les URL de chaque catégorie, scrape les Q&A (entre utilisateurs et avocats) des 4 premières pages, puis enregistre les données dans un fichier CSV.

In [10]:
import logging

import requests
from bs4 import BeautifulSoup

# Function to extract category URLs from Justia
def get_category_links_from_website(url, timeout=30):
    """Return the category hrefs found on the given page.

    Args:
        url: Address of the page that lists the categories.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The ``href`` values, exactly as written in the page, of every anchor
        inside the second ``<ul>`` whose class attribute matches the
        category-list classes. An empty list is returned (and a warning is
        logged) when the request fails or that list is not present.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        logging.warning('Could not fetch %s: %s', url, exc)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    column_lists = soup.find_all('ul', class_='list-columns list-no-styles list-columns-two list-columns-three')

    # The category links are taken from the second matching list
    if len(column_lists) < 2:
        logging.warning('No category list found at %s', url)
        return []

    return [anchor['href'] for anchor in column_lists[1].find_all('a', href=True)]

In [11]:
# Root address of the site; it is also the prefix for the hrefs scraped below
url = 'https://answers.justia.com'

# Scrape the category hrefs and prepend the site root to each of them
category_links = [f'{url}{path}' for path in get_category_links_from_website(url)]

len(category_links) # 63

63

In [12]:
# Function to extract question URLs from each category
def get_questions_from_website(url, timeout=30):
    """Return the hrefs of the questions listed on one category page.

    Args:
        url: Address of the category page (including any ``?page=`` query).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The ``href`` values, exactly as written in the page, of the link
        inside each question heading. Entries without a heading or a link
        are skipped. An empty list is returned (and a warning is logged)
        when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        logging.warning('Could not fetch %s: %s', url, exc)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    wrappers = soup.find_all('div', class_='question-wrapper has-padding-content-block-30 has-negative-sides-30 clearfix')

    links = []
    for wrapper in wrappers:
        heading = wrapper.find('strong', class_='heading-3 has-no-top-margin -question text-soft-wrap')
        anchor = heading.find('a', href=True) if heading is not None else None
        # Skip a malformed entry instead of throwing away the whole page
        if anchor is not None:
            links.append(anchor['href'])
    return links

In [14]:
# Apply the function to extract Q&A of the first 4 pages
# Gather the question URLs from result pages 1 to 4 of every category
question_links = []

for cat_link in category_links:
    for page in range(1, 5):
        page_link = f'{cat_link}?page={page}'
        # Scraped hrefs are prefixed with the site root to form full URLs
        question_links.extend(url + href for href in get_questions_from_website(page_link))

# Remove duplicate links. dict.fromkeys keeps the first-seen order, so the
# list (and the CSV built from it) comes out in the same order on every run;
# a set would order the strings differently from one run to the next.
question_links = list(dict.fromkeys(question_links))

len(question_links) # 3806

3806

In [15]:
# Function to extract question and lawyer's answer from each question page
def get_question_answer_from_website(url, timeout=30):
    """Return the question and the first answer found on a question page.

    Args:
        url: Address of the question page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        A ``(question, answer)`` tuple of strings. The question is the page's
        ``<h1>`` text followed by ``". "`` and the text of the paragraphs in
        the question-details block; the answer is the text of the paragraphs
        in the first answer block. Runs of whitespace (including newlines and
        tabs) are collapsed to a single space. ``('', '')`` is returned when
        the request fails or any of those elements is missing.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        logging.warning('Could not fetch %s: %s', url, exc)
        return '', ''

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', class_='primary-content-wrapper')
    if content is None:
        return '', ''

    title = content.find('h1')
    details = content.find('div', class_='question-details to-xlarge-font text-soft-wrap')
    # find() returns the first match, so only the first answer is used
    first_answer = content.find('div', class_='answer-detailed-text to-xlarge-font text-soft-wrap')
    if title is None or details is None or first_answer is None:
        return '', ''

    header = _collapse_whitespace(title.get_text())
    body = _collapse_whitespace(' '.join(p.get_text() for p in details.find_all('p')))
    answer = _collapse_whitespace(' '.join(p.get_text() for p in first_answer.find_all('p')))

    return f'{header}. {body}', answer


def _collapse_whitespace(text):
    """Replace every run of whitespace in *text* with one space and trim the ends.

    Deleting newlines and tabs outright would glue together words that were
    separated only by a line break.
    """
    return ' '.join(text.split())


In [17]:
from pathlib import Path

import pandas as pd

# Initialize the dataset: one list per CSV column
data = {'question': [], 'answer': []}

# Get the question and answer from every question link
for q_link in question_links:
    question, answer = get_question_answer_from_website(q_link)
    data['question'].append(question)
    data['answer'].append(answer)

# Create a DataFrame
df = pd.DataFrame(data)

# Turn empty strings into missing values and drop every row that has one
df = df.replace('', pd.NA).dropna()

# Create the output folder first: without it to_csv fails, and that would
# only happen after all the pages have already been scraped
output_path = Path('data/legal_qa_justia.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save to csv file
df.to_csv(output_path, index=False)