In [10]:
import requests
from bs4 import BeautifulSoup
import csv

def _text_excluding(node, excluded_tag):
    """Return the text under *node*, skipping every `excluded_tag` subtree."""
    parts = []
    for child in node.children:
        if child.name == excluded_tag:
            continue
        if child.name is None:
            # Text node: `.string` is the text itself.
            parts.append(child.string)
        else:
            # Recurse instead of reading `.string`, which is None for tags
            # that have zero or several children (e.g. <br>, nested spans)
            # and would break string concatenation.
            parts.append(_text_excluding(child, excluded_tag))
    return ''.join(parts)


def scrape_url(url):
    """
    Scrape the given HiNative question URL and extract original and corrected phrases.

    Every second paragraph (starting at index 1) of each
    'text_correction_answer' div is inspected; paragraphs that contain a
    <del> or <ins> tag yield one row [url, original, corrected], where the
    original is the paragraph text without <ins> content and the corrected
    phrase is the paragraph text without <del> content. Rows are appended to
    'learner_corpus.csv' (UTF-8).

    Failures are reported with print() and the URL is skipped, so one bad
    page does not abort a batch run.

    Args:
    url (str): The HiNative question URL to scrape.

    Returns:
    None
    """
    # A timeout keeps a stalled server from hanging the whole run.
    try:
        page = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for URL: {url} ({exc})")
        return

    soup = BeautifulSoup(page.content, 'html.parser')

    # A page without a <title> is treated as not being a usable question page.
    if soup.find('title') is None:
        print(f"Title tag not found for URL: {url}")
        return

    rows = []
    for answer in soup.find_all('div', class_='text_correction_answer'):
        paragraphs = answer.find_all('p')

        # Only odd-indexed paragraphs are examined.
        # NOTE(review): presumably these hold the marked-up correction while
        # the even-indexed ones hold the plain sentence - confirm against the
        # page markup.
        for paragraph in paragraphs[1::2]:
            if not (paragraph.find('del') or paragraph.find('ins')):
                continue

            original_phrase = _text_excluding(paragraph, 'ins')
            corrected_phrase = _text_excluding(paragraph, 'del')
            rows.append([url, original_phrase.strip(), corrected_phrase.strip()])

    # Open the file once per page, and only when there is something to write,
    # so a page without corrections does not touch the CSV.
    if rows:
        # Explicit UTF-8: the corpus text may contain non-ASCII characters
        # that the platform's default encoding cannot represent.
        with open('learner_corpus.csv', 'a', newline='', encoding='utf-8') as file:
            csv.writer(file).writerows(rows)

# Read the URLs from the 'urls.txt' file, one per line.
# Surrounding whitespace is stripped and blank lines are dropped, because an
# empty string is not a valid URL and would make the HTTP request raise.
with open('urls.txt', 'r') as urls_file:
    urls = [line.strip() for line in urls_file if line.strip()]

# Start a fresh CSV file containing only the header row; scrape_url is then
# called once per URL below.
with open('learner_corpus.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Original', 'Corrected'])

# Scrape each URL
for url in urls:
    scrape_url(url)


Title tag not found for URL: https://hinative.com/questions/23867879
Title tag not found for URL: https://hinative.com/questions/23867848
Title tag not found for URL: https://hinative.com/questions/23867825
Title tag not found for URL: https://hinative.com/questions/23867821
Title tag not found for URL: https://hinative.com/questions/23879370
