<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 3.1 - eyamrog

The aim of this phase is to develop solutions to scrape text from each journal's article HTML page.

## Required Python packages

- beautifulsoup4
- lxml
- pandas
- requests
- selenium
- tqdm

## Import the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import sys
import time
import logging
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Define input variables

In [2]:
# Directory the per-journal .jsonl input files are read from
input_directory = 'cl_st2_ph2_eyamrog'
# Directory that receives this notebook's outputs: the log file and one subdirectory per journal
output_directory = 'cl_st2_ph31_eyamrog'

## Create output directory

In [None]:
# Make sure the top-level output directory is in place before anything is written to it.
if not os.path.exists(output_directory):
    try:
        os.makedirs(output_directory)
        print('Output directory successfully created.')
    except OSError as error:
        # Nothing else in the notebook can work without this directory, so stop here.
        print('Failed to create the directory:', error)
        sys.exit(1)
else:
    print('Output directory already exists.')

### Define a helper to create output subdirectories

In [3]:
def create_directory(path):
    """Create the directory ``path`` (including missing parents) unless it already exists.

    A one-line status message is printed in every case.

    Args:
        path: Directory path to create.

    Raises:
        SystemExit: With exit code 1 when the directory cannot be created. This
            includes the case where ``path`` exists but is not a directory.
    """
    # Only a real directory counts as "already exists"; os.path.exists() would also
    # accept a regular file with that name and hide the problem until the first write.
    if os.path.isdir(path):
        print(f"Directory already exists: {path}")
        return

    try:
        # exist_ok=True tolerates the directory appearing between the check above and this call.
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        print(f"Failed to create the {path} directory: {e}")
        sys.exit(1)
    print(f"Successfully created the directory: {path}")

## Set up logging

In [4]:
# The log file sits inside the output directory and carries the same base name.
# The output directory therefore has to exist before logging is configured with this path.
log_filename = f"{output_directory}/{output_directory}.log"

In [5]:
# Send INFO-and-above records from the root logger to the log file, one timestamped line each.
# basicConfig() does nothing when the root logger already has handlers, so re-running this
# cell in the same kernel does not change the configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_filename
)

## Health Sciences

### [Nature Medicine](https://www.nature.com/nm/)

#### Create output subdirectory

In [None]:
# 'Nature Medicine' -> short journal code, used as the name of its output subdirectory.
# NOTE(review): `id` shadows the Python built-in of the same name from this cell onwards.
id = 'natm'
# `path` is a notebook-level variable; it is passed to extract_text() further down.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load the Nature Medicine open-access records; lines=True reads the file as JSON Lines (one JSON object per line).
df_nature_medicine_open_access = pd.read_json(f"{input_directory}/nature_medicine_open_access.jsonl", lines=True)

In [None]:
# Interpret the 'Published' values as epoch milliseconds and convert them to pandas datetimes.
df_nature_medicine_open_access['Published'] = pd.to_datetime(df_nature_medicine_open_access['Published'], unit='ms')

In [None]:
# Keep a small sample while the scraper is developed. .loc[:4] slices by label and is inclusive,
# so it keeps the rows whose index labels run up to and including 4.
# NOTE(review): presumably five rows with the default integer index - confirm.
df_nature_medicine_open_access = df_nature_medicine_open_access.loc[:4]

In [None]:
# Collect the sampled article URLs; the bare name on the last line displays the list as the cell output.
natm_urls = df_nature_medicine_open_access['URL'].tolist()
natm_urls

In [None]:
def extract_text(df, path):
    """Convert saved Nature Medicine article pages into plain-text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and
    ``<path>/<ID>.txt`` is written with, in order: the title, the abstract, the
    sections of the main content, and the 'Data availability' and
    'Acknowledgements' sections. ``<sup>`` elements are removed from each
    paragraph before its text is read. IDs without an HTML file are logged as
    errors and skipped.

    Args:
        df: Object whose ``'ID'`` entry iterates over article identifiers
            (a pandas DataFrame in this notebook).
        path: Directory holding the ``.html`` files; the ``.txt`` files are
            written to the same directory.

    Returns:
        None.
    """

    def squash(node):
        # Text of the node with every run of whitespace collapsed to one space.
        return ' '.join(node.get_text(' ', strip=True).split())

    def paragraph_lines(container):
        # One output line per direct <p> child, with <sup> elements stripped first.
        lines = []
        for para in container.find_all('p', recursive=False):
            for marker in para.find_all('sup'):
                marker.decompose()
            lines.append(f"{squash(para)}\n")
        return lines

    def section_chunks(parent):
        # Heading plus paragraphs for each direct 'c-article-section' child of parent.
        chunks = []
        for block in parent.find_all('div', class_='c-article-section', recursive=False):
            heading = block.find('h2')
            if heading is not None:
                chunks.append(f"\nSection: {squash(heading)}\n\n")
            body = block.find('div', class_='c-article-section__content')
            if body is not None:
                chunks.extend(paragraph_lines(body))
        return chunks

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Nothing to convert when the page was never saved
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        with open(html_file, 'r', encoding='utf-8') as source:
            soup = BeautifulSoup(source, 'lxml')

        # Output fragments, joined once at the end
        pieces = []

        # Title
        heading = soup.find('h1', class_='c-article-title')
        if heading is not None:
            pieces.append(f"Title: {squash(heading)}\n\n")

        # Abstract
        abstract = soup.find('div', id='Abs1-section')
        if abstract is not None:
            abstract_heading = abstract.find('h2', class_='c-article-section__title')
            if abstract_heading is not None:
                pieces.append(f"\nAbstract: {squash(abstract_heading)}\n\n")
            abstract_body = abstract.find('div', class_='c-article-section__content')
            if abstract_body is not None:
                pieces.extend(paragraph_lines(abstract_body))

        # Main content: sections sit one level below each direct <section> child
        main = soup.find('div', class_='main-content')
        if main is not None:
            for wrapper in main.find_all('section', recursive=False):
                pieces.extend(section_chunks(wrapper))

        # Trailing sections kept from the 'u-mt-32' container, in this fixed order
        trailer = soup.find('div', class_='u-mt-32')
        if trailer is not None:
            for data_title in ('Data availability', 'Acknowledgements'):
                wrapper = trailer.find('section', attrs={'data-title': data_title})
                if wrapper is not None:
                    pieces.extend(section_chunks(wrapper))

        # Write with '\n' line endings regardless of platform
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as target:
            target.write(''.join(pieces))

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [None]:
# Run the extract_text definition from the cell above (the name is redefined for every journal).
# `path` holds whatever the most recently executed "Create output subdirectory" cell assigned.
extract_text(df_nature_medicine_open_access, path)

### [Annual Review of Public Health](https://www.annualreviews.org/content/journals/publhealth)

#### Create output subdirectory

In [17]:
# 'Annual Review of Public Health' -> short journal code, used as the name of its output subdirectory.
# NOTE(review): `id` shadows the Python built-in of the same name from this cell onwards.
id = 'arph'
# `path` is a notebook-level variable; it is passed to extract_text() further down.
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\arph


#### Import the data into a DataFrame

In [18]:
# Load the Annual Review of Public Health records; lines=True reads the file as JSON Lines (one JSON object per line).
df_ar_public_health = pd.read_json(f"{input_directory}/ar_public_health.jsonl", lines=True)

In [19]:
# Interpret the 'Published' values as epoch milliseconds and convert them to pandas datetimes.
df_ar_public_health['Published'] = pd.to_datetime(df_ar_public_health['Published'], unit='ms')

In [20]:
# Keep a small sample while the scraper is developed. .loc[:4] slices by label and is inclusive,
# so it keeps the rows whose index labels run up to and including 4.
df_ar_public_health = df_ar_public_health.loc[:4]

In [21]:
# Collect the sampled article URLs; the bare name on the last line displays the list as the cell output.
arph_urls = df_ar_public_health['URL'].tolist()
arph_urls

['https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-121019-053834',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-051920-114020',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-012420-105104',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-051920-110928',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-060220-042648']

In [22]:
def extract_text(df, path):
    """Convert saved Annual Review of Public Health article pages into plain-text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and
    ``<path>/<ID>.txt`` is written with the title followed by each top-level
    ``div.articleSection``: its heading, then every ``<p>`` inside it
    (subsections included) exactly once, in document order. Subsection headings
    are not written. IDs without an HTML file are logged as errors and skipped.

    Args:
        df: Object whose ``'ID'`` entry iterates over article identifiers
            (a pandas DataFrame in this notebook).
        path: Directory holding the ``.html`` files; the ``.txt`` files are
            written to the same directory.

    Returns:
        None.
    """

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Initialise text variable
        text = ''

        # Extract the 'Title'
        title_tag = soup.find('span', class_='article-title')
        if title_tag:
            title = ' '.join(title_tag.get_text(' ', strip=True).split())
            text += f"Title: {title}\n\n"

        # Extract 'article sections'
        for section in soup.find_all('div', class_='articleSection'):
            # find_all() also returns nested 'div.articleSection' elements; skip them here,
            # because their paragraphs are collected through their top-level ancestor.
            if section.find_parent('div', class_='articleSection'):
                continue

            # Extract section title
            section_title_tag = section.find('div', class_='tl-main-part title')
            if section_title_tag:
                section_title = ' '.join(section_title_tag.get_text(' ', strip=True).split())
                text += f"\nSection: {section_title}\n\n"

            # find_all('p') searches the whole subtree, so this single pass already covers
            # the paragraphs of every subsection. A second pass over the child <div>
            # elements would write those paragraphs to the corpus a second time.
            for paragraph in section.find_all('p'):
                paragraph_text = ' '.join(paragraph.get_text(' ', strip=True).split())
                text += f"{paragraph_text}\n"

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(text)

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [23]:
# Run the extract_text definition from the cell above (the name is redefined for every journal).
# `path` holds whatever the most recently executed "Create output subdirectory" cell assigned.
extract_text(df_ar_public_health, path)

### [Lancet Public Health](https://www.thelancet.com/journals/lanpub/home)

#### Create output subdirectory

In [None]:
# 'Lancet Public Health' -> short journal code, used as the name of its output subdirectory.
# NOTE(review): `id` shadows the Python built-in of the same name from this cell onwards.
id = 'laph'
# `path` is a notebook-level variable; it is passed to extract_text() further down.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load the Lancet Public Health open-access records; lines=True reads the file as JSON Lines (one JSON object per line).
df_lancet_public_health_open_access = pd.read_json(f"{input_directory}/lancet_public_health_open_access.jsonl", lines=True)

In [None]:
# Interpret the 'Published' values as epoch milliseconds and convert them to pandas datetimes.
df_lancet_public_health_open_access['Published'] = pd.to_datetime(df_lancet_public_health_open_access['Published'], unit='ms')

In [None]:
# Keep a small sample while the scraper is developed. .loc[:4] slices by label and is inclusive,
# so it keeps the rows whose index labels run up to and including 4.
df_lancet_public_health_open_access = df_lancet_public_health_open_access.loc[:4]

In [None]:
# Collect the sampled article URLs; the bare name on the last line displays the list as the cell output.
laph_urls = df_lancet_public_health_open_access['URL'].tolist()
laph_urls

In [None]:
def extract_text(df, path):
    """Convert saved Lancet Public Health article pages into plain-text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and
    ``<path>/<ID>.txt`` is written with, in order: the title, the abstract
    (heading, sub-headings and paragraphs) and the article body. Body text is
    grouped under one ``Section:`` heading per top-level section; paragraphs of
    nested subsections follow in document order, each exactly once and without
    their own headings. Reference-citation spans are removed before the text is
    read. IDs without an HTML file are logged as errors and skipped.

    Args:
        df: Object whose ``'ID'`` entry iterates over article identifiers
            (a pandas DataFrame in this notebook).
        path: Directory holding the ``.html`` files; the ``.txt`` files are
            written to the same directory.

    Returns:
        None.
    """

    def clean_text(tag):
        # Text of the tag with every run of whitespace collapsed to one space.
        return ' '.join(tag.get_text(' ', strip=True).split())

    def direct_paragraphs(container, drop_nested=False):
        # Text of the paragraph <div>s that are direct children of container.
        collected = ''
        for paragraph in container.find_all('div', role='paragraph', recursive=False):
            if drop_nested:
                # Remove nested paragraphs to drop the paragraphs in the 'Research in context' box
                for nested_paragraph in paragraph.find_all('div', role='paragraph'):
                    nested_paragraph.decompose()
            # Remove reference citations embedded in <span> tags
            for ref_tag in paragraph.find_all('span', class_='dropBlock reference-citations'):
                ref_tag.decompose()
            collected += f"{clean_text(paragraph)}\n"
        return collected

    def subsection_paragraphs(section):
        # Paragraphs of every subsection below section, depth first, each visited once.
        collected = ''
        for candidate in section.find_all('section'):
            # find_all() returns descendants at every depth. Handle only those whose nearest
            # <section> ancestor is this one; deeper ones are reached by the recursive call,
            # so no paragraph is written more than once.
            if candidate.find_parent('section') is not section:
                continue
            # Subsection headings (h3/h4/h5) are intentionally not written
            collected += direct_paragraphs(candidate)
            collected += subsection_paragraphs(candidate)
        return collected

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Initialise text variable
        text = ''

        # Extract the 'Title'
        title_tag = soup.find('h1', property='name')
        if title_tag:
            text += f"Title: {clean_text(title_tag)}\n\n"

        # Extract the 'Abstract'
        abstract_section = soup.find('section', property='abstract')
        if abstract_section:
            abstract_tag = abstract_section.find('h2', property='name')
            if abstract_tag:
                text += f"\nAbstract: {clean_text(abstract_tag)}\n\n"

            for section_h3 in abstract_section.find_all('section', recursive=False):
                section_h3_title_tag = section_h3.find('h3')
                if section_h3_title_tag:
                    text += f"\nSection: {clean_text(section_h3_title_tag)}\n\n"

                # Extract paragraphs within each abstract section
                text += direct_paragraphs(section_h3)

        # Extract the 'article body'
        body_section = soup.find('section', property='articleBody')
        if body_section:
            body_section_core_container = body_section.find('div', class_='core-container')
            if body_section_core_container:
                # Extract sectioned content
                for section_h2 in body_section_core_container.find_all('section', recursive=False):
                    # Extract section title (h2)
                    section_h2_title_tag = section_h2.find('h2')
                    if section_h2_title_tag:
                        text += f"\nSection: {clean_text(section_h2_title_tag)}\n\n"

                    # Paragraphs directly under the h2 section, then all subsections in order
                    text += direct_paragraphs(section_h2, drop_nested=True)
                    text += subsection_paragraphs(section_h2)

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(text)

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [None]:
# Run the extract_text definition from the cell above (the name is redefined for every journal).
# `path` holds whatever the most recently executed "Create output subdirectory" cell assigned.
extract_text(df_lancet_public_health_open_access, path)

### [New England Journal of Medicine](https://www.nejm.org/)

#### Create output subdirectory

In [None]:
# 'New England Journal of Medicine' -> short journal code, used as the name of its output subdirectory.
# NOTE(review): `id` shadows the Python built-in of the same name from this cell onwards.
id = 'nejm'
# `path` is a notebook-level variable; it is passed to extract_text() further down.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load the New England Journal of Medicine open-access records; lines=True reads the file as JSON Lines (one JSON object per line).
df_new_england_journal_of_medicine_open_access = pd.read_json(f"{input_directory}/new_england_journal_of_medicine_open_access.jsonl", lines=True)

In [None]:
# Interpret the 'Published' values as epoch milliseconds and convert them to pandas datetimes.
df_new_england_journal_of_medicine_open_access['Published'] = pd.to_datetime(df_new_england_journal_of_medicine_open_access['Published'], unit='ms')

In [None]:
# Keep a small sample while the scraper is developed. .loc[:4] slices by label and is inclusive,
# so it keeps the rows whose index labels run up to and including 4.
df_new_england_journal_of_medicine_open_access = df_new_england_journal_of_medicine_open_access.loc[:4]

In [None]:
# Collect the sampled article URLs; the bare name on the last line displays the list as the cell output.
nejm_urls = df_new_england_journal_of_medicine_open_access['URL'].tolist()
nejm_urls

In [None]:
def extract_text(df, path):
    """Convert saved New England Journal of Medicine article pages into plain-text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and
    ``<path>/<ID>.txt`` is written with, in order: the title, the abstract
    (heading, sub-headings and paragraphs), an 'Introduction' heading followed
    by the body paragraphs that precede the first section, and then each
    top-level body section under a ``Section:`` heading. Paragraphs of
    subsections up to three levels below a top-level section are included
    without headings. ``<sup>`` elements are removed from each paragraph before
    its text is read. IDs without an HTML file are logged as errors and skipped.

    Args:
        df: Object whose ``'ID'`` entry iterates over article identifiers
            (a pandas DataFrame in this notebook).
        path: Directory holding the ``.html`` files; the ``.txt`` files are
            written to the same directory.

    Returns:
        None.
    """
    # Subsection levels visited below a top-level (h2) section: h3, h4 and h5
    deepest_level = 3

    def flatten(node):
        # Text of the node with every run of whitespace collapsed to one space.
        return ' '.join(node.get_text(' ', strip=True).split())

    def own_paragraphs(container):
        # Text of the paragraph <div>s that are direct children of container.
        rendered = []
        for para in container.find_all('div', role='paragraph', recursive=False):
            # Superscripts carry the reference markers; drop them before reading the text
            for marker in para.find_all('sup'):
                marker.decompose()
            rendered.append(f"{flatten(para)}\n")
        return ''.join(rendered)

    def nested_paragraphs(container, level):
        # Paragraphs of the direct child sections of container, depth first, down to deepest_level.
        if level > deepest_level:
            return ''
        rendered = []
        for child in container.find_all('section', recursive=False):
            rendered.append(own_paragraphs(child))
            rendered.append(nested_paragraphs(child, level + 1))
        return ''.join(rendered)

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Nothing to convert when the page was never saved
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        with open(html_file, 'r', encoding='utf-8') as source:
            soup = BeautifulSoup(source, 'lxml')

        # Output fragments, joined once at the end
        pieces = []

        # Title
        heading = soup.find('h1', property='name')
        if heading is not None:
            pieces.append(f"Title: {flatten(heading)}\n\n")

        # Abstract: heading, then each direct subsection with its own heading and paragraphs
        abstract = soup.find('section', property='abstract')
        if abstract is not None:
            abstract_heading = abstract.find('h2', property='name')
            if abstract_heading is not None:
                pieces.append(f"\nAbstract: {flatten(abstract_heading)}\n\n")
            for part in abstract.find_all('section', recursive=False):
                part_heading = part.find('h3')
                if part_heading is not None:
                    pieces.append(f"\nSection: {flatten(part_heading)}\n\n")
                pieces.append(own_paragraphs(part))

        # Article body
        body = soup.find('section', property='articleBody')
        core = body.find('div', class_='core-container') if body is not None else None
        if core is not None:
            # Paragraphs ahead of the first section are written under an inserted 'Introduction' title
            pieces.append("\nIntroduction\n\n")
            pieces.append(own_paragraphs(core))

            for top_section in core.find_all('section', recursive=False):
                top_heading = top_section.find('h2')
                if top_heading is not None:
                    pieces.append(f"\nSection: {flatten(top_heading)}\n\n")
                pieces.append(own_paragraphs(top_section))
                # Subsection headings are not written, only their paragraphs
                pieces.append(nested_paragraphs(top_section, 1))

        # Write with '\n' line endings regardless of platform
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as target:
            target.write(''.join(pieces))

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [None]:
# Run the extract_text definition from the cell above (the name is redefined for every journal).
# `path` holds whatever the most recently executed "Create output subdirectory" cell assigned.
extract_text(df_new_england_journal_of_medicine_open_access, path)

## Biological Sciences

### [Cell](https://www.cell.com/cell/home)

#### Create output subdirectory

In [None]:
# 'Cell' -> short journal code, used as the name of its output subdirectory.
# NOTE(review): `id` shadows the Python built-in of the same name from this cell onwards.
id = 'cell'
# `path` is a notebook-level variable that later cells read.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load the Cell open-access records; lines=True reads the file as JSON Lines (one JSON object per line).
df_cell_open_access = pd.read_json(f"{input_directory}/cell_open_access.jsonl", lines=True)

In [None]:
# Interpret the 'Published' values as epoch milliseconds and convert them to pandas datetimes.
df_cell_open_access['Published'] = pd.to_datetime(df_cell_open_access['Published'], unit='ms')

In [None]:
# Keep a small sample while the scraper is developed. .loc[:4] slices by label and is inclusive,
# so it keeps the rows whose index labels run up to and including 4.
df_cell_open_access = df_cell_open_access.loc[:4]

In [None]:
# Collect the sampled article URLs; the bare name on the last line displays the list as the cell output.
cell_urls = df_cell_open_access['URL'].tolist()
cell_urls

In [None]:
def extract_text(df, path):
    """Turn the saved 'Cell' article pages into plain-text files.

    For each value of ``df['ID']`` the page ``<path>/<ID>.html`` is parsed and
    its title, author abstract and body paragraphs are written to
    ``<path>/<ID>.txt``. An ID without a saved page is logged as an error and
    skipped.

    Only the top-level (h2) body sections get a 'Section:' heading. The
    paragraphs of nested sections are appended without their headings, down to
    three levels below each top-level section.

    Args:
        df: Table with an 'ID' column holding the article identifiers.
        path: Directory that holds the '.html' pages and receives the '.txt' files.
    """

    def clean(tag):
        """Return the tag's text with every run of whitespace collapsed to one space."""
        return ' '.join(tag.get_text(' ', strip=True).split())

    def own_paragraphs(node):
        """Return one line per paragraph block that is a direct child of 'node'."""
        blocks = node.find_all('div', role='paragraph', recursive=False)
        return ''.join(f"{clean(block)}\n" for block in blocks)

    def paragraphs_in_depth(node, levels_below):
        """Return the paragraphs of 'node', then (depth first) those of its nested sections."""
        collected = own_paragraphs(node)
        if levels_below > 0:
            for nested in node.find_all('section', recursive=False):
                collected += paragraphs_in_depth(nested, levels_below - 1)
        return collected

    for article_id in df['ID']:
        source_html = os.path.join(path, f"{article_id}.html")
        target_txt = os.path.join(path, f"{article_id}.txt")

        # Nothing to parse when the page was never saved
        if not os.path.exists(source_html):
            logging.error(f"Skipping {source_html}: File not found")
            continue

        with open(source_html, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'lxml')

        # Pieces of the output file, in the order in which they are written
        pieces = []

        # Title
        heading = soup.find('h1', property='name')
        if heading:
            pieces.append(f"Title: {clean(heading)}\n\n")

        # Author abstract: its heading followed by its own paragraphs
        abstracts = soup.find('div', id='abstracts')
        author_abstract = abstracts.find('section', id='author-abstract') if abstracts else None
        if author_abstract:
            abstract_heading = author_abstract.find('h2', property='name')
            if abstract_heading:
                pieces.append(f"\nAbstract: {clean(abstract_heading)}\n\n")
            pieces.append(own_paragraphs(author_abstract))

        # Body: every section that is a direct child of the core container
        body_matter = soup.find('section', id='bodymatter')
        container = body_matter.find('div', class_='core-container') if body_matter else None
        if container:
            for top_section in container.find_all('section', recursive=False):
                section_heading = top_section.find('h2')
                if section_heading:
                    pieces.append(f"\nSection: {clean(section_heading)}\n\n")
                # h3/h4/h5 headings are not written, only the paragraphs below them
                pieces.append(paragraphs_in_depth(top_section, 3))

        # Always write LF line endings, whatever the platform
        with open(target_txt, 'w', encoding='utf-8', newline='\n') as handle:
            handle.write(''.join(pieces))

        logging.info(f"Saved text for {article_id} to {target_txt}")

In [None]:
# Runs whichever extract_text was defined most recently in this session (the
# notebook redefines that name for every journal) against the current global 'path'.
extract_text(df_cell_open_access, path)

### [American Journal of Human Biology](https://onlinelibrary.wiley.com/journal/15206300)

#### Create output subdirectory

In [None]:
# 'American Journal of Human Biology'
# Short journal key, used as the name of this journal's output subdirectory.
# NOTE(review): 'id' shadows the Python builtin id() for the rest of the session.
id = 'ajhb'
# 'path' is a notebook-global that the later extract_text(...) call reads.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Read the JSON Lines file (one record per line) from the input directory into a DataFrame.
df_american_journal_human_biology_open_access = pd.read_json(f"{input_directory}/american_journal_human_biology_open_access.jsonl", lines=True)

In [None]:
# Interpret 'Published' as milliseconds since the Unix epoch and convert it to datetimes.
df_american_journal_human_biology_open_access['Published'] = pd.to_datetime(df_american_journal_human_biology_open_access['Published'], unit='ms')

In [None]:
# Keep only the rows labelled 0 to 4 (.loc slices include the end label); with the
# default index that is the first five articles. The full frame is overwritten.
df_american_journal_human_biology_open_access = df_american_journal_human_biology_open_access.loc[:4]

In [None]:
# Collect the article URLs as a plain list and display it as the cell output.
ajhb_urls = df_american_journal_human_biology_open_access['URL'].tolist()
ajhb_urls

In [None]:
def extract_text(df, path):
    """Extracts text from the saved HTML pages and saves it as text files.

    For every value of ``df['ID']`` the page ``<path>/<ID>.html`` is parsed and
    ``<path>/<ID>.txt`` is written with, in this order: the title, the abstract
    (including the sub-sections of a structured abstract), the body sections
    and the acknowledgement-style blocks that follow the body. IDs without a
    saved page are logged as errors and skipped.

    Only top-level (h2) body sections get a 'Section:' heading. The paragraphs
    of nested sections (h3 to h5) are appended without their headings.

    Args:
        df: Table with an 'ID' column holding the article identifiers.
        path: Directory that holds the '.html' pages and receives the '.txt' files.
    """

    def clean_text(tag):
        """Returns the tag's text with every run of whitespace collapsed to one space."""
        return ' '.join(tag.get_text(' ', strip=True).split())

    def paragraph_lines(node):
        """Returns one line per <p> that is a direct child of 'node'."""
        return ''.join(f"{clean_text(paragraph)}\n" for paragraph in node.find_all('p', recursive=False))

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Initialise text variable
        text = ''

        # Web Scraping - Begin

        # Extract the 'Title'
        title = soup.find('h1', class_='citation__title')
        if title:
            text += f"Title: {clean_text(title)}\n\n"

        # Capture the 'article body'
        article_body = soup.find('div', class_='article__body')

        # Reset for every article. Without this, a page that lacks the article
        # body would either raise on the first article or reuse the previous
        # article's body and copy its acknowledgements into this text file.
        body_section = None

        # Extract the 'Abstract'
        if article_body:
            abstract_section = article_body.find('section', class_='article-section__abstract')
            if abstract_section:
                h2_title = abstract_section.find('h2')
                if h2_title:
                    text += f"\nAbstract: {clean_text(h2_title)}\n\n"
                # A multi-class string matches the exact value of the class attribute
                abstract_content = abstract_section.find('div', class_='article-section__content en main')
                if abstract_content:
                    text += paragraph_lines(abstract_content)
                    # Structured abstracts keep their sub-headings
                    for section in abstract_content.find_all('section', recursive=False):
                        h3_title = section.find('h3')
                        if h3_title:
                            text += f"\nSection: {clean_text(h3_title)}\n\n"
                        text += paragraph_lines(section)

        # Extract the 'body'
        if article_body:
            body_section = article_body.find('section', class_='article-section article-section__full')
            if body_section:
                for h2_section in body_section.find_all('section', class_='article-section__content', recursive=False):
                    h2_title = h2_section.find('h2')
                    if h2_title:
                        text += f"\nSection: {clean_text(h2_title)}\n\n"
                    text += paragraph_lines(h2_section)
                    # Headings of nested sections (h3 to h5) are not written,
                    # only the paragraphs found at each level
                    for h3_section in h2_section.find_all('section', recursive=False):
                        text += paragraph_lines(h3_section)
                        for h4_section in h3_section.find_all('section', recursive=False):
                            text += paragraph_lines(h4_section)
                            for h5_section in h4_section.find_all('section', recursive=False):
                                text += paragraph_lines(h5_section)

        # Extract the 'Acknowledgements' (<div> blocks that are direct children of the body)
        if body_section:
            for h2_section in body_section.find_all('div', class_='article-section__content', recursive=False):
                h2_title = h2_section.find('h2')
                if h2_title:
                    text += f"\nSection: {clean_text(h2_title)}\n\n"
                text += paragraph_lines(h2_section)

        # Web Scraping - End

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(text)

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [None]:
# Runs whichever extract_text was defined most recently in this session (the
# notebook redefines that name for every journal) against the current global 'path'.
extract_text(df_american_journal_human_biology_open_access, path)

## Human Sciences

### [Annual Review of Anthropology](https://www.annualreviews.org/content/journals/anthro)

#### Create output subdirectory

In [6]:
# 'Annual Review of Anthropology'
# Short journal key, used as the name of this journal's output subdirectory.
# NOTE(review): 'id' shadows the Python builtin id() for the rest of the session.
id = 'aran'
# 'path' is a notebook-global that the later extract_text(...) call reads.
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\aran


#### Import the data into a DataFrame

In [7]:
# Read the JSON Lines file (one record per line) from the input directory into a DataFrame.
df_ar_anthropology = pd.read_json(f"{input_directory}/ar_anthropology.jsonl", lines=True)

In [8]:
# Interpret 'Published' as milliseconds since the Unix epoch and convert it to datetimes.
df_ar_anthropology['Published'] = pd.to_datetime(df_ar_anthropology['Published'], unit='ms')

In [9]:
# Keep only the rows labelled 0 to 4 (.loc slices include the end label); with the
# default index that is the first five articles. The full frame is overwritten.
df_ar_anthropology = df_ar_anthropology.loc[:4]

In [10]:
# Collect the article URLs as a plain list and display it as the cell output.
aran_urls = df_ar_anthropology['URL'].tolist()
aran_urls

['https://www.annualreviews.org/content/journals/10.1146/annurev-an-51-082222-100001',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-070120-111609',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-041420-112543',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-041420-102158',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-041420-013930']

In [15]:
def extract_text(df, path):
    """Extracts text from the saved HTML pages and saves it as text files.

    For every value of ``df['ID']`` the page ``<path>/<ID>.html`` is parsed and
    ``<path>/<ID>.txt`` is written with the title followed by every top-level
    'articleSection': its heading (when present) and all of its paragraphs in
    document order. IDs without a saved page are logged as errors and skipped.

    Each paragraph is written exactly once. Paragraphs of nested sections are
    included under the top-level section that contains them, without the
    nested headings.

    Args:
        df: Table with an 'ID' column holding the article identifiers.
        path: Directory that holds the '.html' pages and receives the '.txt' files.
    """

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Initialise text variable
        text = ''

        # Extract the 'Title'
        title_tag = soup.find('span', class_='article-title')
        if title_tag:
            title = ' '.join(title_tag.get_text(' ', strip=True).split())
            text += f"Title: {title}\n\n"

        # Extract 'article sections'
        # find_all returns both top-level and nested 'div.articleSection' elements
        for section in soup.find_all('div', class_='articleSection'):
            # Skip nested sections: their paragraphs are collected through the
            # top-level section that contains them
            if section.find_parent('div', class_='articleSection'):
                continue

            # Extract section title
            section_title_tag = section.find('div', class_='tl-main-part title')
            if section_title_tag:
                section_title = ' '.join(section_title_tag.get_text(' ', strip=True).split())
                text += f"\nSection: {section_title}\n\n"

            # One recursive pass yields every paragraph of the section, including
            # those of its subsections, in document order. A second pass over
            # the child <div> elements would write those paragraphs again.
            for paragraph in section.find_all('p'):
                paragraph_text = ' '.join(paragraph.get_text(' ', strip=True).split())
                text += f"{paragraph_text}\n"

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(text)

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [16]:
# Runs whichever extract_text was defined most recently in this session (the
# notebook redefines that name for every journal) against the current global 'path'.
extract_text(df_ar_anthropology, path)

### [Journal of Human Evolution](https://www.sciencedirect.com/journal/journal-of-human-evolution)

#### Create output subdirectory

In [24]:
# 'Journal of Human Evolution'
# Short journal key, used as the name of this journal's output subdirectory.
# NOTE(review): 'id' shadows the Python builtin id() for the rest of the session.
id = 'jhue'
# 'path' is a notebook-global that the later extract_text(...) call reads.
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\jhue


#### Import the data into a DataFrame

In [25]:
# Read the JSON Lines file (one record per line) from the input directory into a DataFrame.
df_journal_human_evolution_open_access = pd.read_json(f"{input_directory}/journal_human_evolution_open_access.jsonl", lines=True)

In [26]:
# Interpret 'Published' as milliseconds since the Unix epoch and convert it to datetimes.
df_journal_human_evolution_open_access['Published'] = pd.to_datetime(df_journal_human_evolution_open_access['Published'], unit='ms')

In [27]:
# Keep only the rows labelled 0 to 4 (.loc slices include the end label); with the
# default index that is the first five articles. The full frame is overwritten.
df_journal_human_evolution_open_access = df_journal_human_evolution_open_access.loc[:4]

In [28]:
# Collect the article URLs as a plain list and display it as the cell output.
jhue_urls = df_journal_human_evolution_open_access['URL'].tolist()
jhue_urls

['https://www.sciencedirect.com//science/article/pii/S0047248420300294',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301123',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301135',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301305',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301615']

In [78]:
def extract_text(df, path):
    """Write one plain-text file per saved 'Journal of Human Evolution' page.

    Each value of ``df['ID']`` names a page ``<path>/<ID>.html``. Its title,
    author abstract and body text are written to ``<path>/<ID>.txt``. When the
    page is missing an error is logged and the article is skipped.

    Inside the first <div> of the body, every direct <section> gets a
    'Section:' heading taken from its <h2>. Its own text blocks follow, then
    the blocks of its nested sections up to three levels down, without their
    headings. Sections that are direct children of the body itself contribute
    their heading and own blocks only.

    Args:
        df: Table with an 'ID' column holding the article identifiers.
        path: Directory that holds the '.html' pages and receives the '.txt' files.
    """

    def flatten(tag):
        """Return the tag's text as a single line with single spaces."""
        words = tag.get_text(' ', strip=True).split()
        return ' '.join(words)

    def block_lines(node):
        """Return one output line per <div> that is a direct child of 'node'."""
        return [f"{flatten(block)}\n" for block in node.find_all('div', recursive=False)]

    def sections_in_order(top_section, max_depth):
        """Yield 'top_section' and its nested sections, parents before children."""
        stack = [(top_section, 0)]
        while stack:
            current, depth = stack.pop()
            yield current
            if depth < max_depth:
                children = current.find_all('section', recursive=False)
                # Push in reverse so that the first child is visited first
                stack.extend((child, depth + 1) for child in reversed(children))

    for article_id in df['ID']:
        source_html = os.path.join(path, f"{article_id}.html")
        target_txt = os.path.join(path, f"{article_id}.txt")

        # A page that was never saved cannot be converted
        if not os.path.exists(source_html):
            logging.error(f"Skipping {source_html}: File not found")
            continue

        with open(source_html, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'lxml')

        # Output fragments, kept in writing order
        lines = []

        # Title
        main_title = soup.find('h1', id='screen-reader-main-title')
        if main_title:
            lines.append(f"Title: {flatten(main_title)}\n\n")

        article = soup.find('article')
        if article:
            # Author abstract; the exact class string keeps 'author-highlights' out
            abstracts = article.find('div', id='abstracts')
            author_abstract = abstracts.find('div', class_='abstract author') if abstracts else None
            if author_abstract:
                abstract_heading = author_abstract.find('h2')
                if abstract_heading:
                    lines.append(f"\nAbstract: {flatten(abstract_heading)}\n\n")
                abstract_wrapper = author_abstract.find('div')
                if abstract_wrapper:
                    lines.extend(block_lines(abstract_wrapper))

            # Body
            body = article.find('div', id='body')
            if body:
                # Sections wrapped in the first <div> of the body
                wrapper = body.find('div')
                if wrapper:
                    for top_section in wrapper.find_all('section', recursive=False):
                        section_heading = top_section.find('h2')
                        if section_heading:
                            lines.append(f"\nSection: {flatten(section_heading)}\n\n")
                        # h3/h4/h5 headings are left out, their text blocks are kept
                        for part in sections_in_order(top_section, 3):
                            lines.extend(block_lines(part))

                # Sections placed directly under the body
                for loose_section in body.find_all('section', recursive=False):
                    loose_heading = loose_section.find('h2')
                    if loose_heading:
                        lines.append(f"\nSection: {flatten(loose_heading)}\n\n")
                    lines.extend(block_lines(loose_section))

        # LF line endings on every platform
        with open(target_txt, 'w', encoding='utf-8', newline='\n') as handle:
            handle.write(''.join(lines))

        logging.info(f"Saved text for {article_id} to {target_txt}")

In [79]:
# Runs whichever extract_text was defined most recently in this session (the
# notebook redefines that name for every journal) against the current global 'path'.
extract_text(df_journal_human_evolution_open_access, path)

## Applied Social Sciences

### [Journal of Applied Social Science](https://journals.sagepub.com/home/jax)

#### Create output subdirectory

In [None]:
# 'Journal of Applied Social Science'
# Short journal key, used as the name of this journal's output subdirectory.
# NOTE(review): 'id' shadows the Python builtin id() for the rest of the session.
id = 'jasc'
# 'path' is a notebook-global read by the extract_text(...) calls of other sections.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Read the JSON Lines file (one record per line) from the input directory into a DataFrame.
df_journal_applied_social_science_open_access = pd.read_json(f"{input_directory}/journal_applied_social_science_open_access.jsonl", lines=True)

In [None]:
# Interpret 'Published' as milliseconds since the Unix epoch and convert it to datetimes.
df_journal_applied_social_science_open_access['Published'] = pd.to_datetime(df_journal_applied_social_science_open_access['Published'], unit='ms')

In [None]:
# Keep only the rows labelled 0 to 4 (.loc slices include the end label); with the
# default index that is the first five articles. The full frame is overwritten.
df_journal_applied_social_science_open_access = df_journal_applied_social_science_open_access.loc[:4]

In [None]:
# Collect the article URLs as a plain list and display it as the cell output.
jasc_urls = df_journal_applied_social_science_open_access['URL'].tolist()
jasc_urls

### [Journal of Social Issues](https://spssi.onlinelibrary.wiley.com/journal/15404560)

#### Create output subdirectory

In [None]:
# 'Journal of Social Issues'
# Short journal key, used as the name of this journal's output subdirectory.
# NOTE(review): 'id' shadows the Python builtin id() for the rest of the session.
id = 'jsoi'
# 'path' is a notebook-global read by the extract_text(...) calls of other sections.
path = os.path.join(output_directory, id)
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Read the JSON Lines file (one record per line) from the input directory into a DataFrame.
df_journal_social_issues_open_access = pd.read_json(f"{input_directory}/journal_social_issues_open_access.jsonl", lines=True)

In [None]:
# Interpret 'Published' as milliseconds since the Unix epoch and convert it to datetimes.
df_journal_social_issues_open_access['Published'] = pd.to_datetime(df_journal_social_issues_open_access['Published'], unit='ms')

In [None]:
# Keep only the rows labelled 0 to 4 (.loc slices include the end label); with the
# default index that is the first five articles. The full frame is overwritten.
df_journal_social_issues_open_access = df_journal_social_issues_open_access.loc[:4]

In [None]:
# Collect the article URLs as a plain list and display it as the cell output.
jsoi_urls = df_journal_social_issues_open_access['URL'].tolist()
jsoi_urls

In [None]:
def _clean_text(tag):
    """Return the tag's text with every run of whitespace collapsed to a single space."""
    return ' '.join(tag.get_text(' ', strip=True).split())


def _heading_block(container, heading_tag, label):
    """Format the first `heading_tag` found inside `container` as a labelled heading.

    Returns the heading as a blank line, '<label>: <heading text>', and another
    blank line, or an empty string when `container` holds no such heading.
    """
    heading = container.find(heading_tag)
    if heading is None:
        return ''
    return f"\n{label}: {_clean_text(heading)}\n\n"


def _direct_paragraphs(container):
    """Return one newline-terminated line per <p> that is a direct child of `container`."""
    return ''.join(
        f"{_clean_text(paragraph)}\n"
        for paragraph in container.find_all('p', recursive=False)
    )


def _nested_paragraphs(section, depth):
    """Return the paragraphs of `section` followed, depth-first, by those of its sub-sections.

    Only direct-child <p> and <section> elements are followed at each level, and
    at most `depth` levels of nested <section> elements are descended into.
    Sub-section headings are intentionally not emitted.
    """
    text = _direct_paragraphs(section)
    if depth > 0:
        for subsection in section.find_all('section', recursive=False):
            text += _nested_paragraphs(subsection, depth - 1)
    return text


def _jsoi_article_text(soup):
    """Build the plain-text rendering of one parsed article page.

    The result contains, in order and only when present: the title, the
    abstract (heading, paragraphs and any titled sub-sections), every top-level
    body section (heading plus paragraphs down to three nested section levels),
    and the direct-child <div> content blocks of the full-text section.
    """
    parts = []

    # Extract the 'Title'
    title = soup.find('h1', class_='citation__title')
    if title is not None:
        parts.append(f"Title: {_clean_text(title)}\n\n")

    # Capture the 'article body'; without it there is nothing else to extract
    article_body = soup.find('div', class_='article__body')
    if article_body is None:
        return ''.join(parts)

    # Extract the 'Abstract'
    abstract_section = article_body.find('section', class_='article-section__abstract')
    if abstract_section is not None:
        parts.append(_heading_block(abstract_section, 'h2', 'Abstract'))
        abstract_content = abstract_section.find('div', class_='article-section__content en main')
        if abstract_content is not None:
            parts.append(_direct_paragraphs(abstract_content))
            for section in abstract_content.find_all('section', recursive=False):
                parts.append(_heading_block(section, 'h3', 'Section'))
                parts.append(_direct_paragraphs(section))

    # Extract the 'body'
    body_section = article_body.find('section', class_='article-section article-section__full')
    if body_section is None:
        return ''.join(parts)

    for h2_section in body_section.find_all('section', class_='article-section__content', recursive=False):
        parts.append(_heading_block(h2_section, 'h2', 'Section'))
        # Three nested levels below the top-level section (the h3/h4/h5 levels).
        parts.append(_nested_paragraphs(h2_section, depth=3))

    # Extract the closing <div> blocks (e.g. 'Acknowledgements')
    for closing_block in body_section.find_all('div', class_='article-section__content', recursive=False):
        parts.append(_heading_block(closing_block, 'h2', 'Section'))
        parts.append(_direct_paragraphs(closing_block))

    return ''.join(parts)


def extract_text(df, path):
    """Extracts text from HTML files and saves as text files.

    For every article ID in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed
    and its text is written to ``<path>/<ID>.txt`` (UTF-8, '\\n' line endings).
    Articles whose HTML file is missing are logged as errors and skipped.

    Each article is processed independently: when a page has no article body or
    no full-text section, only the parts that were found are written. Earlier
    code read a variable that was set only when the article body existed, which
    either raised UnboundLocalError or appended the previous article's closing
    blocks to the current file.

    Args:
        df: Object whose ``'ID'`` item iterates over article identifiers
            (e.g. a pandas DataFrame with an 'ID' column).
        path: Directory holding the ``.html`` inputs and receiving the ``.txt``
            outputs.

    Returns:
        None.
    """
    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as html_handle:
            soup = BeautifulSoup(html_handle, 'lxml')

        text = _jsoi_article_text(soup)
        if not text:
            # Still write the (empty) file, as before, but make the gap visible in the log.
            logging.warning(f"No text extracted from {html_file}")

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as txt_handle:
            txt_handle.write(text)

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [None]:
# Extract and save the text of every article listed in the DataFrame.
# Relies on the notebook-global `path` set in this journal's
# "Create output subdirectory" cell (id 'jsoi'); run that cell first.
extract_text(df_journal_social_issues_open_access, path)

### [Social Science & Medicine](https://www.sciencedirect.com/journal/social-science-and-medicine)

#### Create output subdirectory

In [None]:
# 'Social Science & Medicine'
# `id` is the short journal code used as the name of the output subdirectory.
# NOTE: assigning to `id` shadows Python's built-in id() for the rest of the session.
id = 'socm'
# `path` is a notebook-global that is reassigned in each journal's section.
path = os.path.join(output_directory, id)
# Create the subdirectory (with any missing parents) if it does not exist yet.
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load this journal's records from a JSON Lines file (lines=True: one JSON record per line).
df_social_science_medicine_open_access = pd.read_json(f"{input_directory}/social_science_medicine_open_access.jsonl", lines=True)

In [None]:
# Convert 'Published' to pandas datetimes, reading the stored numbers as
# milliseconds since the Unix epoch (unit='ms').
df_social_science_medicine_open_access['Published'] = pd.to_datetime(df_social_science_medicine_open_access['Published'], unit='ms')

In [None]:
# Keep only the rows up to and including index label 4
# (`.loc` slices are label-based and end-inclusive).
# NOTE(review): presumably a small sample for developing the scraper; this
# overwrites the name that held the full DataFrame - confirm before a full run.
df_social_science_medicine_open_access = df_social_science_medicine_open_access.loc[:4]

In [None]:
# Collect the 'URL' column into a plain Python list.
socm_urls = df_social_science_medicine_open_access['URL'].tolist()
# Bare expression on the last line so the notebook displays the list.
socm_urls

## Linguistics, literature and arts

### [Applied Corpus Linguistics](https://www.sciencedirect.com/journal/applied-corpus-linguistics)

#### Create output subdirectory

In [None]:
# 'Applied Corpus Linguistics'
# `id` is the short journal code used as the name of the output subdirectory.
# NOTE: assigning to `id` shadows Python's built-in id() for the rest of the session.
id = 'apcl'
# `path` is a notebook-global that is reassigned in each journal's section.
path = os.path.join(output_directory, id)
# Create the subdirectory (with any missing parents) if it does not exist yet.
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load this journal's records from a JSON Lines file (lines=True: one JSON record per line).
df_applied_corpus_linguistics_open_access = pd.read_json(f"{input_directory}/applied_corpus_linguistics_open_access.jsonl", lines=True)

In [None]:
# Convert 'Published' to pandas datetimes, reading the stored numbers as
# milliseconds since the Unix epoch (unit='ms').
df_applied_corpus_linguistics_open_access['Published'] = pd.to_datetime(df_applied_corpus_linguistics_open_access['Published'], unit='ms')

In [None]:
# Keep only the rows up to and including index label 4
# (`.loc` slices are label-based and end-inclusive).
# NOTE(review): presumably a small sample for developing the scraper; this
# overwrites the name that held the full DataFrame - confirm before a full run.
df_applied_corpus_linguistics_open_access = df_applied_corpus_linguistics_open_access.loc[:4]

In [None]:
# Collect the 'URL' column into a plain Python list.
apcl_urls = df_applied_corpus_linguistics_open_access['URL'].tolist()
# Bare expression on the last line so the notebook displays the list.
apcl_urls

### [Journal of English Linguistics](https://journals.sagepub.com/home/eng)

#### Create output subdirectory

In [None]:
# 'Journal of English Linguistics'
# `id` is the short journal code used as the name of the output subdirectory.
# NOTE: assigning to `id` shadows Python's built-in id() for the rest of the session.
id = 'jenl'
# `path` is a notebook-global that is reassigned in each journal's section.
path = os.path.join(output_directory, id)
# Create the subdirectory (with any missing parents) if it does not exist yet.
create_directory(path)

#### Import the data into a DataFrame

In [None]:
# Load this journal's records from a JSON Lines file (lines=True: one JSON record per line).
df_journal_english_linguistics_open_access = pd.read_json(f"{input_directory}/journal_english_linguistics_open_access.jsonl", lines=True)

In [None]:
# Convert 'Published' to pandas datetimes, reading the stored numbers as
# milliseconds since the Unix epoch (unit='ms').
df_journal_english_linguistics_open_access['Published'] = pd.to_datetime(df_journal_english_linguistics_open_access['Published'], unit='ms')

In [None]:
# Keep only the rows up to and including index label 4
# (`.loc` slices are label-based and end-inclusive).
# NOTE(review): presumably a small sample for developing the scraper; this
# overwrites the name that held the full DataFrame - confirm before a full run.
df_journal_english_linguistics_open_access = df_journal_english_linguistics_open_access.loc[:4]

In [None]:
# Collect the 'URL' column into a plain Python list.
jenl_urls = df_journal_english_linguistics_open_access['URL'].tolist()
# Bare expression on the last line so the notebook displays the list.
jenl_urls