<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 2 - eyamrog

The aim of this phase is to select samples per discipline and web scrape the research articles.

## Required Python packages

- beautifulsoup4
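- lxml
- openpyxl (Excel writer engine used by `DataFrame.to_excel` in the final export; `xlsxwriter` also works)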
- pandas
- requests
- selenium
- tqdm

## Import the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import sys
import time
import logging
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Define input variables

In [2]:
input_directory = 'cl_st2_ph1_eyamrog'
output_directory = 'cl_st2_ph2_eyamrog'
log_filename = f"{output_directory}/{output_directory}.log"

## Set up logging

In [3]:
# The log file lives inside the output directory. basicConfig opens the file
# immediately, so the directory must exist first; otherwise a fresh kernel run
# fails here with FileNotFoundError before the directory-creation cell runs.
os.makedirs(output_directory, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_filename
)

## Create output directory

In [4]:
# Create the output directory on the first run; leave an existing one untouched.
# A creation failure is reported and stops the notebook.
if not os.path.exists(output_directory):
    try:
        os.makedirs(output_directory)
    except OSError as e:
        print('Failed to create the directory:', e)
        sys.exit(1)
    else:
        print('Output directory successfully created.')
else:
    print('Output directory already exists.')

Output directory already exists.


## Scrape web page functions

In [4]:
def scrape_html_docs1(df, path):
    """Iterates over a DataFrame and saves HTML pages within a single WebDriver session.

    Args:
        df: DataFrame whose rows provide a 'URL' value (the page to load) and
            an 'ID' value (used as the output file name, '<ID>.html').
        path: Directory in which the HTML files are written. It is created
            if it does not exist.

    Failures on individual documents are logged and skipped so that one bad
    page does not abort the run. The WebDriver is always closed, including
    when the loop is interrupted (e.g. by KeyboardInterrupt).
    """
    # Create the output directory before launching the browser so that a
    # failure here cannot leave an orphaned WebDriver process behind.
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        logging.error(f"Failed to create the {path} directory: {e}")
        sys.exit(1)

    # Setting up the WebDriver
    #service = Service(r'C:\Users\eyamr\OneDrive\00-Technology\laelgelc\edgedriver_win64\msedgedriver.exe')
    service = Service('/Users/eyamrog/laelgelc/edgedriver_mac64/msedgedriver')
    #service = Service('/home/parallels/laelgelc/edgedriver_linux64/msedgedriver')
    driver = webdriver.Edge(service=service)

    max_wait_time = 30  # Upper bound, in seconds, on waiting for the page to settle
    poll_interval = 2   # Seconds between two consecutive page-source snapshots

    try:
        # Iterate over the DataFrame
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Scraping HTML documents"):
            # Bound before the try block so the error handler can always report it,
            # even if reading the row itself is what fails.
            url = None
            try:
                url = row['URL']
                doc_id = row['ID']
                filename = os.path.join(path, f"{doc_id}.html")

                # Load web page
                driver.get(url)

                # Ensure stable page load: poll the page source until two
                # consecutive snapshots are identical or the time budget is spent.
                start_time = time.time()
                previous_html = ''
                while True:
                    current_html = driver.page_source
                    if current_html == previous_html or time.time() - start_time > max_wait_time:
                        break
                    previous_html = current_html
                    time.sleep(poll_interval)

                # Capture page source
                page_source = driver.page_source

                # Save content to file
                with open(filename, 'w', encoding='utf-8') as file:
                    file.write(page_source)

                logging.info(f"Saved: {filename}")

            except Exception as e:
                logging.error(f"Error capturing {url}: {e}")
    finally:
        # Close WebDriver, whatever happened in the loop
        driver.quit()

In [5]:
def scrape_html(url):
    """Loads a web page in a fresh WebDriver session and returns its source HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as a string, or None if loading the page raised an
        exception (the error is logged).

    The WebDriver session is always closed before returning. Previously the
    `driver.quit()` call was placed after the `return` statements and was never
    reached, leaving one browser/driver process behind per call.
    """
    # Setting up the WebDriver
    #service = Service(r'C:\Users\eyamr\OneDrive\00-Technology\laelgelc\edgedriver_win64\msedgedriver.exe')
    service = Service('/Users/eyamrog/laelgelc/edgedriver_mac64/msedgedriver')
    #service = Service('/home/parallels/laelgelc/edgedriver_linux64/msedgedriver')
    driver = webdriver.Edge(service=service)

    max_wait_time = 30  # Upper bound, in seconds, on waiting for the page to settle
    poll_interval = 2   # Seconds between two consecutive page-source snapshots

    try:
        driver.get(url)

        # Wait for a stable page load: poll the page source until two
        # consecutive snapshots are identical or the time budget is spent.
        start_time = time.time()
        previous_html = ''

        while True:
            current_html = driver.page_source
            if current_html == previous_html or time.time() - start_time > max_wait_time:
                break
            previous_html = current_html
            time.sleep(poll_interval)

        return driver.page_source  # Return page source
    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
        return None
    finally:
        # Close WebDriver on both the success and the error path
        driver.quit()

In [6]:
def scrape_html_docs2(df, path):
    """Saves one HTML file per DataFrame row, opening a new WebDriver session for each page.

    Args:
        df: DataFrame whose rows provide a 'URL' value (the page to fetch) and
            an 'ID' value (used as the output file name, '<ID>.html').
        path: Directory in which the HTML files are written. It is created
            if it does not exist; a creation failure is logged and exits.

    Rows for which `scrape_html` returns nothing are skipped without writing a file.
    """
    directory_missing = not os.path.exists(path)
    if directory_missing:
        try:
            os.makedirs(path)
        except OSError as err:
            logging.error(f"Failed to create the {path} directory: {err}")
            sys.exit(1)

    progress = tqdm(df.iterrows(), total=len(df), desc="Scraping HTML documents")
    for _, record in progress:
        page_url = record['URL']
        document_id = record['ID']
        target = os.path.join(path, f"{document_id}.html")

        # Each call runs in its own browser session
        html = scrape_html(page_url)
        if not html:
            continue

        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(html)
        logging.info(f"Saved: {target}")

## Health Sciences

### [Nature Medicine](https://www.nature.com/nm/)

#### Import the data into a DataFrame

In [8]:
df_nature_medicine_open_access = pd.read_json(f"{input_directory}/nature_medicine_open_access.jsonl", lines=True)

In [9]:
df_nature_medicine_open_access['Published'] = pd.to_datetime(df_nature_medicine_open_access['Published'], unit='ms')

In [10]:
df_nature_medicine_open_access.rename(columns={
    'Authors (compact list)': 'Authors',
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [11]:
journal = 'Nature Medicine'
id = 'natm'
path = os.path.join(output_directory, id)

In [12]:
df_nature_medicine_open_access['Journal'] = journal

In [13]:
df_nature_medicine_open_access['ID'] = id + df_nature_medicine_open_access.index.astype(str).str.zfill(6)

In [14]:
df_nature_medicine_open_access.to_json(f"{output_directory}/nature_medicine_open_access.jsonl", orient='records', lines=True)

In [15]:
scrape_html_docs1(df_nature_medicine_open_access, path)

### [Annual Review of Public Health](https://www.annualreviews.org/content/journals/publhealth)

#### Import the data into a DataFrame

In [16]:
df_ar_public_health = pd.read_json(f"{input_directory}/ar_public_health.jsonl", lines=True)

In [17]:
# Extract the year using regex
df_ar_public_health['Published'] = df_ar_public_health['Vol/Year/Page Range'].str.extract(r'\((\d{4})\)')

In [18]:
df_ar_public_health['Published'] = pd.to_datetime(
    df_ar_public_health['Published'], format='mixed', errors='coerce'
)

In [19]:
df_ar_public_health.rename(columns={
    'Vol/Year/Page Range': 'Vol/Issue',
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [20]:
journal = 'Annual Review of Public Health'
id = 'arph'
path = os.path.join(output_directory, id)

In [21]:
df_ar_public_health['Journal'] = journal

In [22]:
df_ar_public_health['ID'] = id + df_ar_public_health.index.astype(str).str.zfill(6)

In [23]:
df_ar_public_health.to_json(f"{output_directory}/ar_public_health.jsonl", orient='records', lines=True)

In [24]:
scrape_html_docs1(df_ar_public_health, path)

### [Lancet Public Health](https://www.thelancet.com/journals/lanpub/home)

#### Import the data into a DataFrame

In [25]:
df_lancet_public_health_open_access = pd.read_json(f"{input_directory}/lancet_public_health_open_access.jsonl", lines=True)

In [26]:
df_lancet_public_health_open_access['Published'] = pd.to_datetime(df_lancet_public_health_open_access['Published'], unit='ms')

In [27]:
df_lancet_public_health_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [28]:
journal = 'Lancet Public Health'
id = 'laph'
path = os.path.join(output_directory, id)

In [29]:
df_lancet_public_health_open_access['Journal'] = journal

In [30]:
df_lancet_public_health_open_access['ID'] = id + df_lancet_public_health_open_access.index.astype(str).str.zfill(6)

In [31]:
df_lancet_public_health_open_access.to_json(f"{output_directory}/lancet_public_health_open_access.jsonl", orient='records', lines=True)

In [32]:
scrape_html_docs2(df_lancet_public_health_open_access, path)

### [New England Journal of Medicine](https://www.nejm.org/)

#### Import the data into a DataFrame

In [7]:
df_new_england_journal_of_medicine_open_access = pd.read_json(f"{input_directory}/new_england_journal_of_medicine_open_access.jsonl", lines=True)

In [8]:
df_new_england_journal_of_medicine_open_access['Published'] = pd.to_datetime(df_new_england_journal_of_medicine_open_access['Published'], unit='ms')

In [9]:
df_new_england_journal_of_medicine_open_access.rename(columns={
    'Free Access': 'Open Access',
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [10]:
journal = 'New England Journal of Medicine'
id = 'nejm'
path = os.path.join(output_directory, id)

In [11]:
df_new_england_journal_of_medicine_open_access['Journal'] = journal

In [12]:
df_new_england_journal_of_medicine_open_access['ID'] = id + df_new_england_journal_of_medicine_open_access.index.astype(str).str.zfill(6)

In [39]:
df_new_england_journal_of_medicine_open_access.to_json(f"{output_directory}/new_england_journal_of_medicine_open_access.jsonl", orient='records', lines=True)

In [40]:
scrape_html_docs2(df_new_england_journal_of_medicine_open_access, path)

##### Retry missing documents

The following missing documents were identified by examining the logs:
- nejm000061.html
- nejm000148.html
- nejm000229.html
- nejm000230.html
- nejm000231.html
- nejm000232.html
- nejm000233.html

In [13]:
nejm_missing = [
    'nejm000061',
    'nejm000148',
    'nejm000229',
    'nejm000230',
    'nejm000231',
    'nejm000232',
    'nejm000233'
]

In [14]:
# Filter the DataFrame
df_new_england_journal_of_medicine_open_access_missing = df_new_england_journal_of_medicine_open_access[
    df_new_england_journal_of_medicine_open_access['ID'].isin(nejm_missing)
]

In [15]:
df_new_england_journal_of_medicine_open_access_missing

Unnamed: 0,Article Type,Title,URL,Authors,Vol/Issue,Published,DOI,Open Access,PDF URL,Discipline,Journal,ID
61,Original Articles,Initial Invasive or Conservative Strategy for ...,https://www.nejm.org/doi/full/10.1056/NEJMoa19...,D.J. Maron and Others,"Volume 382, Issue 15",2020-03-30,https://doi.org10.1056/NEJMoa1915922,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa191...,Health Sciences,New England Journal of Medicine,nejm000061
148,Original Articles,Effects of Diet versus Gastric Bypass on Metab...,https://www.nejm.org/doi/full/10.1056/NEJMoa20...,M. Yoshino and Others,"Volume 383, Issue 8",2020-08-19,https://doi.org10.1056/NEJMoa2003697,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa200...,Health Sciences,New England Journal of Medicine,nejm000148
229,Original Articles,Changes in Seizure Frequency and Antiepileptic...,https://www.nejm.org/doi/full/10.1056/NEJMoa20...,P.B. Pennell and Others,"Volume 383, Issue 26",2020-12-23,https://doi.org10.1056/NEJMoa2008663,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa200...,Health Sciences,New England Journal of Medicine,nejm000229
230,Original Articles,Safety and Efficacy of the BNT162b2 mRNA Covid...,https://www.nejm.org/doi/full/10.1056/NEJMoa20...,F.P. Polack and Others,"Volume 383, Issue 27",2020-12-10,https://doi.org10.1056/NEJMoa2034577,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa203...,Health Sciences,New England Journal of Medicine,nejm000230
231,Original Articles,Trial of Dexamethasone for Chronic Subdural He...,https://www.nejm.org/doi/full/10.1056/NEJMoa20...,P.J. Hutchinson and Others,"Volume 383, Issue 27",2020-12-16,https://doi.org10.1056/NEJMoa2020473,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa202...,Health Sciences,New England Journal of Medicine,nejm000231
232,Original Articles,Somatic Mutations in UBA1 and Severe Adult-Ons...,https://www.nejm.org/doi/full/10.1056/NEJMoa20...,D.B. Beck and Others,"Volume 383, Issue 27",2020-10-27,https://doi.org10.1056/NEJMoa2026834,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa202...,Health Sciences,New England Journal of Medicine,nejm000232
233,Original Articles,Higher or Lower Hemoglobin Transfusion Thresho...,https://www.nejm.org/doi/full/10.1056/NEJMoa20...,H. Kirpalani and Others,"Volume 383, Issue 27",2021-12-30,https://doi.org10.1056/NEJMoa2020248,FREE,https://www.nejm.org/doi/pdf/10.1056/NEJMoa202...,Health Sciences,New England Journal of Medicine,nejm000233


In [16]:
nejm_missing_url = df_new_england_journal_of_medicine_open_access_missing['URL'].tolist()
nejm_missing_url

['https://www.nejm.org/doi/full/10.1056/NEJMoa1915922',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa2003697',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa2008663',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa2034577',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa2020473',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa2026834',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa2020248']

In [17]:
scrape_html_docs2(df_new_england_journal_of_medicine_open_access_missing, path)

Scraping HTML documents: 100%|██████████| 7/7 [02:34<00:00, 22.12s/it]


## Biological Sciences

### [Cell](https://www.cell.com/cell/home)

#### Import the data into a DataFrame

In [None]:
df_cell_open_access = pd.read_json(f"{input_directory}/cell_open_access.jsonl", lines=True)

In [None]:
df_cell_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Cell'
id = 'cell'
path = os.path.join(output_directory, id)

In [None]:
df_cell_open_access['Journal'] = journal

In [None]:
df_cell_open_access['ID'] = id + df_cell_open_access.index.astype(str).str.zfill(6)

In [None]:
df_cell_open_access.to_json(f"{output_directory}/cell_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_cell_open_access, path)

### [American Journal of Human Biology](https://onlinelibrary.wiley.com/journal/15206300?msockid=0525cb73d9a76a060b80df20d87e6b4b)

#### Import the data into a DataFrame

In [None]:
df_american_journal_human_biology_open_access = pd.read_json(f"{input_directory}/american_journal_human_biology_open_access.jsonl", lines=True)

In [None]:
df_american_journal_human_biology_open_access['Published'] = pd.to_datetime(df_american_journal_human_biology_open_access['Published'], unit='ms')

In [None]:
df_american_journal_human_biology_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'American Journal of Human Biology'
id = 'ajhb'
path = os.path.join(output_directory, id)

In [None]:
df_american_journal_human_biology_open_access['Journal'] = journal

In [None]:
df_american_journal_human_biology_open_access['ID'] = id + df_american_journal_human_biology_open_access.index.astype(str).str.zfill(6)

In [None]:
df_american_journal_human_biology_open_access.to_json(f"{output_directory}/american_journal_human_biology_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_american_journal_human_biology_open_access, path)

## Human Sciences

### [Annual Review of Anthropology](https://www.annualreviews.org/content/journals/anthro)

#### Import the data into a DataFrame

In [None]:
df_ar_anthropology = pd.read_json(f"{input_directory}/ar_anthropology.jsonl", lines=True)

In [None]:
# Extract the year using regex
df_ar_anthropology['Published'] = df_ar_anthropology['Vol/Year/Page Range'].str.extract(r'\((\d{4})\)')

In [None]:
df_ar_anthropology['Published'] = pd.to_datetime(
    df_ar_anthropology['Published'], format='mixed', errors='coerce'
)

In [None]:
df_ar_anthropology.rename(columns={
    'Vol/Year/Page Range': 'Vol/Issue',
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Annual Review of Anthropology'
id = 'aran'
path = os.path.join(output_directory, id)

In [None]:
df_ar_anthropology['Journal'] = journal

In [None]:
df_ar_anthropology['ID'] = id + df_ar_anthropology.index.astype(str).str.zfill(6)

In [None]:
df_ar_anthropology.to_json(f"{output_directory}/ar_anthropology.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs1(df_ar_anthropology, path)

### [Journal of Human Evolution](https://www.sciencedirect.com/journal/journal-of-human-evolution)

#### Import the data into a DataFrame

In [None]:
df_journal_human_evolution_open_access = pd.read_json(f"{input_directory}/journal_human_evolution_open_access.jsonl", lines=True)

In [None]:
df_journal_human_evolution_open_access['Published'] = pd.to_datetime(df_journal_human_evolution_open_access['Published'], unit='ms')

In [None]:
df_journal_human_evolution_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Journal of Human Evolution'
id = 'jhue'
path = os.path.join(output_directory, id)

In [None]:
df_journal_human_evolution_open_access['Journal'] = journal

In [None]:
df_journal_human_evolution_open_access['ID'] = id + df_journal_human_evolution_open_access.index.astype(str).str.zfill(6)

In [None]:
df_journal_human_evolution_open_access.to_json(f"{output_directory}/journal_human_evolution_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_journal_human_evolution_open_access, path)

## Applied Social Sciences

### [Journal of Applied Social Science](https://journals.sagepub.com/home/jax)

#### Import the data into a DataFrame

In [None]:
df_journal_applied_social_science_open_access = pd.read_json(f"{input_directory}/journal_applied_social_science_open_access.jsonl", lines=True)

In [None]:
df_journal_applied_social_science_open_access['Published'] = pd.to_datetime(df_journal_applied_social_science_open_access['Published'], unit='ms')

In [None]:
df_journal_applied_social_science_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Journal of Applied Social Science'
id = 'jasc'
path = os.path.join(output_directory, id)

In [None]:
df_journal_applied_social_science_open_access['Journal'] = journal

In [None]:
df_journal_applied_social_science_open_access['ID'] = id + df_journal_applied_social_science_open_access.index.astype(str).str.zfill(6)

In [None]:
df_journal_applied_social_science_open_access.to_json(f"{output_directory}/journal_applied_social_science_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_journal_applied_social_science_open_access, path)

### [Journal of Social Issues](https://spssi.onlinelibrary.wiley.com/journal/15404560)

#### Import the data into a DataFrame

In [None]:
df_journal_social_issues_open_access = pd.read_json(f"{input_directory}/journal_social_issues_open_access.jsonl", lines=True)

In [None]:
df_journal_social_issues_open_access['Published'] = pd.to_datetime(df_journal_social_issues_open_access['Published'], unit='ms')

In [None]:
df_journal_social_issues_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Journal of Social Issues'
id = 'jsoi'
path = os.path.join(output_directory, id)

In [None]:
df_journal_social_issues_open_access['Journal'] = journal

In [None]:
df_journal_social_issues_open_access['ID'] = id + df_journal_social_issues_open_access.index.astype(str).str.zfill(6)

In [None]:
df_journal_social_issues_open_access.to_json(f"{output_directory}/journal_social_issues_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_journal_social_issues_open_access, path)

### [Social Science & Medicine](https://www.sciencedirect.com/journal/social-science-and-medicine)

#### Import the data into a DataFrame

In [None]:
df_social_science_medicine_open_access = pd.read_json(f"{input_directory}/social_science_medicine_open_access.jsonl", lines=True)

In [None]:
df_social_science_medicine_open_access['Published'] = pd.to_datetime(df_social_science_medicine_open_access['Published'], unit='ms')

In [None]:
df_social_science_medicine_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Social Science & Medicine'
id = 'socm'
path = os.path.join(output_directory, id)

In [None]:
df_social_science_medicine_open_access['Journal'] = journal

In [None]:
df_social_science_medicine_open_access['ID'] = id + df_social_science_medicine_open_access.index.astype(str).str.zfill(6)

In [None]:
df_social_science_medicine_open_access.to_json(f"{output_directory}/social_science_medicine_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_social_science_medicine_open_access, path)

## Linguistics, Literature and Arts

### [Applied Corpus Linguistics](https://www.sciencedirect.com/journal/applied-corpus-linguistics)

#### Import the data into a DataFrame

In [None]:
df_applied_corpus_linguistics_open_access = pd.read_json(f"{input_directory}/applied_corpus_linguistics_open_access.jsonl", lines=True)

In [None]:
df_applied_corpus_linguistics_open_access['Published'] = pd.to_datetime(df_applied_corpus_linguistics_open_access['Published'], unit='ms')

In [None]:
df_applied_corpus_linguistics_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Applied Corpus Linguistics'
id = 'apcl'
path = os.path.join(output_directory, id)

In [None]:
df_applied_corpus_linguistics_open_access['Journal'] = journal

In [None]:
df_applied_corpus_linguistics_open_access['ID'] = id + df_applied_corpus_linguistics_open_access.index.astype(str).str.zfill(6)

In [None]:
df_applied_corpus_linguistics_open_access.to_json(f"{output_directory}/applied_corpus_linguistics_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_applied_corpus_linguistics_open_access, path)

### [Journal of English Linguistics](https://journals.sagepub.com/home/eng)

#### Import the data into a DataFrame

In [None]:
df_journal_english_linguistics_open_access = pd.read_json(f"{input_directory}/journal_english_linguistics_open_access.jsonl", lines=True)

In [None]:
df_journal_english_linguistics_open_access['Published'] = pd.to_datetime(df_journal_english_linguistics_open_access['Published'], unit='ms')

In [None]:
df_journal_english_linguistics_open_access.rename(columns={
    'Area of Knowledge': 'Discipline'
}, inplace=True)

#### Enrich the DataFrame, export to a file and scrape the HTML documents

In [None]:
journal = 'Journal of English Linguistics'
id = 'jenl'
path = os.path.join(output_directory, id)

In [None]:
df_journal_english_linguistics_open_access['Journal'] = journal

In [None]:
df_journal_english_linguistics_open_access['ID'] = id + df_journal_english_linguistics_open_access.index.astype(str).str.zfill(6)

In [None]:
df_journal_english_linguistics_open_access.to_json(f"{output_directory}/journal_english_linguistics_open_access.jsonl", orient='records', lines=True)

In [None]:
scrape_html_docs2(df_journal_english_linguistics_open_access, path)

## Concatenate the DataFrames for inspection

In [None]:
# Per-journal DataFrames, in the order the journals appear in this notebook
journal_frames = [
    # Health Sciences
    df_nature_medicine_open_access,
    df_ar_public_health,
    df_lancet_public_health_open_access,
    df_new_england_journal_of_medicine_open_access,
    # Biological Sciences
    df_cell_open_access,
    df_american_journal_human_biology_open_access,
    # Human Sciences
    df_ar_anthropology,
    df_journal_human_evolution_open_access,
    # Applied Social Sciences
    df_journal_applied_social_science_open_access,
    df_journal_social_issues_open_access,
    df_social_science_medicine_open_access,
    # Linguistics, literature and arts
    df_applied_corpus_linguistics_open_access,
    df_journal_english_linguistics_open_access,
]

# Stack them into one frame with a fresh 0..n-1 index
df_concatenated = pd.concat(journal_frames, ignore_index=True)

In [None]:
df_concatenated

### Export to a file

In [None]:
df_concatenated.to_excel(f"{output_directory}/df_concatenated.xlsx")