<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 3.1 - eyamrog

The aim of this phase is to develop solutions to scrape text from each journal's article HTML page.

## Required Python packages

- beautifulsoup4
- lxml
- pandas
- requests
- selenium
- tqdm

## Import the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import sys
import time
import logging
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Define input and output variables

In [2]:
input_directory = 'cl_st2_ph2_eyamrog'
output_directory = 'cl_st2_ph31_eyamrog'

## Create output directory

In [3]:
# Create the output directory unless it is already there.
# os.path.isdir (rather than os.path.exists) is used so that a regular file that
# happens to carry the same name is not mistaken for the directory.
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        # exist_ok=True tolerates the directory appearing between the check and the call;
        # it still raises an OSError when the name is taken by a non-directory.
        os.makedirs(output_directory, exist_ok=True)
        print('Output directory successfully created.')
    except OSError as e:
        print('Failed to create the directory:', e)
        sys.exit(1)

Output directory already exists.


### Create output subdirectories

In [3]:
def create_directory(path):
    """Creates a subdirectory if it doesn't exist.

    Prints a message telling whether the directory was already present or has
    just been created. If the directory cannot be created (including the case
    where `path` is occupied by something that is not a directory), the error
    is printed and the interpreter is asked to exit with status 1.

    Args:
        path: Path of the directory to create; missing parents are created too.

    Raises:
        SystemExit: With code 1 when the directory cannot be created.
    """
    # isdir, not exists: a regular file at `path` must not be reported as an
    # existing directory, otherwise later writes into it fail far from the cause.
    if os.path.isdir(path):
        print(f"Directory already exists: {path}")
        return

    try:
        # exist_ok=True tolerates the directory appearing between the check and the call
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        print(f"Failed to create the {path} directory: {e}")
        sys.exit(1)

    print(f"Successfully created the directory: {path}")

## Set up logging

In [4]:
log_filename = f"{output_directory}/{output_directory}.log"

In [5]:
# Send INFO-and-above records to the log file.
# force=True removes any handlers already attached to the root logger first;
# without it basicConfig does nothing when handlers exist (e.g. when this cell is
# re-run in the same kernel), and the records would never reach the log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_filename,
    force=True
)

## Health Sciences

### [Nature Medicine](https://www.nature.com/nm/)

#### Create output subdirectory

In [6]:
# 'Nature Medicine'
id = 'natm'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\natm


#### Import the data into a DataFrame

In [7]:
df_nature_medicine_open_access = pd.read_json(f"{input_directory}/nature_medicine_open_access.jsonl", lines=True)

In [8]:
df_nature_medicine_open_access['Published'] = pd.to_datetime(df_nature_medicine_open_access['Published'], unit='ms')

In [9]:
df_nature_medicine_open_access = df_nature_medicine_open_access.loc[:4]

In [10]:
natm_urls = df_nature_medicine_open_access['URL'].tolist()
natm_urls

['https://www.nature.com/articles/s41591-022-02075-9',
 'https://www.nature.com/articles/s41591-022-02109-2',
 'https://www.nature.com/articles/s41591-022-02049-x',
 'https://www.nature.com/articles/s41591-022-02051-3',
 'https://www.nature.com/articles/s41591-022-02046-0']

In [13]:
def extract_text(df, path):
    """Extracts text from HTML files and saves as text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and the
    extracted text is written to ``<path>/<ID>.txt`` (UTF-8, LF line endings), one
    title, heading or paragraph per line. The article title is taken from the
    ``h1.c-article-title`` element; headings and paragraphs are taken from the
    ``<section>`` elements inside ``div.c-article-body`` in document order.
    IDs without an HTML file are logged as errors and skipped.

    Args:
        df: DataFrame with an 'ID' column.
        path: Directory holding the HTML files; the text files are written there too.
    """

    def clean_text(tag):
        """Returns the tag's text with every run of whitespace collapsed to one space."""
        return ' '.join(tag.get_text(' ', strip=True).split())

    def is_nested_section(section, root):
        """Tells whether another <section> lies between `section` and `root`."""
        for parent in section.parents:
            if parent is root:
                return False
            if parent.name == 'section':
                return True
        return False

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Collected output, one entry per line of the text file
        lines = []

        # Extract the title
        title_tag = soup.find('h1', class_='c-article-title', attrs={'data-test': 'article-title'})
        if title_tag:
            lines.append(clean_text(title_tag))

        # Extract the article body
        article_body_tag = soup.find('div', class_='c-article-body')
        if article_body_tag:
            for section in article_body_tag.find_all('section'):
                # A nested section is already covered by the walk over its outermost
                # ancestor; visiting it again would write its paragraphs twice.
                if is_nested_section(section, article_body_tag):
                    continue

                # Walk headings and paragraphs in document order so that every
                # subsection heading is kept and stays above its own paragraphs.
                for element in section.find_all(['h2', 'h3', 'p']):
                    css_classes = element.get('class') or []

                    if element.name == 'h2':
                        # Section title
                        if 'c-article-section__title' in css_classes:
                            lines.append(clean_text(element))
                    elif element.name == 'h3':
                        # Subsection title
                        if 'c-article__sub-heading' in css_classes:
                            lines.append(clean_text(element))
                    else:
                        # Paragraph: drop <sup> elements (reference markers) first.
                        # NOTE(review): this removes every <sup>, so genuine
                        # superscripts (exponents, units) are dropped as well.
                        for sup_tag in element.find_all('sup'):
                            sup_tag.decompose()  # Completely removes the element
                        lines.append(clean_text(element))

        # An empty result usually means the page layout was not the expected one
        if not lines:
            logging.warning(f"No text extracted from {html_file}")

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(''.join(f"{line}\n" for line in lines))

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [14]:
extract_text(df_nature_medicine_open_access, path)

### [Annual Review of Public Health](https://www.annualreviews.org/content/journals/publhealth)

#### Create output subdirectory

In [15]:
# 'Annual Review of Public Health'
id = 'arph'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\arph


#### Import the data into a DataFrame

In [16]:
df_ar_public_health = pd.read_json(f"{input_directory}/ar_public_health.jsonl", lines=True)

In [17]:
df_ar_public_health['Published'] = pd.to_datetime(df_ar_public_health['Published'], unit='ms')

In [18]:
df_ar_public_health = df_ar_public_health.loc[:4]

In [19]:
arph_urls = df_ar_public_health['URL'].tolist()
arph_urls

['https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-121019-053834',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-051920-114020',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-012420-105104',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-051920-110928',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-publhealth-060220-042648']

In [34]:
def extract_text(df, path):
    """Extracts text from structured HTML pages and saves as text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and the
    extracted text is written to ``<path>/<ID>.txt`` (UTF-8, LF line endings), one
    title, heading or paragraph per line. The article title is taken from
    ``span.article-title``; section headings and ``<p>`` paragraphs are taken from
    every ``div.articleSection``. A heading is written only the first time it is seen.
    IDs without an HTML file are logged as errors and skipped.

    Args:
        df: DataFrame with an 'ID' column.
        path: Directory holding the HTML files; the text files are written there too.
    """

    def clean_text(tag):
        """Returns the tag's text with every run of whitespace collapsed to one space."""
        return ' '.join(tag.get_text(' ', strip=True).split())

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Collected output, one entry per line of the text file
        lines = []
        # Headings already written. Duplicates are detected against this set rather
        # than by a substring search in the accumulated text, which would wrongly
        # drop a heading such as "Methods" once that word occurs in any paragraph.
        emitted_titles = set()

        # Extract the title
        title_tag = soup.find('span', class_='article-title')
        if title_tag:
            title = clean_text(title_tag)
            lines.append(title)
            emitted_titles.add(title)

        # Extract article sections
        for section in soup.find_all('div', class_='articleSection'):
            # Extract section title
            section_title_tag = section.find('div', class_='tl-main-part title')
            if section_title_tag:
                section_title = clean_text(section_title_tag)
                # Empty headings are never written; repeated headings only once
                if section_title and section_title not in emitted_titles:
                    emitted_titles.add(section_title)
                    lines.append(section_title)

            # Extract paragraphs
            # NOTE(review): if articleSection blocks are nested, the inner paragraphs
            # are collected once per enclosing block - confirm against a real page.
            for paragraph in section.find_all('p'):
                lines.append(clean_text(paragraph))

        # An empty result usually means the page layout was not the expected one
        if not lines:
            logging.warning(f"No text extracted from {html_file}")

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(''.join(f"{line}\n" for line in lines))

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [35]:
extract_text(df_ar_public_health, path)

### [Lancet Public Health](https://www.thelancet.com/journals/lanpub/home)

#### Create output subdirectory

In [6]:
# 'Lancet Public Health'
id = 'laph'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\laph


#### Import the data into a DataFrame

In [7]:
df_lancet_public_health_open_access = pd.read_json(f"{input_directory}/lancet_public_health_open_access.jsonl", lines=True)

In [8]:
df_lancet_public_health_open_access['Published'] = pd.to_datetime(df_lancet_public_health_open_access['Published'], unit='ms')

In [9]:
df_lancet_public_health_open_access = df_lancet_public_health_open_access.loc[:4]

In [10]:
laph_urls = df_lancet_public_health_open_access['URL'].tolist()
laph_urls

['https://www.thelancet.com/journals/lanpub/article/PIIS2468-2667(19)30188-4/fulltext',
 'https://www.thelancet.com/journals/lanpub/article/PIIS2468-2667(19)30219-1/fulltext',
 'https://www.thelancet.com/journals/lanpub/article/PIIS2468-2667(19)30226-9/fulltext',
 'https://www.thelancet.com/journals/lanpub/article/PIIS2468-2667(19)30231-2/fulltext',
 'https://www.thelancet.com/journals/lanpub/article/PIIS2468-2667(19)30230-0/fulltext']

In [11]:
def extract_text(df, path):
    """Extracts text from HTML files and saves as text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and the
    extracted text is written to ``<path>/<ID>.txt`` (UTF-8, LF line endings), one
    title, heading or paragraph per line. The output consists of the article title
    (``h1[property=name]``), the abstract (``section[property=abstract]``) and the
    body (``section[property=articleBody]``); body headings and paragraphs are
    written in document order. IDs without an HTML file are logged as errors and
    skipped.

    Args:
        df: DataFrame with an 'ID' column.
        path: Directory holding the HTML files; the text files are written there too.
    """

    def clean_text(tag):
        """Returns the tag's text with every run of whitespace collapsed to one space."""
        return ' '.join(tag.get_text(' ', strip=True).split())

    def is_nested_section(section, root):
        """Tells whether another <section> lies between `section` and `root`."""
        for parent in section.parents:
            if parent is root:
                return False
            if parent.name == 'section':
                return True
        return False

    def is_body_element(tag):
        """Matches h2/h3 headings and div[role=paragraph] paragraphs."""
        return tag.name in ('h2', 'h3') or (tag.name == 'div' and tag.get('role') == 'paragraph')

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Collected output, one entry per line of the text file
        lines = []

        # Extract the title
        title_tag = soup.find('h1', property='name')
        if title_tag:
            lines.append(clean_text(title_tag))

        # Extract the abstract
        abstract_section = soup.find('section', property='abstract')
        if abstract_section:
            summary_header_tag = abstract_section.find('h2', property='name')
            if summary_header_tag:
                lines.append(clean_text(summary_header_tag))

            for subsection in abstract_section.find_all('section'):
                subsection_title_tag = subsection.find('h3')
                if subsection_title_tag:
                    lines.append(clean_text(subsection_title_tag))

                # Extract paragraphs within each section
                for paragraph in subsection.find_all('div', role='paragraph'):
                    lines.append(clean_text(paragraph))

        # Extract the article body
        body_section = soup.find('section', property='articleBody')
        if body_section:
            for section in body_section.find_all('section'):
                # A nested section is already covered by the walk over its outermost
                # ancestor; visiting it again would write its paragraphs twice.
                if is_nested_section(section, body_section):
                    continue

                # Walk headings and paragraphs in document order so that every
                # subsection heading is kept and stays above its own paragraphs.
                for element in section.find_all(is_body_element):
                    if element.name == 'div':
                        # Remove reference citations embedded in <span> tags
                        for ref_tag in element.find_all('span', class_='dropBlock reference-citations'):
                            ref_tag.decompose()  # Completely removes the element
                    lines.append(clean_text(element))

        # An empty result usually means the page layout was not the expected one
        if not lines:
            logging.warning(f"No text extracted from {html_file}")

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(''.join(f"{line}\n" for line in lines))

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [12]:
extract_text(df_lancet_public_health_open_access, path)

### [New England Journal of Medicine](https://www.nejm.org/)

#### Create output subdirectory

In [13]:
# 'New England Journal of Medicine'
id = 'nejm'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\nejm


#### Import the data into a DataFrame

In [14]:
df_new_england_journal_of_medicine_open_access = pd.read_json(f"{input_directory}/new_england_journal_of_medicine_open_access.jsonl", lines=True)

In [15]:
df_new_england_journal_of_medicine_open_access['Published'] = pd.to_datetime(df_new_england_journal_of_medicine_open_access['Published'], unit='ms')

In [16]:
df_new_england_journal_of_medicine_open_access = df_new_england_journal_of_medicine_open_access.loc[:4]

In [17]:
nejm_urls = df_new_england_journal_of_medicine_open_access['URL'].tolist()
nejm_urls

['https://www.nejm.org/doi/full/10.1056/NEJMoa1910355',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa1817591',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa1908490',
 'https://www.nejm.org/doi/full/10.1056/NEJMoa1913662',
 'https://www.nejm.org/doi/full/10.1056/NEJMsa1901383']

In [26]:
def extract_text(df, path):
    """Extracts text from HTML files and saves as text files.

    For every value in ``df['ID']`` the file ``<path>/<ID>.html`` is parsed and the
    extracted text is written to ``<path>/<ID>.txt`` (UTF-8, LF line endings), one
    title, heading or paragraph per line. The output consists of the article title
    (``h1[property=name]``), the abstract (``section[property=abstract]``) and the
    body found in ``div.core-container`` inside ``section[property=articleBody]``.
    Paragraphs that precede the first body section are written under an inserted
    'Introduction' heading. ``<sup>`` elements are removed from paragraphs before
    their text is read. IDs without an HTML file are logged as errors and skipped.

    Args:
        df: DataFrame with an 'ID' column.
        path: Directory holding the HTML files; the text files are written there too.
    """

    def clean_text(tag):
        """Returns the tag's text with every run of whitespace collapsed to one space."""
        return ' '.join(tag.get_text(' ', strip=True).split())

    def paragraph_lines(container, recursive=True):
        """Returns the cleaned text of each div[role=paragraph] found in `container`.

        With recursive=False only direct children of `container` are considered.
        """
        collected = []
        for paragraph in container.find_all('div', role='paragraph', recursive=recursive):
            # Remove <sup> elements (reference markers) before reading the text.
            # NOTE(review): this removes every <sup>, so genuine superscripts
            # (exponents, units) are dropped as well.
            for sup_tag in paragraph.find_all('sup'):
                sup_tag.decompose()  # Completely removes the element
            collected.append(clean_text(paragraph))
        return collected

    for article_id in df['ID']:
        html_file = os.path.join(path, f"{article_id}.html")
        txt_file = os.path.join(path, f"{article_id}.txt")

        # Check if the HTML file exists
        if not os.path.exists(html_file):
            logging.error(f"Skipping {html_file}: File not found")
            continue

        # Read HTML content
        with open(html_file, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')

        # Collected output, one entry per line of the text file
        lines = []

        # Extract the title
        title_tag = soup.find('h1', property='name')
        if title_tag:
            lines.append(clean_text(title_tag))

        # Extract the abstract
        abstract_section = soup.find('section', property='abstract')
        if abstract_section:
            abstract_tag = abstract_section.find('h2', property='name')
            if abstract_tag:
                lines.append(clean_text(abstract_tag))

            # Only the direct child sections of the abstract are visited
            for section_h3 in abstract_section.find_all('section', recursive=False):
                section_h3_title_tag = section_h3.find('h3')
                if section_h3_title_tag:
                    lines.append(clean_text(section_h3_title_tag))

                # Extract paragraphs within each section
                lines.extend(paragraph_lines(section_h3))

        # Extract the article body
        body_section = soup.find('section', property='articleBody')
        if body_section:
            body_section_core_container = body_section.find('div', class_='core-container')
            if body_section_core_container:
                # Paragraphs that are direct children of the container precede the
                # first section; they are labelled with an inserted 'Introduction'
                # heading, which is written only when such paragraphs exist.
                introduction = paragraph_lines(body_section_core_container, recursive=False)
                if introduction:
                    lines.append('Introduction')
                    lines.extend(introduction)

                # Extract sectioned content
                for section_h2 in body_section_core_container.find_all('section', recursive=False):
                    # Extract section title (h2)
                    section_h2_title_tag = section_h2.find('h2')
                    if section_h2_title_tag:
                        lines.append(clean_text(section_h2_title_tag))

                    # Extract the paragraphs placed directly under the h2 section, if any
                    lines.extend(paragraph_lines(section_h2, recursive=False))

                    # NOTE(review): this search is recursive; if a subsection itself
                    # contains sections, their paragraphs are collected once per
                    # enclosing subsection - confirm against a real page.
                    for section_h3 in section_h2.find_all('section'):
                        # Extract subsection title (h3)
                        section_h3_title_tag = section_h3.find('h3')
                        if section_h3_title_tag:
                            lines.append(clean_text(section_h3_title_tag))

                        # Extract h3 paragraphs
                        lines.extend(paragraph_lines(section_h3))

        # An empty result usually means the page layout was not the expected one
        if not lines:
            logging.warning(f"No text extracted from {html_file}")

        # Save text to a text file
        with open(txt_file, 'w', encoding='utf-8', newline='\n') as file:
            file.write(''.join(f"{line}\n" for line in lines))

        logging.info(f"Saved text for {article_id} to {txt_file}")

In [27]:
extract_text(df_new_england_journal_of_medicine_open_access, path)

## Biological Sciences

### [Cell](https://www.cell.com/cell/home)

#### Create output subdirectory

In [30]:
# 'Cell'
id = 'cell'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\cell


#### Import the data into a DataFrame

In [31]:
df_cell_open_access = pd.read_json(f"{input_directory}/cell_open_access.jsonl", lines=True)

In [32]:
df_cell_open_access['Published'] = pd.to_datetime(df_cell_open_access['Published'], unit='ms')

In [33]:
df_cell_open_access = df_cell_open_access.loc[:4]

In [34]:
cell_urls = df_cell_open_access['URL'].tolist()
cell_urls

['https://www.cell.com/cell/fulltext/S0092-8674(19)31270-X',
 'https://www.cell.com/cell/fulltext/S0092-8674(19)31378-9',
 'https://www.cell.com/cell/fulltext/S0092-8674(19)31328-5',
 'https://www.cell.com/cell/fulltext/S0092-8674(19)31283-8',
 'https://www.cell.com/cell/fulltext/S0092-8674(19)31317-0']

### [American Journal of Human Biology](https://onlinelibrary.wiley.com/journal/15206300)

#### Create output subdirectory

In [35]:
# 'American Journal of Human Biology'
id = 'ajhb'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\ajhb


#### Import the data into a DataFrame

In [36]:
df_american_journal_human_biology_open_access = pd.read_json(f"{input_directory}/american_journal_human_biology_open_access.jsonl", lines=True)

In [37]:
df_american_journal_human_biology_open_access['Published'] = pd.to_datetime(df_american_journal_human_biology_open_access['Published'], unit='ms')

In [38]:
df_american_journal_human_biology_open_access = df_american_journal_human_biology_open_access.loc[:4]

In [39]:
ajhb_urls = df_american_journal_human_biology_open_access['URL'].tolist()
ajhb_urls

['https://onlinelibrary.wiley.com/doi/10.1002/ajhb.23389',
 'https://onlinelibrary.wiley.com/doi/10.1002/ajhb.23350',
 'https://onlinelibrary.wiley.com/doi/10.1002/ajhb.23340',
 'https://onlinelibrary.wiley.com/doi/10.1002/ajhb.23339',
 'https://onlinelibrary.wiley.com/doi/10.1002/ajhb.23407']

## Human Sciences

### [Annual Review of Anthropology](https://www.annualreviews.org/content/journals/anthro)

#### Create output subdirectory

In [40]:
# 'Annual Review of Anthropology'
id = 'aran'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\aran


#### Import the data into a DataFrame

In [41]:
df_ar_anthropology = pd.read_json(f"{input_directory}/ar_anthropology.jsonl", lines=True)

In [42]:
df_ar_anthropology['Published'] = pd.to_datetime(df_ar_anthropology['Published'], unit='ms')

In [43]:
df_ar_anthropology = df_ar_anthropology.loc[:4]

In [44]:
aran_urls = df_ar_anthropology['URL'].tolist()
aran_urls

['https://www.annualreviews.org/content/journals/10.1146/annurev-an-51-082222-100001',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-070120-111609',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-041420-112543',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-041420-102158',
 'https://www.annualreviews.org/content/journals/10.1146/annurev-anthro-041420-013930']

### [Journal of Human Evolution](https://www.sciencedirect.com/journal/journal-of-human-evolution)

#### Create output subdirectory

In [45]:
# 'Journal of Human Evolution'
id = 'jhue'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\jhue


#### Import the data into a DataFrame

In [46]:
df_journal_human_evolution_open_access = pd.read_json(f"{input_directory}/journal_human_evolution_open_access.jsonl", lines=True)

In [47]:
df_journal_human_evolution_open_access['Published'] = pd.to_datetime(df_journal_human_evolution_open_access['Published'], unit='ms')

In [48]:
df_journal_human_evolution_open_access = df_journal_human_evolution_open_access.loc[:4]

In [49]:
jhue_urls = df_journal_human_evolution_open_access['URL'].tolist()
jhue_urls

['https://www.sciencedirect.com//science/article/pii/S0047248420300294',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301123',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301135',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301305',
 'https://www.sciencedirect.com//science/article/pii/S0047248420301615']

## Applied Social Sciences

### [Journal of Applied Social Science](https://journals.sagepub.com/home/jax)

#### Create output subdirectory

In [50]:
# 'Journal of Applied Social Science'
id = 'jasc'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\jasc


#### Import the data into a DataFrame

In [51]:
df_journal_applied_social_science_open_access = pd.read_json(f"{input_directory}/journal_applied_social_science_open_access.jsonl", lines=True)

In [52]:
df_journal_applied_social_science_open_access['Published'] = pd.to_datetime(df_journal_applied_social_science_open_access['Published'], unit='ms')

In [53]:
df_journal_applied_social_science_open_access = df_journal_applied_social_science_open_access.loc[:4]

In [54]:
jasc_urls = df_journal_applied_social_science_open_access['URL'].tolist()
jasc_urls

['https://journals.sagepub.com/doi/abs/10.1177/1936724420980374',
 'https://journals.sagepub.com/doi/abs/10.1177/19367244211003471',
 'https://journals.sagepub.com/doi/abs/10.1177/1936724421998275',
 'https://journals.sagepub.com/doi/abs/10.1177/19367244211000709',
 'https://journals.sagepub.com/doi/abs/10.1177/19367244211000271']

### [Journal of Social Issues](https://spssi.onlinelibrary.wiley.com/journal/15404560)

#### Create output subdirectory

In [55]:
# 'Journal of Social Issues'
id = 'jsoi'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\jsoi


#### Import the data into a DataFrame

In [56]:
df_journal_social_issues_open_access = pd.read_json(f"{input_directory}/journal_social_issues_open_access.jsonl", lines=True)

In [57]:
df_journal_social_issues_open_access['Published'] = pd.to_datetime(df_journal_social_issues_open_access['Published'], unit='ms')

In [58]:
df_journal_social_issues_open_access = df_journal_social_issues_open_access.loc[:4]

In [59]:
jsoi_urls = df_journal_social_issues_open_access['URL'].tolist()
jsoi_urls

['https://spssi.onlinelibrary.wiley.com/doi/10.1111/josi.12376',
 'https://spssi.onlinelibrary.wiley.com/doi/10.1111/josi.12369',
 'https://spssi.onlinelibrary.wiley.com/doi/10.1111/josi.12360',
 'https://spssi.onlinelibrary.wiley.com/doi/10.1111/josi.12398',
 'https://spssi.onlinelibrary.wiley.com/doi/10.1111/josi.12399']

### [Social Science & Medicine](https://www.sciencedirect.com/journal/social-science-and-medicine)

#### Create output subdirectory

In [60]:
# 'Social Science & Medicine'
id = 'socm'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\socm


#### Import the data into a DataFrame

In [61]:
df_social_science_medicine_open_access = pd.read_json(f"{input_directory}/social_science_medicine_open_access.jsonl", lines=True)

In [62]:
df_social_science_medicine_open_access['Published'] = pd.to_datetime(df_social_science_medicine_open_access['Published'], unit='ms')

In [63]:
df_social_science_medicine_open_access = df_social_science_medicine_open_access.loc[:4]

In [64]:
socm_urls = df_social_science_medicine_open_access['URL'].tolist()
socm_urls

['https://www.sciencedirect.com//science/article/pii/S0277953619305933',
 'https://www.sciencedirect.com//science/article/pii/S0277953619306288',
 'https://www.sciencedirect.com//science/article/pii/S0277953619306379',
 'https://www.sciencedirect.com//science/article/pii/S0277953619306434',
 'https://www.sciencedirect.com//science/article/pii/S0277953619306628']

## Linguistics, Literature and Arts

### [Applied Corpus Linguistics](https://www.sciencedirect.com/journal/applied-corpus-linguistics)

#### Create output subdirectory

In [65]:
# 'Applied Corpus Linguistics'
id = 'apcl'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\apcl


#### Import the data into a DataFrame

In [66]:
df_applied_corpus_linguistics_open_access = pd.read_json(f"{input_directory}/applied_corpus_linguistics_open_access.jsonl", lines=True)

In [67]:
df_applied_corpus_linguistics_open_access['Published'] = pd.to_datetime(df_applied_corpus_linguistics_open_access['Published'], unit='ms')

In [68]:
df_applied_corpus_linguistics_open_access = df_applied_corpus_linguistics_open_access.loc[:4]

In [69]:
apcl_urls = df_applied_corpus_linguistics_open_access['URL'].tolist()
apcl_urls

['https://www.sciencedirect.com//science/article/pii/S2666799121000010',
 'https://www.sciencedirect.com//science/article/pii/S2666799121000083',
 'https://www.sciencedirect.com//science/article/pii/S2666799121000101',
 'https://www.sciencedirect.com//science/article/pii/S2666799121000113',
 'https://www.sciencedirect.com//science/article/pii/S266679912200003X']

### [Journal of English Linguistics](https://journals.sagepub.com/home/eng)

#### Create output subdirectory

In [70]:
# 'Journal of English Linguistics'
id = 'jenl'
path = os.path.join(output_directory, id)
create_directory(path)

Directory already exists: cl_st2_ph31_eyamrog\jenl


#### Import the data into a DataFrame

In [71]:
df_journal_english_linguistics_open_access = pd.read_json(f"{input_directory}/journal_english_linguistics_open_access.jsonl", lines=True)

In [72]:
df_journal_english_linguistics_open_access['Published'] = pd.to_datetime(df_journal_english_linguistics_open_access['Published'], unit='ms')

In [73]:
df_journal_english_linguistics_open_access = df_journal_english_linguistics_open_access.loc[:4]

In [74]:
jenl_urls = df_journal_english_linguistics_open_access['URL'].tolist()
jenl_urls

['https://journals.sagepub.com/doi/abs/10.1177/0075424220911067',
 'https://journals.sagepub.com/doi/abs/10.1177/0075424220938949',
 'https://journals.sagepub.com/doi/abs/10.1177/0075424220945008',
 'https://journals.sagepub.com/doi/abs/10.1177/0075424220982063',
 'https://journals.sagepub.com/doi/abs/10.1177/0075424220982649']