In [1]:
! pip install fake_useragent

Collecting fake_useragent
  Downloading fake_useragent-1.3.0-py3-none-any.whl (15 kB)
Installing collected packages: fake_useragent
Successfully installed fake_useragent-1.3.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd '/content/drive/My Drive/HSE/practice'
! ls

/content/drive/My Drive/HSE/practice
crawler.ipynb					     PR_Language-and-Cognition.csv
PR_Constructions-and-Frames.csv			     PR_Open-Linguistics.csv
PR_International-Journal-of-Corpus-Linguistics.csv   PR_Research-in-Language.csv
PR_Journal-of-African-Languages-and-Linguistics.csv  Журналы.gsheet
PR_Journal-of-Pidgin-and-Creole-Languages.csv	     Практика.gsheet


In [4]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
import time
import random
from tqdm.auto import tqdm
ua = UserAgent()
session = requests.session()

## John Benjamins e-Platform (https://www.jbe-platform.com/)

In [None]:
def get_issues(volume_url):
  head = 'https://www.jbe-platform.com'
  volume_url = head + volume_url
  if 'volume' not in volume_url:
    return [volume_url]
  req = session.get(volume_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  issues = [head + a['href'] for a in soup.find_all('a')]
  return issues

In [None]:
def parse_home_page(home_url):
  req = session.get(home_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  journal = soup.find('meta', {'name': 'citation_title'})['content']
  issue_bar = soup.find('ul', {'class': 'issue-list'})
  volume_urls = [el.attrs['data-ahah-href'] if el.has_attr('data-ahah-href') else el.attrs['href'] for el in issue_bar.find_all('a')]
  issue_urls = [issue for volume in volume_urls for issue in get_issues(volume)]
  return issue_urls, journal

# journal_number = str(15699811)
# #journal_number = str(18761941)
# url = f'https://www.jbe-platform.com/content/journals/{journal_number}/browse?page=previous-issues'
# parse_home_page(url)

In [None]:
def parse_issue_page(issue_url):
  head = 'https://www.jbe-platform.com'
  req = session.get(issue_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  year = soup.find('h2', {'class': 'issueTitle'})
  year = int(year.text.strip()[-4:])
  issue_bar = soup.find('div', {'class': 'issueToc'})
  articles = issue_bar.find_all('span', {'class': 'articleTitle'})
  article_urls = [head + article.find('a')['href'] for article in articles]
  return year, article_urls

# for i in range(14, 0, -1):
#   issue_url = f'https://www.jbe-platform.com/content/journals/{journal_number}/{i}/2'
#   print(parse_issue_page(issue_url))
#   issue_url = f'https://www.jbe-platform.com/content/journals/{journal_number}/{i}/1'
#   print(parse_issue_page(issue_url))

In [None]:
def parse_article(article_url, year, journal):
  req = session.get(article_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  title_box = soup.find('div', {'class': 'title-box'})
  title = title_box.find('li')
  icon_tag = title.find('span', {'class': 'keyicon'})
  if icon_tag:
    icon_tag.clear()
  title = title_box.find_all(['h1', 'h2'])
  title = '. '.join([part.text.strip() for part in title])
  authors = title_box.find_all('a', {'class': 'nonDisambigAuthorLink'})
  authors = '; '.join([author.text for author in authors])
  doi = title_box.find('span', {'class': 'meta-value doi'}).find('a')['href']
  abstract = soup.find('div', {'id': 'abstract_content'})
  abstract_text = abstract.find('div', {'class': 'description'})
  if not abstract_text:
    abstract_text = abstract.find('p')
  abstract_text = abstract_text.text.lstrip('Abstract\n').strip()
  return [doi, authors, title, year, abstract_text, journal]

In [None]:
article_url = 'https://www.jbe-platform.com/content/journals/10.1075/cf.8.1.01cap'
req = session.get(article_url, headers={'User-Agent': ua.random})
page = req.text
soup = BeautifulSoup(page, 'html.parser')
abstract = soup.find('div', {'class': 'articleabstract'})
type(abstract)

KeyboardInterrupt: ignored

In [None]:
def run_all(url):
  issues, journal = parse_home_page(url)
  articles = []
  for i in tqdm(range(len(issues))):
    time.sleep(random.uniform(1.1, 5.2))
    articles.append(parse_issue_page(issues[i]))
  abstracts = []
  for year, issue in tqdm(articles):
    for article in issue:
      time.sleep(random.uniform(1.1, 5.2))
      parsed_article = parse_article(article, year, journal)
      abstracts.append(parsed_article)
  return abstracts

journal_number = str(15699870)
#journal_number = str(15699811)
#journal_number = str(18761941)
url = f'https://www.jbe-platform.com/content/journals/{journal_number}/browse?page=previous-issues'
abstracts = run_all(url)
abstracts

In [None]:
import csv

journal = '-'.join(abstracts[0][5].split())
file_name = f'PR_{journal}.csv'
with open(file_name, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['doi', 'author', 'title', 'year', 'abstract', 'journal'])
    writer.writerows(abstracts)

## Taylor and Francis Online (https://www.tandfonline.com/)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
from tqdm import tqdm

def get_issues(volume_url):
    head = 'https://www.tandfonline.com'
    volume_url = head + volume_url
    driver.get(volume_url)
    issues = [head + a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a.issue-link')]
    return issues

def parse_home_page(home_url):
    driver.get(home_url)
    volume_urls = [el.find_element(By.TAG_NAME, 'a').get_attribute('href') for el in driver.find_elements(By.CSS_SELECTOR, 'li.vol_li')]
    issue_urls = [issue for volume in volume_urls for issue in get_issues(volume)]
    return issue_urls

def parse_issue_page(issue_url):
    head = 'https://www.tandfonline.com'
    driver.get(issue_url)
    articles = driver.find_elements(By.CSS_SELECTOR, 'div.articleEntry')
    articles_info = [(article.find_element(By.CSS_SELECTOR, 'span.date').text[-4:], head + article.find_element(By.TAG_NAME, 'a').get_attribute('href')) for article in articles]
    return articles_info

def parse_article(article_url, year, journal):
    driver.get(article_url)
    title = driver.find_element(By.CSS_SELECTOR, 'meta[name="dc.Title"]').get_attribute('content')
    authors = driver.find_elements(By.CSS_SELECTOR, 'meta[name="dc.Creator"]')
    authors = '; '.join([author.get_attribute('content') for author in authors])
    doi = driver.find_element(By.CSS_SELECTOR, 'meta[name="dc.Source"]').get_attribute('content')
    abstract_text = driver.find_element(By.CSS_SELECTOR, 'div.abstractSection').text[8:]
    return [doi, authors, title, year, abstract_text, journal]

def run_all(url):
    issues = parse_home_page(url)
    articles = []
    for i in tqdm(range(len(issues))):
        time.sleep(random.uniform(1.1, 5.2))
        articles.append(parse_issue_page(issues[i]))
    abstracts = []
    for year, issue in tqdm(articles):
        for article in issue:
            time.sleep(random.uniform(1.1, 5.2))
            journal = 'International Journal of Multilingualism'
            parsed_article = parse_article(article, year, journal)
            abstracts.append(parsed_article)
    return abstracts

journal_code = 'rmjm20'
url = f'https://www.tandfonline.com/loi/{journal_code}'

# Инициализация драйвера
driver = webdriver.Chrome()

abstracts = run_all(url)

# Закрыть драйвер после завершения
driver.quit()

print(abstracts)


In [8]:
!apt-get update
!apt install -yq chromium-chromedriver
!pip install selenium

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to cloud.r-project.org (108.157.162.103)]                                                                                Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
0% [Waiting for headers] [2 InRelease 14.2 kB/110 kB 13%] [Connecting to cloud.                                                                               Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,347 kB]
Hit:9 https://ppa.launchpadconte

In [9]:
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [12]:
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

In [17]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def parse_home_page(home_url):
    driver.get(home_url)

    volume_urls = []

    # Находим все кнопки volume_link
    volume_buttons = driver.find_elements(By.CSS_SELECTOR, 'button.volume_link')
    for button in volume_buttons:
        button.click()

        # Ждем, пока ссылка станет видимой
        try:
            link_element = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, 'li.vol_li a'))
            )
            volume_urls.append(link_element.get_attribute('href'))
        except TimeoutException:
            # Если ссылка не стала видимой в течение 10 секунд, продолжаем со следующей кнопкой
            continue

    issue_urls = [issue for volume in volume_urls for issue in get_issues(volume)]
    return issue_urls

journal_code = 'rmjm20'
url = f'https://www.tandfonline.com/loi/{journal_code}'
parse_home_page(url)

[]

In [5]:
headers = {
    'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
}

In [7]:
def get_issues(volume_url):
  head = 'https://www.tandfonline.com'
  volume_url = head + volume_url
  req = session.get(volume_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  issues = [head + a['href'] for a in soup.find_all('a', {'class': 'issue-link'})]
  return issues

In [6]:
def parse_home_page(home_url):
  # req = session.get(home_url, headers={'User-Agent': ua.random})
  req = session.get(home_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  res = soup.find('li', {'class': 'vol_li'})
  # journal = soup.find('a', {'class': 'compact-nav--title'}).text.strip()
  # volume_urls = [el.find('a')['href'] for el in soup.find_all('li', {'class': 'vol_li'})]
  # issue_urls = [issue for volume in volume_urls for issue in get_issues(volume)]
  # return issue_urls, journal
  # return issue_urls
  return res

journal_code = 'rmjm20'
url = f'https://www.tandfonline.com/loi/{journal_code}'
parse_home_page(url)

In [None]:
def parse_issue_page(issue_url):
  head = 'https://www.tandfonline.com'
  req = session.get(issue_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  articles = soup.find_all('div', {'class': 'articleEntry'})
  articles_info = [(article.find('span', {'class': 'date'})[-4:], head + article.find('a')['href']) for article in articles]
  return articles_info

In [None]:
def parse_article(article_url, year, journal):
  req = session.get(article_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  title = soup.find('meta', {'name': 'dc.Title'})['content']
  authors = soup.find_all('meta', {'name': 'dc.Creator'})
  authors = '; '.join([author['content'] for author in authors])
  doi = soup.find('meta', {'name': 'dc.Source'})['content']
  abstract_text = soup.find('div', {'class': 'abstractSection'}).text[8:]
  return [doi, authors, title, year, abstract_text, journal]

In [None]:
def run_all(url):
  #issues, journal = parse_home_page(url)
  issues = parse_home_page(url)
  articles = []
  for i in tqdm(range(len(issues))):
    time.sleep(random.uniform(1.1, 5.2))
    articles.append(parse_issue_page(issues[i]))
  abstracts = []
  for year, issue in tqdm(articles):
    for article in issue:
      time.sleep(random.uniform(1.1, 5.2))
      journal = 'International Journal of Multilingualism'
      parsed_article = parse_article(article, year, journal)
      abstracts.append(parsed_article)
  return abstracts

journal_code = 'rmjm20'
url = f'https://www.tandfonline.com/loi/{journal_code}'
abstracts = run_all(url)
abstracts

0it [00:00, ?it/s]

0it [00:00, ?it/s]

[]

In [None]:
import csv

journal = '-'.join(abstracts[0][5].split())
file_name = f'PR_{journal}.csv'
with open(file_name, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['doi', 'author', 'title', 'year', 'abstract', 'journal'])
    writer.writerows(abstracts)

## DE GRUYTER (https://www.degruyter.com/)

In [5]:
def parse_home_page(home_url):
  req = session.get(home_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  journal = soup.find('meta', {'property': 'og:title'})['content']
  issue_urls = [issue.find('a')['href'] for issue in soup.find_all('li', {'class': 'issue pb-3'})]
  return issue_urls, journal

home_url = 'https://www.degruyter.com/journal/key/jall/html#issues'
parse_home_page(home_url)

(['/journal/key/jall/44/1/html',
  '/journal/key/jall/43/2/html',
  '/journal/key/jall/43/1/html',
  '/journal/key/jall/42/2/html',
  '/journal/key/jall/42/1/html',
  '/journal/key/jall/41/2/html',
  '/journal/key/jall/41/1/html',
  '/journal/key/jall/40/2/html',
  '/journal/key/jall/40/1/html',
  '/journal/key/jall/39/2/html',
  '/journal/key/jall/39/1/html',
  '/journal/key/jall/38/2/html',
  '/journal/key/jall/38/1/html',
  '/journal/key/jall/37/2/html',
  '/journal/key/jall/37/1/html',
  '/journal/key/jall/36/2/html',
  '/journal/key/jall/36/1/html',
  '/journal/key/jall/35/2/html',
  '/journal/key/jall/35/1/html',
  '/journal/key/jall/34/2/html',
  '/journal/key/jall/34/1/html',
  '/journal/key/jall/33/2/html',
  '/journal/key/jall/33/1/html',
  '/journal/key/jall/32/2/html',
  '/journal/key/jall/32/1/html',
  '/journal/key/jall/31/2/html',
  '/journal/key/jall/31/1/html',
  '/journal/key/jall/30/2/html',
  '/journal/key/jall/30/1/html',
  '/journal/key/jall/29/2/html',
  '/journa

In [6]:
def parse_issue_page(issue_url):
  head = 'https://www.degruyter.com'
  issue_url = head + issue_url
  req = session.get(issue_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  issue_bar = soup.find('div', {'class': 'issueSubjectGroup mb-4', 'id': 'issue-subject-group-articles'})
  if not issue_bar:
    issue_bar = soup.find('div', {'class': 'issueSubjectGroup mb-4'})
  if not issue_bar:
    articles = soup.find_all('a', {'class': 'issueContentsArticleLink'})
  else:
    articles = issue_bar.find_all('a', {'class': 'issueContentsArticleLink'})
  article_urls = [head + article['href'] for article in articles]
  return article_urls

issue_url = '/journal/key/jall/44/1/html'
parse_issue_page(issue_url)

['https://www.degruyter.com/document/doi/10.1515/jall-2023-2003/html',
 'https://www.degruyter.com/document/doi/10.1515/jall-2023-2005/html',
 'https://www.degruyter.com/document/doi/10.1515/jall-2023-2007/html',
 'https://www.degruyter.com/document/doi/10.1515/jall-2023-2006/html']

In [7]:
def parse_article(article_url, journal):
  req = session.get(article_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  title = soup.find('meta', {'property': 'og:title'})['content']
  year = int(soup.find('span', {'class': 'publicationDate'}).text[-4:])
  authors = soup.find_all('meta', {'property': 'article:author'})
  if authors:
    authors = [author['content'] for author in authors]
  authors = '; '.join(authors)
  doi = soup.find('div', {'class': 'doi'}).find('a')['href']
  abstract_text = soup.find('meta', {'name': 'description'})['content']
  return [doi, authors, title, year, abstract_text, journal]

article_url = 'https://www.degruyter.com/document/doi/10.1515/jall-2023-2005/html'
journal = 'Journal of African Languages and Linguistics'
parse_article(article_url, journal)

['https://doi.org/10.1515/jall-2023-2005',
 'Aron Finholt; John Gluckman',
 'A corpus analysis of Swahili’s dual-complementizer system',
 2023,
 'Tanzanian Swahili has two complementizers, kuwa and kwamba , both used to introduce finite embedded clauses. We explore whether the complementizers are in free variation, as reported in all descriptive and pedagogical work. Our study primarily relies on corpus data, which we supplement with native speaker judgments. We find that the complementizers are not in free variation, but in fact are\xa0affected by a number of factors known to affect embedded clauses cross-linguistically, including predicate class, person features of the main-clause subject, and mood in the embedded clause. We conclude that the complementizers ultimately reflect subtle, pragmatic factors concerning how the truth of the embedded clause should be evaluated. Our study expands on previous work on languages with so-called “dual-complementizer” systems.',
 'Journal of Africa

In [8]:
def run_all(url):
  issues, journal = parse_home_page(url)
  articles = []
  for issue in tqdm(issues):
    time.sleep(random.uniform(1.1, 5.2))
    articles.extend(parse_issue_page(issue))
  abstracts = []
  for article in tqdm(articles):
    time.sleep(random.uniform(1.1, 5.2))
    parsed_article = parse_article(article, journal)
    abstracts.append(parsed_article)
  abstracts = [abstract for abstract in abstracts if abstract[1]]
  return abstracts

#journal_number = 'jall'
journal_number = 'opli'
url = f'https://www.degruyter.com/journal/key/{journal_number}/html#issues'
abstracts = run_all(url)
abstracts[:3]

TypeError: ignored

In [9]:
import csv

journal = '-'.join(abstracts[0][5].split())
file_name = f'PR_{journal}.csv'
with open(file_name, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['doi', 'author', 'title', 'year', 'abstract', 'journal'])
    writer.writerows(abstracts)

## Cambridge (https://www.cambridge.org)

In [11]:
import requests
from bs4 import BeautifulSoup
import time
import random

def parse_home_page(home_url):
    head = 'https://www.cambridge.org'
    req = session.get(home_url, headers={'User-Agent': ua.random})
    page = req.text
    soup = BeautifulSoup(page, 'html.parser')
    journal = soup.find('div', {'class': 'heading_08'}).text
    volumes = [volume.find_all('a', {'class': 'row'}) for volume in soup.find_all('ul', {'class': 'accordion level fourth'})]
    issue_urls = [head + issue['href'] for volume in volumes for issue in volume]
    return issue_urls, journal

def parse_issue_page(issue_url):
    head = 'https://www.cambridge.org'
    req = session.get(issue_url, headers={'User-Agent': ua.random})
    page = req.text
    soup = BeautifulSoup(page, 'html.parser')
    year = int(soup.find('span', {'class': 'issue-date'}).text.split()[-1])
    articles = soup.find_all('a', {'class': 'part-link'})
    article_urls = [head + article['href'] for article in articles]
    return year, article_urls

def parse_article(article_url, year, journal):
    req = session.get(article_url, headers={'User-Agent': ua.random})
    page = req.text
    soup = BeautifulSoup(page, 'html.parser')
    title = soup.find('div', {'class': 'title'}).text
    authors = soup.find_all('dt', {'class': 'col-12 col-sm-2 title'})
    authors = '; '.join([author.text.strip('*') for author in authors])
    doi = soup.find('div', {'class': 'doi-data'}).find('a')['href']
    abstract = soup.find('meta', {'name': 'citation_abstract'})['content']
    return [doi, authors, title, year, abstract, journal]

def run_all(url):
    issues, journal = parse_home_page(url)
    articles = []
    for i in tqdm(range(len(issues))):
        time.sleep(random.uniform(1.1, 5.2))
        articles.append(parse_issue_page(issues[i]))
    abstracts = []
    for year, issue in tqdm(articles):
        for article in issue:
            time.sleep(random.uniform(1.1, 5.2))
            parsed_article = parse_article(article, year, journal)
            abstracts.append(parsed_article)
    return abstracts

url = 'https://www.cambridge.org/core/journals/language-and-cognition/all-issues'
abstracts = run_all(url)
print(abstracts)

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]



In [12]:
import csv

journal = '-'.join(abstracts[0][5].split())
file_name = f'PR_{journal}.csv'
with open(file_name, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['doi', 'author', 'title', 'year', 'abstract', 'journal'])
    writer.writerows(abstracts)

## JOURNALS UNIVERSITY OF LODZ (https://czasopisma.uni.lodz.pl/)

In [5]:
def parse_home_page(home_url):
  req = session.get(home_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  journal = soup.find('a', {'class': 'is_text'}).text
  issue_urls = [issue['href'] for issue in soup.find_all('a', {'class': 'title'})]

  second_page = home_url + '/2'
  req = session.get(home_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  issue_urls += [issue['href'] for issue in soup.find_all('a', {'class': 'title'})]

  return issue_urls, journal

In [6]:
def parse_issue_page(issue_url):
  req = session.get(issue_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  year = soup.find('h1')
  year = int(year.text.strip()[-5:-1])
  articles = soup.find_all('div', {'class': 'obj_article_summary'})
  article_urls = [article.find('a')['href'] for article in articles]
  return year, article_urls

parse_issue_page('https://czasopisma.uni.lodz.pl/research/issue/view/1449')

(2022,
 ['https://czasopisma.uni.lodz.pl/research/article/view/17824',
  'https://czasopisma.uni.lodz.pl/research/article/view/17825',
  'https://czasopisma.uni.lodz.pl/research/article/view/17826',
  'https://czasopisma.uni.lodz.pl/research/article/view/17827'])

In [7]:
def parse_article(article_url, year, journal):
  head = 'https://czasopisma.uni.lodz.pl/'
  req = session.get(article_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  title = soup.find('h1').text.strip()
  authors = soup.find_all('meta', {'name': 'DC.Creator.PersonalName'})
  authors = '; '.join([author['content'] for author in authors])
  doi = 'https://doi.org/' + soup.find('meta', {'name': 'DC.Identifier.DOI'})['content']
  abstract_text = soup.find('section', {'class': 'item abstract'})
  if not abstract_text:
    print(article_url)
    abstract_text = ''
  else:
    abstract_text = abstract_text.find('p').text
  return [doi, authors, title, year, abstract_text, journal]

# article = 'https://czasopisma.uni.lodz.pl/research/article/view/16340'
article = 'https://czasopisma.uni.lodz.pl/research/article/view/17824'
parse_article(article, 2022, 'Research in Language')

['https://doi.org/10.18778/1731-7533.20.4.01',
 'Thomas Prinzie; Ferran Suñer; Kristel Van Goethem',
 'From Crystal-clear to Limpide: Translating English [Noun+adj] Compound Adjectives with a Figurative-intensifying Noun into French',
 2022,
 'English [Noun+Adj] compound adjectives containing an intensifying metaphor (e.g. crystal-clear) pose particular challenges for French translation, due in part to the absence of a direct equivalent construction. Our study examines morphosyntactic and conceptual-semantic translation procedures that capture how these challenges are resolved. We also explore the little-investigated aspect of translation variation (the number of different solutions for each item). We analyze the potential effects of two factors: the presence or absence of figurative intensification and the items’ frequency of use in English. Our results indicate that translators prefer different morphosyntactic procedures for different compound subtypes. Overall, an adjective constitu

In [8]:
def run_all(url):
  issues, journal = parse_home_page(url)
  articles = []
  for i in tqdm(range(len(issues))):
    time.sleep(random.uniform(5.4, 13.2))
    articles.append(parse_issue_page(issues[i]))
  abstracts = []
  for year, issue in tqdm(articles):
    for article in issue:
      time.sleep(random.uniform(5.1, 11.5))
      parsed_article = parse_article(article, year, journal)
      abstracts.append(parsed_article)
  return abstracts

url = f'https://czasopisma.uni.lodz.pl/research/issue/archive'
abstracts = run_all(url)
abstracts

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

https://czasopisma.uni.lodz.pl/research/article/view/16340
https://czasopisma.uni.lodz.pl/research/article/view/11576
https://czasopisma.uni.lodz.pl/research/article/view/16340
https://czasopisma.uni.lodz.pl/research/article/view/11576


[['https://doi.org/10.18778/1731-7533.20.4.01',
  'Thomas Prinzie; Ferran Suñer; Kristel Van Goethem',
  'From Crystal-clear to Limpide: Translating English [Noun+adj] Compound Adjectives with a Figurative-intensifying Noun into French',
  2022,
  'English [Noun+Adj] compound adjectives containing an intensifying metaphor (e.g. crystal-clear) pose particular challenges for French translation, due in part to the absence of a direct equivalent construction. Our study examines morphosyntactic and conceptual-semantic translation procedures that capture how these challenges are resolved. We also explore the little-investigated aspect of translation variation (the number of different solutions for each item). We analyze the potential effects of two factors: the presence or absence of figurative intensification and the items’ frequency of use in English. Our results indicate that translators prefer different morphosyntactic procedures for different compound subtypes. Overall, an adjective con

In [9]:
import csv

journal = '-'.join(abstracts[0][5].split())
file_name = f'PR_{journal}.csv'
with open(file_name, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['doi', 'author', 'title', 'year', 'abstract', 'journal'])
    writer.writerows(abstracts)

## SCISPACE (https://typeset.io/)

In [5]:
def parse_home_page(home_url):
  head = 'https://typeset.io'
  req = session.get(home_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  journal = soup.find('a', {'data-element': 'journal_name'}).text
  year_urls = [(int(year.text), head + year['href'])  for year in soup.find_all('a', {'class': 'css-lado41'})]
  return year_urls, journal

parse_home_page('https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q')

([(2023,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2023'),
  (2022,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2022'),
  (2021,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2021'),
  (2020,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2020'),
  (2019,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2019'),
  (2018,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2018'),
  (2017,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2017'),
  (2016,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2016'),
  (2015,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2015'),
  (2014,
   'https://typeset.io/journals/international-journal-of-multilingualism-a3f6991q/2014'),
  (2013,
 

In [6]:
def parse_year_page(year_url):
  head = 'https://typeset.io'
  req = session.get(year_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  article_urls = [head + article['href'] for article in soup.find_all('a', {'class': 'css-dmr4uz'})]
  return article_urls

In [7]:
def parse_article(article_url, year, journal):
  head = 'https://typeset.io'
  req = session.get(article_url, headers={'User-Agent': ua.random})
  page = req.text
  soup = BeautifulSoup(page, 'html.parser')
  title = soup.find('meta', {'name': 'citation_title'})['content']
  paper_block = soup.find('div', {'data-section': 'paper_info'})
  authors = paper_block.find('div', {'class': 'author_list'})
  if authors:
    authors = authors.find_all('a')
    authors = '; '.join([author.text for author in authors])
  else:
    print("there aren't authors", article_url)
    authors = ''
  doi = soup.find('meta', {'name': 'DOI'})
  if doi:
    doi = 'https://doi.org/' + doi['content']
  else:
    print("there isn't doi", article_url)
    doi = ''
  abstract_text = soup.find('meta', {'name': 'citation_abstract'})
  if abstract_text:
    abstract_text = abstract_text['content']
  else:
    print(article_url)
    abstract_text = ''
  return [doi, authors, title, year, abstract_text, journal]

parse_article('https://typeset.io/papers/the-musical-language-of-yuen-ren-chao-a-cultural-and-n8b2zguv', 2022, 'Journal of Chinese Linguistics')

['https://doi.org/10.1353/jcl.2022.0003',
 'Ivan Yifan Zou; William S.-Y. Wang',
 'The musical language of Yuen Ren Chao: A cultural and empirical study of the modernization of Chinese music',
 2022,
 'ABSTRACT:In the Chinese art song repertoire, "How can I help but think of you" has long enjoyed enormous prestige since its publication in the 1920s. The song is memorable not only because of its innovative use of language by Liu Bannong in the lyrics but also because of the ingenious arrangement of tonalities, forms, and melodies by Chao Yuen Ren in the music. This essay will be devoted to a cultural and empirical analysis of the song, with the aim of understanding the efforts made by eminent scholars at the early stage of modernization of Chinese music and language. To this end, we will first explore how the lyrics are structured phonetically and syntactically. Tonal complexity and ambiguity in the music will then be analyzed, followed by the discussion of qǐ-chéng-zhuǎn-hé in the musi

In [12]:
def run_all(url):
  years, journal = parse_home_page(url)
  articles = []
  for year, year_url in tqdm(years):
    time.sleep(random.uniform(1.1, 5.2))
    articles.append((year, parse_year_page(year_url)))
  abstracts = []
  for year, issue in tqdm(articles):
    for article in issue:
      time.sleep(random.uniform(1.1, 5.2))
      parsed_article = parse_article(article, year, journal)
      abstracts.append(parsed_article)
  return abstracts

# journal_code = 'international-journal-of-multilingualism-a3f6991q'
# journal_code = 'australian-journal-of-linguistics-1x7u5eek'
journal_code = 'journal-of-chinese-linguistics-2mziql18'
url = f'https://typeset.io/journals/{journal_code}'
abstracts = run_all(url)
abstracts

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

there aren't authors https://typeset.io/papers/the-functional-load-of-chinese-tones-and-the-tonal-evolution-clgv18xb
there aren't authors https://typeset.io/papers/acoustic-correlates-of-prominence-in-kala-lizu-tibeto-burman-1mb6z6zz
there aren't authors https://typeset.io/papers/romanized-transcriptions-of-cantonese-prior-to-robert-36gnd4l2
there aren't authors https://typeset.io/papers/exploring-the-low-applicative-2ia8nzx0
there aren't authors https://typeset.io/papers/various-measures-and-the-distinction-of-tense-and-lax-35gbpixv
there aren't authors https://typeset.io/papers/message-from-the-editors-2hnl9pw8
there aren't authors https://typeset.io/papers/the-voiced-and-released-stop-codas-of-old-chinese-294xk7ga
there aren't authors https://typeset.io/papers/description-and-comparison-habitual-aspect-in-lianjiang-yue-2kvcufin
https://typeset.io/papers/statistical-modeling-of-application-completeness-of-two-tone-1x6t6r63t2
https://typeset.io/papers/reconstruction-and-analysis-of-ph

[['https://doi.org/10.1353/jcl.2023.0006',
  '',
  'The functional load of Chinese tones and the tonal evolution',
  2023,
  'This study calculated the functional load of Chinese tones in different contexts by using an extensive corpus and the Markov model-based "Hockett-Wang algorithm". The results show that, compared to vowels and consonants, tones carry the smallest functional load. In tones that are associated with monosyllabic, bi-syllabic, and multi-syllabic words, the functional load is inversely related to the number of syllables in that word. In other words, the more syllables in a word, the further the functional load of tones tends to decrease. Importantly, this study reveals a close correlation between the functional load of Chinese tones and the evolution of these tones. This is evidenced by the perceived imbalance of the functional load of different tonal contrasts within contemporary Chinese, as well as by the simulation results of the drastically declining functional lo

In [13]:
import csv

journal = '-'.join(abstracts[0][5].split())
file_name = f'PR_{journal}.csv'
with open(file_name, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['doi', 'author', 'title', 'year', 'abstract', 'journal'])
    writer.writerows(abstracts)