# Crawl Data from NEJM

In [None]:
import sys
import re
import os
import glob
from collections import defaultdict
from copy import deepcopy
import logging
from datetime import datetime
from time import sleep

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [None]:
sys.path.append(".")
from utils.utils import Container

In [None]:
# NEJM account used by nejm_signin(); replace the placeholders with
# your own credentials before running the crawler.
nejm_username = "######"
nejm_password = "######"


# Output locations (created on the spot if they do not exist yet).
out_dir = "../processed_data/crawler/nejm/urls/"  # article URL listings
os.makedirs(out_dir, exist_ok=True)
article_dir = "../processed_data/crawler/nejm/articles/"  # crawled text
os.makedirs(article_dir, exist_ok=True)

# Pipeline switches.
traverse = True  # Whether to get the article urls.
crawl = True  # Whether to get the article content.


In [None]:
#####################
# Utility Functions #
#####################
def print_and_log(message):
    """Echo ``message`` to stdout and record it through ``logging.info``.

    stdout is flushed on every call so progress shows up immediately
    while a long crawl is running.
    """
    print(message, flush=True)
    logging.info(message)


def detect_dialog_window(driver):
    """Report whether a featherlight dialog is present on the current page.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        True if at least one ``div`` with class ``featherlight-content``
        is found, False otherwise.
    """
    overlays = driver.find_elements_by_xpath(
        "//div[@class='featherlight-content']")
    return overlays != []


def close_dialog_window(driver):
    """Click the close button of the featherlight dialog.

    The button is looked up with ``find_element_by_xpath``, so whatever
    that call raises when the button is missing propagates to the caller.
    """
    driver.find_element_by_xpath(
        "//button[@class='featherlight-close-icon featherlight-close']"
    ).click()


def nejm_signin(driver):
    """Sign in to NEJM with the module-level credentials.

    Loads the NEJM home page, clicks the sign-in link, fills in the
    ``login`` and ``password`` fields from ``nejm_username`` and
    ``nejm_password``, and submits the form.

    Note that this navigates ``driver`` to https://www.nejm.org/, so the
    page that was open before the call is no longer the current page.
    """
    driver.get("https://www.nejm.org/")

    driver.find_element_by_xpath(
        "//a[@data-interactiontype='sign_in_click']").click()

    # Fill the form fields in page order: user name first, then password.
    credentials = (("login", nejm_username), ("password", nejm_password))
    for field_id, value in credentials:
        driver.find_element_by_id(field_id).send_keys(value)

    submit_button = driver.find_element_by_id("btnSignIn")
    submit_button.click()
    print("Signed in to NEJM.")


def detect_paywall(driver):
    """Sign in to NEJM when the current page shows the subscribe gateway.

    Despite its name this function does not return anything: if a
    "subscribe" gateway button is found on the page it calls
    ``nejm_signin(driver)``, otherwise it does nothing.
    """
    gateway_xpath = (
        "//a[@class='o-gateway__button o-gateway__button--secondary'"
        " and @data-interactiontype='subscribe_click']"
    )
    if driver.find_elements_by_xpath(gateway_xpath) == []:
        return  # No paywall on this page.
    nejm_signin(driver)

def nejm_signout(driver):
    """Sign out of NEJM by clicking the first sign-out link, if there is one.

    Sign-in only happens when a paywall is met (see ``detect_paywall``),
    so the session may never have been signed in. In that case no
    sign-out link exists and the function returns without doing anything
    instead of failing with ``IndexError``.
    """
    sign_out_links = driver.find_elements_by_xpath(
        "//a[@data-interactiontype='sign_out_click']")
    if not sign_out_links:
        return
    sign_out_links[0].click()


def yxqy_login(driver):
    """Log in to the NEJM Qianyan (YXQY) site with the public account.

    Opens the yearly index page, expands the login dropdown, fills in
    ``membername``/``password`` and submits, then waits two seconds.

    Any ordinary error while driving the form is treated as "already
    logged in" and only reported on stdout. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not swallowed, so the crawl can still
    be interrupted.
    """
    driver.get("https://www.nejmqianyan.cn/index.php?c=week&m=year")
    try:
        xpath_query = "//a[@href='javascript:;' and @class='dropdown-toggle']"
        dropdown = driver.find_element_by_xpath(xpath_query)
        dropdown.click()

        membername = driver.find_element_by_name("membername")
        membername.clear()
        membername.send_keys("publicuser")

        password = driver.find_element_by_name("password")
        password.clear()
        password.send_keys("publicuser")

        login = driver.find_element_by_class_name(
            "btn.btn-default.fastLoginBtn.login-top")
        login.click()
        sleep(2)  # Give the site time to complete the login.

    except Exception:
        print("Already logged in to YXQY!")

def jw_login(driver):
    """Log in on the Journal Watch article page that is currently open.

    Fills in the e-mail and password inputs found inside the
    ``article-page`` element, clicks its button and waits three seconds.

    Any ordinary error while driving the form is treated as "already
    logged in" and only reported on stdout. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not swallowed, so the crawl can still
    be interrupted.
    """
    try:
        email = driver.find_element_by_xpath(
            "//article-page//input[@id='email_text']")
        email.clear()
        email.send_keys("bliuforgit@gmail.com")

        password = driver.find_element_by_xpath(
            "//article-page//input[@id='pwd_text']")
        password.clear()
        password.send_keys("password")

        login = driver.find_element_by_xpath(
            "//article-page//button")
        login.click()
        sleep(3)  # Give the site time to complete the login.

    except Exception:
        print("Already logged into Journal Watch!")


In [None]:
######################
# Crawling Functions #
######################
def crawl_zh_page(driver, article_id, zh_url, out_prefix, verbose=False):
    """Download one Chinese article and save it as ``<out_prefix>.full.zh``.

    Args:
        driver: Selenium WebDriver used to load the page.
        article_id: Article identifier; only used in the log message.
        zh_url: URL of the Chinese article.
        out_prefix: Output path without extension.
        verbose: Accepted for interface compatibility; not used.

    The text of the ``nejm-article-content`` element is written one line
    per row, with surrounding whitespace stripped from every line.
    """
    driver.get(zh_url)
    print_and_log(f"Crawling Chinese article: {article_id}.")

    content = driver.find_element_by_id("nejm-article-content").text
    cleaned_lines = [raw_line.strip() for raw_line in content.split("\n")]

    with open(f"{out_prefix}.full.zh", "w") as out_file:
        for cleaned in cleaned_lines:
            out_file.write(cleaned + "\n")


def crawl_en_page(driver, article_id, en_url, out_prefix, verbose=False):
    """Download one English article and save two text versions of it.

    Writes ``<out_prefix>.full.en`` (all lines) and
    ``<out_prefix>.nobox.en`` (same text with the first ``m-boxed-text``
    element removed, when one exists). For Journal Watch articles
    (article type ``jw.na``) both files have the same content.

    Args:
        driver: Selenium WebDriver used to load the page.
        article_id: Article identifier; the article type is obtained by
            removing digits and ``%`` from it.
        en_url: URL of the English article.
        out_prefix: Output path without extension.
        verbose: Accepted for interface compatibility; not used.

    Returns:
        None. If the Journal Watch article body does not appear within 60
        seconds, "Timeout!" is printed and nothing is written.
    """
    driver.get(en_url)
    if detect_dialog_window(driver):
        close_dialog_window(driver)

    # Sign in if paywalled.
    # NOTE(review): nejm_signin() navigates to the NEJM home page; confirm
    # that the article page is shown again after signing in.
    detect_paywall(driver)

    print_and_log(f"Crawling English article: {article_id}.")
    article_type = re.sub("[0-9]+", "", article_id).replace("%", "")
    print_and_log(f"Article type: {article_type}.")

    # Crawl article from NEJM website
    if article_type != "jw.na":
        sleep(1)
        full_article = driver.find_element_by_id("full").text
        full_text = [x.strip() for x in full_article.split("\n")]

        try:
            boxed_text = driver.find_element_by_class_name("m-boxed-text").text
        except Exception:
            # A missing box is the normal case; KeyboardInterrupt and
            # SystemExit are intentionally not caught here.
            full_text_no_box = full_text
            print("No boxed text.")
        else:
            full_text_no_box = [
                x.strip()
                for x in full_article.replace(boxed_text, "").split("\n")]
            print("Found boxed text.")

    # Crawl article from Journal Watch website
    else:
        try:
            WebDriverWait(driver, timeout=60).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, "article-detail")))
            sleep(1)
        except Exception:
            print("Timeout!")
            return

        jw_login(driver)
        full_article = driver.find_element_by_class_name("article-detail").text
        full_text = [x.strip() for x in full_article.split("\n")]
        full_text_no_box = full_text

    with open(f"{out_prefix}.full.en", "w") as f:
        for line in full_text:
            f.write(line + "\n")

    with open(f"{out_prefix}.nobox.en", "w") as f:
        for line in full_text_no_box:
            f.write(line + "\n")


def compare_zh_and_en(zh_fn, en_fn, epsilon = 2):
    """Compare the number of non-blank lines of a Chinese/English file pair.

    Args:
        zh_fn: Path of the Chinese text file.
        en_fn: Path of the English text file.
        epsilon: Largest tolerated ratio between the two line counts.

    Returns:
        A tuple ``(comparison, zh_len, en_len)`` where ``comparison`` is
        ``"empty_article"`` if either file has no non-blank line,
        ``"en_too_long"`` / ``"zh_too_long"`` if one side has more than
        ``epsilon`` times as many lines as the other, else ``"equal"``.
    """
    with open(zh_fn, "r") as f_zh, open(en_fn, "r") as f_en:
        zh_len = sum(1 for row in f_zh if row.strip() != "")
        en_len = sum(1 for row in f_en if row.strip() != "")

    if zh_len == 0 or en_len == 0:
        return "empty_article", zh_len, en_len

    if en_len / zh_len > epsilon:
        verdict = "en_too_long"
    elif zh_len / en_len > epsilon:
        verdict = "zh_too_long"
    else:
        verdict = "equal"
    return verdict, zh_len, en_len


def crawl_all_urls(driver, container):
    """Crawl the Chinese and English text of every article in ``container``.

    Args:
        driver: Selenium WebDriver shared by all page downloads.
        container: Nested mapping ``year -> month -> article_id ->
            (zh_title, en_title, zh_url, en_url)``.

    Output goes under ``article_dir/<year>/<month>/``. An article side is
    skipped when its output file (``.full.zh`` / ``.nobox.en``) already
    exists, so an interrupted crawl can be resumed.
    """
    # Count the articles without relying on the interpreter-specific
    # name ``_`` (it exists in IPython but not in a plain script).
    total = sum(1
                for month_dict in container.values()
                for article_dict in month_dict.values()
                for _article in article_dict.values())

    n = 0
    for year, month_dict in container.items():

        print_and_log("#############")
        print_and_log(f"# Year {year} #")
        print_and_log("#############")

        for month, article_dict in month_dict.items():
            os.makedirs(os.path.join(article_dir, year, month), exist_ok=True)

            print_and_log("####################")
            print_and_log(f"# Crawling {year}/{month} #")
            print_and_log("####################")

            for article_id, (zh_title, en_title, zh_url, en_url) \
                    in article_dict.items():

                # Report progress every 100 articles.
                if n % 100 == 0:
                    print_and_log(f"### Progress: {n}/{total} Articles ###")

                print_and_log(f"Article: {zh_title}/{en_title}")

                out_prefix = f"{article_dir}/{year}/{month}/{article_id}"
                zh_out = f"{out_prefix}.full.zh"
                en_out = f"{out_prefix}.nobox.en"

                # Crawl articles that are not on disk yet:
                if not os.path.exists(zh_out):
                    crawl_zh_page(driver, article_id, zh_url, out_prefix)
                if not os.path.exists(en_out):
                    crawl_en_page(driver, article_id, en_url, out_prefix)

                n += 1

In [None]:
# Start the browser session used by every crawling function below.
# No other cell of this notebook creates `driver`.
chrome_options = Options()
driver = webdriver.Chrome(options=chrome_options)

try:
    container = Container()
    container.read_from_disk(out_dir)

    # Traversing the NEJM website.
    if traverse:
        container.traverse(driver, out_dir)

    # Logging:
    if crawl:
        log_fn = "{}/article.log".format(article_dir)
        logging.basicConfig(filename=log_fn,
                            format="%(message)s", level=logging.DEBUG)
        crawl_all_urls(driver, container)

    nejm_signout(driver)
finally:
    # Always close the browser, even when the crawl fails part-way.
    driver.quit()

# Preprocess data

In [None]:
import re
import sys
sys.path.append(".")
from utils.utils import read_article_urls

# Directory passed to read_article_urls() to load the article metadata.
url_dir = "../processed_data/crawler/nejm/urls/"
# Root of the crawled article files: <article_dir>/<year>/<month>/<article_id>.*
article_dir = "../processed_data/crawler/nejm/articles/"

def read_article(fn):
    """Return the lines of text file ``fn`` as a list.

    Each element keeps its trailing newline, exactly as stored on disk.
    """
    with open(fn, "r") as handle:
        return list(handle)


In [None]:
# Stitch two or more lines into one.
# Why? Because a single sentence is sometimes
# broken into multiple pieces on the website.
def stitch(article, lang):
    """Re-join sentences that the website split over several lines.

    Args:
        article: List of lines, each normally ending in ``"\\n"`` (as
            returned by ``read_article``). The list is not modified.
        lang: ``"zh"`` or ``"en"``; any other value only re-splits the
            text on newlines.

    Returns:
        The stitched text as a list of lines without trailing newlines.
        If the input ends with a newline the last element is ``""``.

    Rules:
        * zh: a line holding only a citation (digits, commas, hyphens) is
          glued to both the previous and the following line; a line
          holding only the full stop ``。`` is glued to the previous line.
        * en: a ``". opens in new tab"`` line (hyperlink residue) is
          deleted and the previous line is glued to the following one.
    """
    lines = list(article)  # Work on a copy; leave the caller's list alone.

    def join_to_previous(idx):
        # Drop the newline of the line before ``idx``. The first line has
        # no predecessor; without the guard index -1 would wrap around to
        # the LAST line of the article.
        if idx > 0 and lines[idx - 1].endswith("\n"):
            lines[idx - 1] = lines[idx - 1].replace("\n", "")

    if lang == "zh":
        for i, line in enumerate(lines):
            # A line with only numbers (i.e. a citation)
            if re.fullmatch(r"[0-9,-]+\n", line):
                lines[i] = line.replace("\n", "")
                join_to_previous(i)
            # A line with only a period. The newline is optional in the
            # pattern because lines read from disk end in "\n".
            if re.fullmatch("。\n?", lines[i]):
                join_to_previous(i)

    elif lang == "en":
        for i, line in enumerate(lines):
            # Residue of a hyperlink
            if line.strip() == ". opens in new tab":
                lines[i] = ""
                join_to_previous(i)

    return "".join(lines).split("\n")


# Patterns for figure/table headings (raw strings: "\." is a regex escape).
_ZH_FIGURE = re.compile(r"图[0-9]{1,2}\.")
_ZH_TABLE = re.compile(r"表[0-9]{1,2}\.")
_EN_TABLE_OR_FIGURE = re.compile(r"(?:Table|Figure) [0-9]{1,2}\.")
_EN_COUNTER = re.compile(r"[0-9]+ (?:Reference|Citing Article|Comment)")

# Footnote symbols that start table captions in the Chinese text.
_ZH_FOOTNOTE_MARKS = ("*", "†", "‡", "§", "¶", "‖", "|")

# Original Article, Review Article, Case Records, Perspective,
# Medicine and Society, Editorial, Clinical Problem Solving,
# Clinical Implications of Basic Research, Special Report,
# Special Article, Clinical Therapeutics, Health Policy Report,
# Clinical Practice
_ZH_DISCLOSURE_TYPES = ("oa", "ra", "cpc", "p", "ms",
    "e", "cps", "cibr", "sr", "sa", "ct", "hpr", "cp")
# Same list plus Clinical Decisions ("clde") for the English text.
_EN_DISCLOSURE_TYPES = ("oa", "ra", "cpc", "p", "e", "ms",
    "cps", "cibr", "sr", "clde", "sa", "ct", "hpr", "cp")

# English lines that are removed on their own.
_EN_EXACT_NOISE = ("Letters", "Download", "Audio Full Text",
    "Key Clinical Points", "Poll")
_EN_PREFIX_NOISE = ("Comments open through", "Citing Article",
    "Option 1", "Option 2", "Sign up for")

# English headings that take a fixed number of following lines with them.
_EN_BLOCK_LENGTHS = {
    "Video": 2,
    "Interactive Graphic": 2,
    "Audio Interview": 3,
    "QUICK TAKE": 5,
    "VISUAL ABSTRACT": 3,
}

# English headings that mark where the real article starts.
_EN_START_HEADINGS = {
    "oa": "Abstract",  # Original Article
    "cpc": "Presentation of Case",  # Case Records
}


def _drop(keep, start, stop):
    """Set ``keep[start:stop]`` to False, clipped to the list bounds."""
    for j in range(max(start, 0), min(stop, len(keep))):
        keep[j] = False


def _zh_end_markers(article_type):
    """Return the line prefixes that start the Chinese trailing matter."""
    if article_type == "jw.na":  # Journal Watch
        return ("出版时", "引文")
    if article_type in _ZH_DISCLOSURE_TYPES:
        return ("Disclosure", "译者", "作者信息")
    if article_type == "x":  # Corrections
        return ("译者",)
    if article_type == "clde":  # Clinical Decisions
        return ("选项2",)
    return ()


def _en_end_markers(article_type):
    """Return the line prefixes that start the English trailing matter."""
    if article_type == "jw.na":  # Journal Watch
        return ("EDITOR DISCLOSURES AT TIME OF PUBLICATION", "CITATION")
    if article_type in _EN_DISCLOSURE_TYPES:
        return ("Disclosure",)
    return ()


def _mark_zh(article, article_type, keep):
    """Clear the ``keep`` flags of Chinese lines that are not article text."""
    end_markers = _zh_end_markers(article_type)
    for i, text in enumerate(article):
        stripped = text.strip()

        # Remove text in the middle: figures (heading plus the next
        # line), table headings, table captions and empty lines.
        if _ZH_FIGURE.match(text):
            _drop(keep, i, i + 2)
        elif _ZH_TABLE.match(text):
            keep[i] = False
        elif text.startswith(_ZH_FOOTNOTE_MARKS):
            keep[i] = False
        elif stripped == "":
            keep[i] = False

        # Remove text before the case summary (Clinical Decisions).
        if article_type == "clde" and stripped in ("案例摘要", "病例摘要"):
            _drop(keep, 0, i)

        # Remove text after the first end marker and stop scanning.
        if text.startswith(end_markers):
            _drop(keep, i, len(keep))
            break


def _mark_en(article, article_type, keep):
    """Clear the ``keep`` flags of English lines that are not article text."""
    if article_type == "jw.na":
        # Journal Watch: the first 5 lines are never article text.
        _drop(keep, 0, 5)

    start_heading = _EN_START_HEADINGS.get(article_type)
    end_markers = _en_end_markers(article_type)
    for i, text in enumerate(article):
        stripped = text.strip()

        # Remove text in the middle.
        if _EN_TABLE_OR_FIGURE.match(text):
            # Table/figure heading plus the line after it.
            _drop(keep, i, i + 2)
        elif stripped in _EN_BLOCK_LENGTHS:
            # Video, audio interview, QUICK TAKE, VISUAL ABSTRACT, ...
            _drop(keep, i, i + _EN_BLOCK_LENGTHS[stripped])
        elif stripped in _EN_EXACT_NOISE \
                or text.startswith(_EN_PREFIX_NOISE) \
                or _EN_COUNTER.match(text) \
                or stripped == "":
            # Intro widgets, counters, sign-ups and empty lines.
            keep[i] = False

        # Remove text before the start heading (the heading itself stays).
        if start_heading is not None and stripped == start_heading:
            _drop(keep, 0, i)

        # Remove text after the first end marker and stop scanning.
        if text.startswith(end_markers):
            _drop(keep, i, len(keep))
            break


def filter(article, article_type, lang):
    """Remove everything that is not running article text.

    NOTE: the name shadows the built-in ``filter``; it is kept because
    callers use it.

    Args:
        article: List of lines without trailing newlines (the output of
            ``stitch``).
        article_type: Type code derived from the article id, e.g.
            ``"oa"`` (Original Article) or ``"jw.na"`` (Journal Watch).
        lang: ``"zh"`` or ``"en"``; any other value returns the article
            unchanged.

    Returns:
        A new list with the kept lines in their original order.
        Correspondence (``"c"``) and Images in Clinical Medicine
        (``"icm"``) articles are dropped completely (empty list).

    Headings that remove a fixed number of following lines (figures,
    "QUICK TAKE", ...) are clipped at the end of the article, and short
    Journal Watch articles are handled, instead of raising ``IndexError``.
    """
    # Remove correspondence and image in clinical medicine
    if article_type in ("c", "icm"):
        return []

    keep = [True] * len(article)
    if lang == "zh":
        _mark_zh(article, article_type, keep)
    elif lang == "en":
        _mark_en(article, article_type, keep)

    return [line for line, kept in zip(article, keep) if kept]


In [None]:
meta = read_article_urls(url_dir)
meta = meta[meta["year"] != 2020] # Remove year 2020

# Stitch and filter every crawled article pair, then write the result
# next to the input as <article_id>.filt.zh / <article_id>.filt.en.
for index, row in meta.iterrows():
    year = row["year"]
    month = row["month"]
    article_id = row["id"]
    # The type code is the article id without digits and "%".
    article_type = re.sub("[0-9%]+", "", article_id)

    zh_fn = f"{article_dir}/{year}/{month:02}/{article_id}.full.zh"
    en_fn = f"{article_dir}/{year}/{month:02}/{article_id}.nobox.en"

    print(f"path: {zh_fn}")
    zh_article = read_article(zh_fn)
    zh_article = stitch(zh_article, "zh")
    zh_article = filter(zh_article, article_type, "zh")

    print(f"path: {en_fn}")
    en_article = read_article(en_fn)
    en_article = stitch(en_article, "en")
    en_article = filter(en_article, article_type, "en")

    # Lines that are identical in both languages are left out of both
    # output files.
    intersect = set(zh_article).intersection(set(en_article))

    zh_out_fn = zh_fn.replace(".full.", ".filt.")
    with open(zh_out_fn, "w") as f:
        for line in zh_article:
            if line not in intersect:
                f.write(line + "\n")

    en_out_fn = en_fn.replace(".nobox.", ".filt.")
    with open(en_out_fn, "w") as f:
        for line in en_article:
            if line not in intersect:
                f.write(line + "\n")


# Normalize and Split paragraphs into sentences

In [None]:
# Run normalize.sh through the IPython shell escape ("!").
!normalize.sh

In [None]:
# Run eserix.sh; presumably the sentence-splitting step named in the
# heading above (TODO confirm against the script).
!eserix.sh

# Align sentences

In [None]:
# Run the two Moore alignment scripts in order: input.sh, then align.sh.
!alignment/moore/input.sh
!alignment/moore/align.sh

In [None]:
# Run the clean-up scripts in order: concat.sh, then clean.sh.
!clean/concat.sh
!clean/clean.sh
# Split the data into train (~ 93000), development (~ 2000),
# and test (~ 2000) sets:
!split_data/split_train_test.py