Skip to content

Commit

Permalink
Merge pull request #24 from databridgevt/parse_pdf_pmc
Browse files Browse the repository at this point in the history
get text extraction working for pdf AND pmc
  • Loading branch information
chendaniely committed Apr 9, 2020
2 parents 6223126 + fa72b8a commit a206b09
Showing 1 changed file with 85 additions and 27 deletions.
112 changes: 85 additions & 27 deletions analysis/db/dan/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,108 @@

from pyprojroot import here


def extract_paper_component(paper_json, section_key, text_key="text", line_delim="\n"):
    """Join the text entries of one section of a parsed paper JSON.

    Parameters
    ----------
    paper_json : dict
        Parsed paper JSON; ``paper_json[section_key]`` must be an
        iterable of dicts that each carry a ``text_key`` entry.
    section_key : str
        Section to extract, e.g. ``"body_text"`` or ``"abstract"``.
    text_key : str
        Key holding the text inside each section entry.
    line_delim : str
        Separator placed between consecutive text entries.

    Returns
    -------
    str
        All text entries of the section joined by ``line_delim``.
    """
    # NOTE: the diff artifact kept both `return(text)` and `return text`;
    # a function body may only have one live return — keep the plain form.
    section_text = map(lambda x: x[text_key], paper_json[section_key])
    return line_delim.join(section_text)


def extract_body_text(paper_json, **kwargs):
    """Extract the body text of a paper.

    Thin wrapper over :func:`extract_paper_component`; callers pass
    ``section_key="body_text"`` (and optionally ``text_key`` /
    ``line_delim``) through ``**kwargs``.
    """
    # Diff artifact removed: only one return statement is kept.
    return extract_paper_component(paper_json, **kwargs)


def extract_abstract_text(paper_json, **kwargs):
    """Extract the abstract text of a paper.

    Thin wrapper over :func:`extract_paper_component`; callers pass
    ``section_key="abstract"`` (and optionally ``text_key`` /
    ``line_delim``) through ``**kwargs``.
    """
    # Diff artifact removed: only one return statement is kept.
    return extract_paper_component(paper_json, **kwargs)


def extract_paper_data(json_pth, folder_mode):
    """Parse one CORD-19 paper JSON file into a single-row DataFrame.

    Parameters
    ----------
    json_pth : path-like
        Path to the paper's JSON file.
    folder_mode : str
        ``"pdf_json"`` (papers that carry an abstract section) or
        ``"pmc_json"`` (papers that do not).

    Returns
    -------
    pandas.DataFrame
        One row with columns ``pid, num_authors, title, text`` plus an
        ``abstract`` column only in ``"pdf_json"`` mode.

    Raises
    ------
    ValueError
        If ``folder_mode`` is neither ``"pdf_json"`` nor ``"pmc_json"``.
    """
    with open(json_pth) as f:
        data = json.load(f)

    # pdf_json has an abstract section; pmc_json does not.
    # The remaining sections appear to be the same.

    # Sections common to both layouts.
    paper_text = extract_body_text(data, section_key="body_text", text_key="text")
    pid = data["paper_id"]
    title = data["metadata"]["title"]
    num_authors = len(data["metadata"]["authors"])

    if folder_mode == "pdf_json":
        abstract_text = extract_abstract_text(
            data, section_key="abstract", text_key="text"
        )
        return pd.DataFrame(
            data=[[pid, num_authors, title, abstract_text, paper_text]],
            columns=["pid", "num_authors", "title", "abstract", "text"],
        )
    elif folder_mode == "pmc_json":
        return pd.DataFrame(
            data=[[pid, num_authors, title, paper_text]],
            columns=["pid", "num_authors", "title", "text"],
        )
    else:
        raise ValueError(f"Unknown value passed into 'folder_mode': {folder_mode}")

# --- Script: parse Kaggle CORD-19 JSON papers into per-folder TSV files ---

# Delete the old data file that was created previously; its name was not
# descriptive enough. here() returns a Path (always truthy), so the old
# walrus guard was a no-op — unlink with missing_ok handles absence.
old = here("./data/db/final/kaggle/paper_text/comm_use_subset.tsv", warn=False)
old.unlink(missing_ok=True)

# Make the folder that data will be saved into.
pl.Path(here("./data/db/final/kaggle/paper_text/")).mkdir(parents=True, exist_ok=True)

data_sources = [
    "biorxiv_medrxiv",
    "comm_use_subset",
    "noncomm_use_subset",
]

# Go through folders, parsing JSON files into dataframes and TSV files.
# NOTE(review): only data_sources[1] ("comm_use_subset") is processed here;
# presumably the others are planned — confirm before widening the slice.
datpb = tqdm([data_sources[1]])
for dat_source in datpb:  # for each data source
    datpb.set_description(f"{dat_source}")

    ds_hr = here(f"./data/db/original/kaggle/{dat_source}/{dat_source}/", warn=False)
    fdrs = list(ds_hr.iterdir())

    gppb = tqdm(fdrs)
    for gp in gppb:  # for each group path, e.g. pdf_json / pmc_json
        gppb.set_description(f"{dat_source}/{gp.name}")

        # iterdir() is lazy: FileNotFoundError surfaces when the
        # generator is consumed, hence list() inside the try.
        fs = gp.iterdir()
        try:
            fs = list(fs)
        except FileNotFoundError:
            # BUG FIX: the original message referenced `gp_hr`, a name
            # that only existed in a commented-out line (NameError).
            sys.exit(
                f"Could not find {gp}, did you forget to update the Kaggle dataset?"
            )

        papers = pd.concat(
            [extract_paper_data(jsn, folder_mode=gp.name) for jsn in tqdm(fs)]
        )

        # Write the new data, one TSV per data source / group folder.
        papers.to_csv(
            here(
                f"./data/db/final/kaggle/paper_text/{dat_source}_{gp.name}.tsv",
                warn=False,
            ),
            sep="\t",
            header=True,
            index=False,
        )

0 comments on commit a206b09

Please sign in to comment.