Skip to content

Commit

Permalink
BLACK!
Browse files Browse the repository at this point in the history
  • Loading branch information
chendaniely committed Apr 9, 2020
1 parent 1c7fecd commit fa72b8a
Showing 1 changed file with 35 additions and 28 deletions.
63 changes: 35 additions & 28 deletions analysis/db/dan/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@
def extract_paper_component(paper_json, section_key, text_key="text", line_delim="\n"):
    """Join the text of every entry in one section of a parsed paper.

    Args:
        paper_json: Mapping for one paper; ``paper_json[section_key]`` must be
            an iterable of mappings.
        section_key: Key of the section to extract.
        text_key: Key that holds the text inside each section entry.
        line_delim: String placed between consecutive entries.

    Returns:
        The entries' text joined by ``line_delim``; an empty string when the
        section has no entries.

    Raises:
        KeyError: If ``section_key`` is missing from ``paper_json`` or
            ``text_key`` is missing from an entry.
    """
    return line_delim.join(entry[text_key] for entry in paper_json[section_key])


def extract_body_text(paper_json, **kwargs):
    """Return the body text of a parsed paper as a single string.

    Thin wrapper around ``extract_paper_component``.

    Args:
        paper_json: Mapping for one paper.
        **kwargs: Forwarded to ``extract_paper_component`` (``section_key``,
            ``text_key``, ``line_delim``). ``section_key`` defaults to
            ``"body_text"`` when the caller does not supply it.

    Returns:
        The section's entries joined into one string.
    """
    # Only fills in the section when the caller did not name one, so existing
    # calls that pass section_key explicitly behave exactly as before.
    kwargs.setdefault("section_key", "body_text")
    return extract_paper_component(paper_json, **kwargs)


def extract_abstract_text(paper_json, **kwargs):
    """Return the abstract of a parsed paper as a single string.

    Thin wrapper around ``extract_paper_component``.

    Args:
        paper_json: Mapping for one paper.
        **kwargs: Forwarded to ``extract_paper_component`` (``section_key``,
            ``text_key``, ``line_delim``). ``section_key`` defaults to
            ``"abstract"`` when the caller does not supply it.

    Returns:
        The section's entries joined into one string.
    """
    # Only fills in the section when the caller did not name one, so existing
    # calls that pass section_key explicitly behave exactly as before.
    kwargs.setdefault("section_key", "abstract")
    return extract_paper_component(paper_json, **kwargs)


def extract_paper_data(json_pth, folder_mode):
Expand All @@ -33,34 +33,32 @@ def extract_paper_data(json_pth, folder_mode):

# get the common json sections
paper_text = extract_body_text(data, section_key="body_text", text_key="text")
pid = data['paper_id']
title = data['metadata']['title']
num_authors = len(data['metadata']['authors'])
pid = data["paper_id"]
title = data["metadata"]["title"]
num_authors = len(data["metadata"]["authors"])

if folder_mode == "pdf_json":
abstract_text = extract_abstract_text(data, section_key="abstract", text_key="text")
abstract_text = extract_abstract_text(
data, section_key="abstract", text_key="text"
)
paper_data = pd.DataFrame(
data = [
[pid, num_authors, title, abstract_text, paper_text]
],
columns = ["pid", "num_authors", "title", "abstract", "text"]
data=[[pid, num_authors, title, abstract_text, paper_text]],
columns=["pid", "num_authors", "title", "abstract", "text"],
)
return(paper_data)
return paper_data
elif folder_mode == "pmc_json":
paper_data = pd.DataFrame(
data = [
[pid, num_authors, title, paper_text]
],
columns = ["pid", "num_authors", "title", "text"]
data=[[pid, num_authors, title, paper_text]],
columns=["pid", "num_authors", "title", "text"],
)
return(paper_data)
return paper_data
else:
raise ValueError(f"Unknown value passed into 'folder_mode': {folder_mode}")



# deletes old datafile that was created, the name was not descriptive enough
if (old := here(f"./data/db/final/kaggle/paper_text/comm_use_subset.tsv", warn=False)): old.unlink(missing_ok=True)
if (old := here(f"./data/db/final/kaggle/paper_text/comm_use_subset.tsv", warn=False)) :
old.unlink(missing_ok=True)

# make the folder that data will be saved into
pl.Path(here("./data/db/final/kaggle/paper_text/")).mkdir(parents=True, exist_ok=True)
Expand All @@ -73,7 +71,7 @@ def extract_paper_data(json_pth, folder_mode):

# start going through folders and parsing json files into dataframes and tsv files
datpb = tqdm([data_sources[1]])
for dat_source in datpb: # for each data source
for dat_source in datpb: # for each data source
datpb.set_description(f"{dat_source}")

ds_hr = here(f"./data/db/original/kaggle/{dat_source}/{dat_source}/", warn=False)
Expand All @@ -82,26 +80,35 @@ def extract_paper_data(json_pth, folder_mode):
# print(f"\n\n{dat_source}")

# Parse every group folder of the current data source into one DataFrame and
# write it out as a TSV file named "<dat_source>_<group folder name>.tsv".
gppb = tqdm(fdrs)
for gp in gppb:  # for each group path
    gppb.set_description(f"{dat_source}/{gp.name}")

    # Keep the directory listing inside the try: depending on the Python
    # version, a missing folder is reported either by iterdir() itself or only
    # once the iterator is consumed.
    try:
        fs = list(gp.iterdir())
    except FileNotFoundError:
        # Interpolate `gp`, the path that was actually being listed.
        sys.exit(
            f"Could not find {gp}, did you forget to update the Kaggle dataset?"
        )

    # The folder name doubles as the parsing mode ("pdf_json" / "pmc_json").
    papers = pd.concat(
        [extract_paper_data(jsn, folder_mode=gp.name) for jsn in tqdm(fs)]
    )

    # writes new data
    papers.to_csv(
        here(
            f"./data/db/final/kaggle/paper_text/{dat_source}_{gp.name}.tsv",
            warn=False,
        ),
        sep="\t",
        header=True,
        index=False,
    )

0 comments on commit fa72b8a

Please sign in to comment.