diff --git a/analysis/db/dan/load_data.py b/analysis/db/dan/load_data.py
index e4e3e8d..1c079ae 100644
--- a/analysis/db/dan/load_data.py
+++ b/analysis/db/dan/load_data.py
@@ -11,17 +11,17 @@
 def extract_paper_component(paper_json, section_key, text_key="text", line_delim="\n"):
     section_text = map(lambda x: x[text_key], paper_json[section_key])
     text = line_delim.join(section_text)
-    return(text)
+    return text


 def extract_body_text(paper_json, **kwargs):
     paper_text = extract_paper_component(paper_json, **kwargs)
-    return(paper_text)
+    return paper_text


 def extract_abstract_text(paper_json, **kwargs):
     abstract_text = extract_paper_component(paper_json, **kwargs)
-    return(abstract_text)
+    return abstract_text


 def extract_paper_data(json_pth, folder_mode):
@@ -33,34 +33,32 @@ def extract_paper_data(json_pth, folder_mode):

     # get the common json sections
     paper_text = extract_body_text(data, section_key="body_text", text_key="text")
-    pid = data['paper_id']
-    title = data['metadata']['title']
-    num_authors = len(data['metadata']['authors'])
+    pid = data["paper_id"]
+    title = data["metadata"]["title"]
+    num_authors = len(data["metadata"]["authors"])

     if folder_mode == "pdf_json":
-        abstract_text = extract_abstract_text(data, section_key="abstract", text_key="text")
+        abstract_text = extract_abstract_text(
+            data, section_key="abstract", text_key="text"
+        )
         paper_data = pd.DataFrame(
-            data = [
-                [pid, num_authors, title, abstract_text, paper_text]
-            ],
-            columns = ["pid", "num_authors", "title", "abstract", "text"]
+            data=[[pid, num_authors, title, abstract_text, paper_text]],
+            columns=["pid", "num_authors", "title", "abstract", "text"],
         )
-        return(paper_data)
+        return paper_data
     elif folder_mode == "pmc_json":
         paper_data = pd.DataFrame(
-            data = [
-                [pid, num_authors, title, paper_text]
-            ],
-            columns = ["pid", "num_authors", "title", "text"]
+            data=[[pid, num_authors, title, paper_text]],
+            columns=["pid", "num_authors", "title", "text"],
         )
-        return(paper_data)
+        return paper_data
     else:
         raise ValueError(f"Unknown value passed into 'folder_mode': {folder_mode}")


-
 # deletes old datafile that was created, the name was not descriptive enough
-if (old := here(f"./data/db/final/kaggle/paper_text/comm_use_subset.tsv", warn=False)): old.unlink(missing_ok=True)
+if (old := here(f"./data/db/final/kaggle/paper_text/comm_use_subset.tsv", warn=False)):
+    old.unlink(missing_ok=True)

 # make the folder that data will be saved into
 pl.Path(here("./data/db/final/kaggle/paper_text/")).mkdir(parents=True, exist_ok=True)
@@ -73,7 +71,7 @@ def extract_paper_data(json_pth, folder_mode):

 # start going through folders and parsing json files into dataframes and tsv files
 datpb = tqdm([data_sources[1]])
-for dat_source in datpb: # for each data source
+for dat_source in datpb:  # for each data source
     datpb.set_description(f"{dat_source}")

     ds_hr = here(f"./data/db/original/kaggle/{dat_source}/{dat_source}/", warn=False)
@@ -82,26 +80,35 @@ def extract_paper_data(json_pth, folder_mode):

     # print(f"\n\n{dat_source}")

     gppb = tqdm(fdrs)
-    for gp in gppb: # for each group path
-        #if gp.name == "pmc_json": continue
+    for gp in gppb:  # for each group path
+        # if gp.name == "pmc_json": continue
         gppb.set_description(f"{dat_source}/{gp.name}")
-        #print(f"\n\n{gp}")
+        # print(f"\n\n{gp}")

-        #gp_hr = here(f"./data/db/original/kaggle/{dat_source}/{dat_source}/{gp}", warn=False)
+        # gp_hr = here(f"./data/db/original/kaggle/{dat_source}/{dat_source}/{gp}", warn=False)
         fs = gp.iterdir()

         try:
             fs = list(fs)
         except FileNotFoundError:
-            sys.exit(f"Could not find {gp_hr}, did you forget to update the Kaggle dataset?")
+            sys.exit(
+                f"Could not find {gp}, did you forget to update the Kaggle dataset?"
+            )

         papers = pd.concat(
             [extract_paper_data(jsn, folder_mode=gp.name) for jsn in tqdm(fs)]
         )
-        #papers.info()
+        # papers.info()

         # writes new data
-        papers.to_csv(here(f"./data/db/final/kaggle/paper_text/{dat_source}_{gp.name}.tsv", warn=False),
-                      sep="\t", header=True, index=False)
+        papers.to_csv(
+            here(
+                f"./data/db/final/kaggle/paper_text/{dat_source}_{gp.name}.tsv",
+                warn=False,
+            ),
+            sep="\t",
+            header=True,
+            index=False,
+        )

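A minimal sketch (not part of the change above) of how the refactored extract_paper_component helper behaves, assuming it can be imported from analysis/db/dan/load_data.py; the toy dict and its values are hypothetical, but the key layout mirrors what the script reads from the Kaggle JSON files (paper_id, metadata, abstract, body_text):

# Illustrative only; assumes load_data.py is importable from the current path.
from load_data import extract_paper_component

toy_paper = {
    "paper_id": "toy0001",
    "metadata": {"title": "A toy paper", "authors": [{"first": "A"}, {"first": "B"}]},
    "abstract": [{"text": "Abstract sentence one."}, {"text": "Abstract sentence two."}],
    "body_text": [{"text": "Intro paragraph."}, {"text": "Methods paragraph."}],
}

# Each section is a list of {"text": ...} blocks; the helper joins them with "\n".
body = extract_paper_component(toy_paper, section_key="body_text", text_key="text")
abstract = extract_paper_component(toy_paper, section_key="abstract", text_key="text")
assert body == "Intro paragraph.\nMethods paragraph."
assert abstract == "Abstract sentence one.\nAbstract sentence two."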