In [1]:
import json
import os
# Move the working directory up one level (to the repository root folder) so that the
# relative "data/..." paths used below resolve.
# NOT idempotent: every re-run of this cell moves up one more level (DO ONLY ONCE per kernel!)
os.chdir("../")
import gzip
import tqdm
import polars as pl
from tqdm import tqdm
from collections import defaultdict
# import pyarrow as pa

def flatten(matrix):
    """Concatenate the rows of `matrix` (an iterable of iterables) into one flat list.

    The order of the items is preserved: all items of the first row, then all
    items of the second row, and so on.
    """
    flat = []
    for row in matrix:
        flat.extend(row)
    return flat

def simpleId(text):
    """Return the last '/'-separated segment of `text`.

    For a URL-style identifier such as 'https://openalex.org/G5591213662' this
    is the bare ID ('G5591213662'). A string without any '/' is returned unchanged.

    Returns the literal string 'NONE' when `text` cannot be split on '/'
    (e.g. it is None, a number or bytes).
    """
    try:
        return text.split('/')[-1]
    except (AttributeError, TypeError):
        # Only "not a splittable string" is treated as a missing ID; anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate instead of being hidden.
        return 'NONE'

def flush_buffer(lines, output_dir, schema=None, filename="awards.csv"):
    """Append the buffered rows to a ';'-separated CSV file and return a new empty buffer.

    Parameters
    ----------
    lines : list of rows, each row a list of values ordered like `schema`.
    output_dir : folder in which the CSV file is created or extended.
    schema : column names for the rows. When omitted, the module-level `columns`
        list is used, so that name must be defined before the first call.
    filename : name of the CSV file inside `output_dir`.

    Returns
    -------
    A new empty list, meant to be re-assigned to the caller's buffer variable.
    """
    if schema is None:
        schema = columns  # backward-compatible default: notebook-level column list
    df = pl.DataFrame(lines, orient="row", schema=schema)

    out_path = os.path.join(output_dir, filename)
    # The header is written only when the file is created, so that repeated
    # flushes into the same file produce a single header line.
    file_exists = os.path.isfile(out_path)
    csv_str = df.write_csv(separator=';', include_header=not file_exists)
    # newline='' writes the serialized text verbatim: without it, text mode on
    # Windows would turn every '\n' (including those inside quoted fields) into '\r\n'.
    with open(out_path, 'a', encoding='utf-8', newline='') as f:
        f.write(csv_str)
    return []

In [2]:
# Folder (relative to the current working directory) holding the awards snapshot.
# The trailing '/' matters: the cells below build paths by plain string concatenation.
snapshot_subfolder = "data/openalex-snapshot/data/awards/"

In [3]:
# Keep, in sorted order, only the snapshot entries whose name contains 'updated'
listdir = list(filter(lambda name: 'updated' in name, sorted(os.listdir(snapshot_subfolder))))
print(f"Found {len(listdir)} subfolders")

Found 941 subfolders


In [4]:
# Peek at the content of the first selected subfolder
print(os.listdir(os.path.join(snapshot_subfolder, listdir[0])))

['part_000.gz']


In [5]:
# Full list of data files: every entry whose name contains 'part', for every selected subfolder.
# os.listdir() returns entries in arbitrary order, so each subfolder is sorted to make
# the processing order (and therefore the row order of the exported CSV) reproducible.
files = [
    snapshot_subfolder + subfolder + '/' + part
    for subfolder in listdir
    for part in sorted(os.listdir(snapshot_subfolder + subfolder))
    if 'part' in part
]
print(f"Found {len(files)} files")

Found 943 files


In [6]:
# Show one path to check the <snapshot folder>/<subfolder>/<part file> layout built above
print("Example file:", files[0])

Example file: data/openalex-snapshot/data/awards/updated_date=2022-03-28/part_000.gz


In [7]:
# A check: find the first award whose lead investigator has a non-empty ORCID
# and display the whole record, to inspect what a populated entry looks like.
data = None  # stays None when no award in the scanned files has an ORCID
for gzfile in tqdm(files):
    # Each line of a part file is parsed as one JSON document; decode as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with gzip.open(gzfile, 'rt', encoding='utf-8') as file:
        for line in file:
            record = json.loads(line)
            investigator = record["lead_investigator"]
            if investigator is not None and investigator["orcid"]:
                data = record
                break
    if data is not None:
        # Found one: no need to open the remaining files
        break
data

  0%|          | 0/943 [00:00<?, ?it/s]


{'id': 'https://openalex.org/G5591213662',
 'display_name': 'Differentiation of NF1(+/-) and NF1(-/-) iPS cells into cells of the Neural Crest-Schwann cell lineage: setting up conditions, characterization of the different differentiation steps and understanding epigenetic status of PNF-derived iPS cells',
 'description': 'To set up in vitro conditions to efficiently differentiate iPS cells into cells of the Neural Crest (NC)-&amp;#173;-Schwann cell (SC) lineage. Characterize the methylome and transcriptome of these cells with distinct NF1 genotypes and test their potential engraftment/tumor formation capacity. Generate new iPSC and edit control iPSC with distinct NF1 genotypes to\ninterrogate the presence of epigenetic memory in the PNF-&amp;#173;-derived iPS cells generated.',
 'funder_award_id': '152010',
 'amount': None,
 'currency': None,
 'funder': {'id': 'https://openalex.org/F4320333709',
  'display_name': 'Neurofibromatosis Therapeutic Acceleration Program',
  'ror_id': None,
 

In [9]:
destination_csv_folder = "data/awards/"
os.makedirs(destination_csv_folder, exist_ok=True)

# Before doing any damage, stop if the destination folder already contains files:
# flush_buffer() appends to an existing CSV, so a re-run would create many duplicates.
existing = os.listdir(destination_csv_folder)  # [] when the folder is empty
if existing:
    raise RuntimeError(
        f"Destination folder '{destination_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

# Output columns, in the same order as the values of each row built below.
# flush_buffer() picks up this module-level name as the schema of the rows.
columns = ['id', 'display_name', 'description', 'funder_id', 'funding_type', 'funded_outputs_count', 'funded_outputs', 'start_date', 'end_date', 'doi']
# TODO: investigators information does not have IDs and it's not reliable information, so this is not currently added
# it seems also funded_outputs is mostly empty

FLUSH_THRESHOLD = 1000  # the buffer is written to disk once it holds more rows than this
FUNDED_OUTPUTS_SEPARATOR = "|"  # delimiter between work IDs; must differ from the CSV separator ';'


def _award_to_row(data):
    """Flatten one parsed award record (a dict) into a row ordered like `columns`.

    Missing (None) text, date and DOI values become empty strings; `funding_type`
    and `funded_outputs_count` are passed through unchanged.
    """
    display_name = data['display_name']
    description = data['description']
    funder = data['funder']
    # A None list of funded outputs is treated like an empty one.
    # NOTE(review): assumes each entry is a work URL/ID string -- confirm against the snapshot schema.
    funded_outputs = data["funded_outputs"] or []
    return [
        simpleId(data['id']),
        # ';' is the CSV separator, so it is replaced inside the free-text fields
        display_name.replace(';', '.') if display_name is not None else "",
        description.replace(';', '.') if description is not None else "",
        simpleId(funder['id']) if funder is not None else "",
        data['funding_type'],
        data["funded_outputs_count"],
        # Joined with an explicit delimiter: plain concatenation glued the IDs
        # together ("W1W2...") and made them hard to split back reliably.
        FUNDED_OUTPUTS_SEPARATOR.join(simpleId(work) for work in funded_outputs),
        data["start_date"] if data['start_date'] is not None else "",
        data["end_date"] if data['end_date'] is not None else "",
        data["doi"] if data['doi'] is not None else "",
    ]


lines = []
for gzfile in tqdm(files):
    # Decode as UTF-8 explicitly instead of relying on the platform default encoding
    with gzip.open(gzfile, 'rt', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                # A malformed line is reported and skipped; the export goes on
                print(f"Errore nel parsing della riga: {e}")
            else:
                lines.append(_award_to_row(data))
            if len(lines) > FLUSH_THRESHOLD:
                lines = flush_buffer(lines, destination_csv_folder)
# Write whatever is still buffered after the last file
lines = flush_buffer(lines, destination_csv_folder)

100%|██████████| 943/943 [06:14<00:00,  2.52it/s] 
