In [1]:
import json
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import gzip
import tqdm
import polars as pl
from tqdm import tqdm
from collections import defaultdict
# import pyarrow as pa

def flatten(matrix):
    """Concatenate the rows of ``matrix`` into a single flat list.

    Rows are visited in order and the items of each row keep their
    relative order in the result.
    """
    flat = []
    for row in matrix:
        flat.extend(row)
    return flat

def simpleId(text):
    """Return the last '/'-separated segment of ``text``.

    For a string without any '/', the whole string is returned.

    Parameters
    ----------
    text : str
        A slash-separated identifier (e.g. a URL-style id).

    Returns
    -------
    str
        The trailing segment, or the sentinel ``'NONE'`` when ``text`` is not
        something that can be split on a ``str`` separator (e.g. ``None``).
    """
    try:
        return text.split('/')[-1]
    except (AttributeError, TypeError):
        # AttributeError: no .split (None, numbers, ...).
        # TypeError: .split exists but rejects a str separator (bytes).
        # Deliberately not a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate and the notebook can be interrupted.
        return 'NONE'

def flush_buffer(lines, output_dir, schema=None):
    """Append the buffered rows to ``funders.csv`` inside ``output_dir``.

    Parameters
    ----------
    lines : list
        Buffered rows; each row is a sequence of values, one per column.
    output_dir : str
        Directory in which ``funders.csv`` is created or appended to.
    schema : optional
        Column names handed to ``pl.DataFrame``. When omitted, the
        module-level ``columns`` variable is used, so existing two-argument
        calls keep working.

    Returns
    -------
    list
        A new empty list, meant to replace the caller's buffer.
    """
    if schema is None:
        # Backward-compatible fallback: `columns` must already be defined at
        # module level by the time this function is called.
        schema = columns
    df = pl.DataFrame(lines, orient="row", schema=schema)

    out_path = os.path.join(output_dir, "funders.csv")
    # The header is written only by the call that creates the file.
    file_exists = os.path.isfile(out_path)
    csv_str = df.write_csv(separator=';', include_header=not file_exists)
    # newline='' writes the serialised text as-is; without it, text mode on
    # Windows would translate every '\n' (including ones inside quoted
    # fields) to '\r\n'.
    with open(out_path, 'a', encoding='utf-8', newline='') as f:
        f.write(csv_str)
    return []

In [2]:
# Folder holding the funders part of the OpenAlex snapshot. The path is
# relative, so it resolves against the current working directory.
snapshot_subfolder = "data/openalex-snapshot/data/funders/"

In [3]:
# Names inside the snapshot folder that contain 'updated', in sorted order.
listdir = sorted(entry for entry in os.listdir(snapshot_subfolder) if 'updated' in entry)
print(f"Found {len(listdir)} subfolders")

Found 144 subfolders


In [4]:
# Show what the first matching subfolder contains.
print(os.listdir(f"{snapshot_subfolder}{listdir[0]}"))

['part_000.gz']


In [5]:
# Build the full list of 'part' files across all selected subfolders.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the processing order (and thus the CSV row order) reproducible.
files = [
    snapshot_subfolder + subfolder + '/' + part_name
    for subfolder in listdir
    for part_name in sorted(os.listdir(snapshot_subfolder + subfolder))
    if 'part' in part_name
]
print(f"Found {len(files)} files")

Found 144 files


In [6]:
# Print one path as a sanity check of the path construction.
print(f"Example file: {files[0]}")

Example file: data/openalex-snapshot/data/funders/updated_date=2024-02-07/part_000.gz


In [7]:
destination_csv_folder = "data/funders/"
os.makedirs(destination_csv_folder, exist_ok=True)

# Safety guard: the export appends to an existing CSV, so running it against
# a non-empty destination folder could duplicate rows. Abort before writing.
existing = os.listdir(destination_csv_folder)  # [] when the folder is empty
if len(existing) > 0:
    raise RuntimeError(
        f"Destination folder '{destination_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

# Column names of the output CSV; flush_buffer reads this module-level name.
columns = ['id', 'display_name', 'country_code', 'grants_count', 'works_count']
lines = []
for gzfile in tqdm(files):
    # Each line of a part file is parsed as one JSON document. The encoding is
    # given explicitly because gzip.open's text mode otherwise falls back to
    # the locale's default encoding, which is not UTF-8 on every platform.
    with gzip.open(gzfile, 'rt', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
                lines.append([
                    simpleId(data['id']),
                    # ';' is the CSV separator, so it is replaced in names.
                    data['display_name'].replace(';', '.'),
                    # A missing country is stored as an empty string.
                    '' if data['country_code'] is None else data['country_code'],
                    data["grants_count"],
                    data["works_count"],
                ])
            except json.JSONDecodeError as e:
                # Malformed lines are reported and skipped.
                print(f"Errore nel parsing della riga: {e}")
            # Flush periodically so the in-memory buffer stays small.
            if len(lines) > 1000:
                lines = flush_buffer(lines, destination_csv_folder)
# Write whatever is left in the buffer after the last file.
lines = flush_buffer(lines, destination_csv_folder)

100%|██████████| 144/144 [00:01<00:00, 74.98it/s] 
