In [1]:
import json
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import gzip
import tqdm
import polars as pl
from tqdm import tqdm
from collections import defaultdict
# import pyarrow as pa

def flatten(matrix):
    """Concatenate the items of every row of `matrix` into a single flat list.

    Only one level of nesting is removed and the original order is preserved.
    """
    flat = []
    for row in matrix:
        flat.extend(row)
    return flat

def simpleId(text):
    """Return the part of `text` after its last '/' (the whole string if there is none).

    Values that cannot be split on a str separator (e.g. None, numbers, bytes)
    yield the placeholder 'NONE' instead of raising.
    """
    try:
        # Only the last path segment is needed, so split once from the right.
        tail = text.rsplit('/', 1)[-1]
    except (AttributeError, TypeError):
        # Narrow catch on purpose: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit while a long loop is running.
        return 'NONE'
    return tail

def flush_buffer(lines, output_dir, schema=None):
    """Append the buffered rows to `authors.csv` inside `output_dir`.

    Parameters
    ----------
    lines : list
        Buffered rows; each row is a sequence of values in column order.
    output_dir : str
        Folder that contains (or will contain) `authors.csv`.
    schema : optional
        Column definition passed to `pl.DataFrame`. When omitted, the
        notebook-level `columns` list is used, which keeps existing two-argument
        calls working.

    Returns
    -------
    list
        A new empty list, meant to replace the caller's buffer.
    """
    if schema is None:
        # Backward-compatible fallback on the global defined in the conversion cell.
        schema = columns
    df = pl.DataFrame(lines, orient="row", schema=schema)

    out_path = os.path.join(output_dir, "authors.csv")
    # Write the header only when the file is being started: either it does not
    # exist yet, or it exists but is still empty (otherwise it would never get one).
    needs_header = not os.path.isfile(out_path) or os.path.getsize(out_path) == 0

    # With no target given, polars' write_csv returns the CSV text, which is
    # then appended to the output file.
    csv_str = df.write_csv(separator=',', include_header=needs_header)
    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(csv_str)
    return []

In [2]:
# Folder of the OpenAlex snapshot that is scanned for author files, relative to
# the current working directory. Keep the trailing slash: the following cells
# build paths by plain string concatenation (snapshot_subfolder + name).
snapshot_subfolder = "data/openalex-snapshot/data/authors/"

In [3]:
# Names of the snapshot entries that contain 'updated', in sorted order.
listdir = sorted(
    entry for entry in os.listdir(snapshot_subfolder) if 'updated' in entry
)
print(f"Found {len(listdir)} subfolders")

Found 2 subfolders


In [4]:
# Peek at the raw content of the first matching subfolder.
print(os.listdir(f"{snapshot_subfolder}{listdir[0]}"))

['part_0219.gz', 'part_0116.gz', 'part_0302.gz', 'part_0279.gz', 'part_0201.gz', 'part_0211.gz', 'part_0235.gz', 'part_0215.gz', 'part_0191.gz', 'part_0065.gz', 'part_0172.gz', 'part_0103.gz', 'part_0009.gz', 'part_0052.gz', 'part_0255.gz', 'part_0026.gz', 'part_0227.gz', 'part_0231.gz', 'part_0193.gz', 'part_0264.gz', 'part_0159.gz', 'part_0155.gz', 'part_0095.gz', 'part_0045.gz', 'part_0107.gz', 'part_0237.gz', 'part_0126.gz', 'part_0106.gz', 'part_0229.gz', 'part_0307.gz', 'part_0256.gz', 'part_0016.gz', 'part_0138.gz', 'part_0147.gz', 'part_0290.gz', 'part_0173.gz', 'part_0079.gz', 'part_0068.gz', 'part_0165.gz', 'part_0060.gz', 'part_0177.gz', 'part_0120.gz', 'part_0309.gz', 'part_0294.gz', 'part_0198.gz', 'part_0028.gz', 'part_0241.gz', 'part_0259.gz', 'part_0129.gz', 'part_0166.gz', 'part_0266.gz', 'part_0272.gz', 'part_0301.gz', 'part_0300.gz', 'part_0073.gz', 'part_0282.gz', 'part_0153.gz', 'part_0158.gz', 'part_0130.gz', 'part_0144.gz', 'part_0233.gz', 'part_0143.gz', 'part_0

In [5]:
# Collect the path of every entry containing 'part' in each selected subfolder,
# subfolder by subfolder, in the order os.listdir returns them.
files = [
    snapshot_subfolder + subfolder + '/' + part_name
    for subfolder in listdir
    for part_name in os.listdir(snapshot_subfolder + subfolder)
    if 'part' in part_name
]
print(f"Found {len(files)} files")

Found 413 files


In [6]:
# Show one collected path as a sanity check of the path construction.
print(f"Example file: {files[0]}")

Example file: data/openalex-snapshot/data/authors/updated_date=2026-02-01/part_0219.gz


In [7]:
destination_csv_folder = "data/authors/"
os.makedirs(destination_csv_folder, exist_ok=True)

# Before doing any damage, stop if the destination folder already holds files:
# flush_buffer appends to the CSV, so re-running would create duplicate rows.
existing = os.listdir(destination_csv_folder)  # [] when the folder is empty
if existing:
    raise RuntimeError(
        f"Destination folder '{destination_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

# Column order of every row built below (flush_buffer reads this global).
columns=['id', 'display_name','orcid','works_count','h_index','cited_by_count','topics','affiliations','works_count_by_year','cited_by_count_by_year']
# The buffer is written to disk as soon as it holds more than this many rows.
FLUSH_THRESHOLD = 1000


def _author_row(data):
    """Build one output row (ordered like `columns`) from a decoded author record.

    Missing or malformed `topics` / affiliation data degrade to an empty string.
    The scalar fields and `counts_by_year` are required: a record lacking them
    raises, exactly as the inline code did before.
    """
    orcid_url = data.get('orcid')
    # Keep only the identifier that follows the ORCID URL prefix.
    orcid = orcid_url.split('https://orcid.org/')[-1] if orcid_url else ''

    row = [
        simpleId(data['id']),
        data['display_name'],
        orcid,
        int(data['works_count']),
        int(data['summary_stats']['h_index']),
        int(data['cited_by_count']),
    ]

    # topics: "<topic id>_<count>" entries joined with ';'
    # `except Exception` (not a bare except) keeps the best-effort behaviour
    # without swallowing KeyboardInterrupt during the multi-hour run.
    try:
        topics = ';'.join(
            simpleId(topic['id']) + '_' + str(topic['count'])
            for topic in data['topics']
        )
    except Exception:
        topics = ''
    row.append(topics)

    # institutions: "<institution id>_<country>_<year>_<year>..." joined with ';'
    # NOTE: an empty `affiliations` list does not raise, so it yields '' without
    # trying the fallbacks below.
    try:
        affiliations = ';'.join(
            simpleId(aff['institution']['id'])
            + '_' + simpleId(aff['institution']['country_code'])
            + '_' + '_'.join(str(year) for year in aff['years'])
            for aff in data['affiliations']
        )
    except Exception:
        # If not available, fall back to the last known institution(s),
        # which carry no years.
        try:
            affiliations = ';'.join(
                simpleId(inst['id']) + '_' + inst['country_code']
                for inst in data['last_known_institutions']
            )
        except Exception:
            try:
                last_inst = data['last_known_institution']
                affiliations = simpleId(last_inst['id']) + '_' + last_inst['country_code']
            except Exception:
                affiliations = ''
    row.append(affiliations)

    yearly_counts = data['counts_by_year']
    # works_count by year: "<works_count>_<year>", only years with a positive count
    row.append(';'.join(
        str(entry['works_count']) + '_' + str(entry['year'])
        for entry in yearly_counts if entry['works_count'] > 0
    ))
    # cited_by_count by year: "<cited_by_count>_<year>", only years with a positive count
    row.append(';'.join(
        str(entry['cited_by_count']) + '_' + str(entry['year'])
        for entry in yearly_counts if entry['cited_by_count'] > 0
    ))
    return row


lines=[]
for gzfile in tqdm(files):
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with gzip.open(gzfile, 'rt', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                # Lines that are not valid JSON are reported and skipped
                # (the message reads "Error while parsing the line").
                print(f"Errore nel parsing della riga: {e}")
                continue
            lines.append(_author_row(data))
            if len(lines) > FLUSH_THRESHOLD:
                lines = flush_buffer(lines, destination_csv_folder)
# Write whatever is left in the buffer after the last file.
lines = flush_buffer(lines, destination_csv_folder)

100%|██████████| 413/413 [4:53:21<00:00, 42.62s/it]   
