In [1]:
import json
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import gzip
import tqdm
import polars as pl
from tqdm import tqdm
from collections import defaultdict
# import pyarrow as pa

def flatten(matrix):
    """Concatenate the rows of ``matrix`` into one flat list (one level deep)."""
    flat = []
    for row in matrix:
        flat.extend(row)
    return flat

def simpleId(text):
    """Return the last '/'-separated segment of ``text``.

    For example ``'https://openalex.org/W123'`` gives ``'W123'``. A string
    without any '/' is returned unchanged.

    Returns the sentinel string ``'NONE'`` when ``text`` cannot be split
    (e.g. it is ``None`` or not a string).
    """
    try:
        # Only the final segment is needed, so split once from the right.
        return text.rsplit('/', 1)[-1]
    except (AttributeError, TypeError):
        # Catch only what a non-string input raises; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return 'NONE'

def rebuild_abstract(word_dict_full): # from the key 'abstract_inverted_index' key per paper
    """Rebuild the abstract text from an inverted index.

    Parameters
    ----------
    word_dict_full : dict
        Maps each word to the list of positions at which it occurs in the
        abstract. Words whose value is ``None`` are ignored.

    Returns
    -------
    str
        The words joined by single spaces in position order, or the sentinel
        string ``'NONE'`` when the text cannot be rebuilt: the input is not a
        mapping (e.g. ``None``), it is empty, or some position between 0 and
        the largest index has no word assigned.
    """
    try:
        positions = {
            word: list(indexes)
            for word, indexes in word_dict_full.items()
            if indexes is not None
        }
        # The largest position fixes the number of slots; never less than one slot.
        max_index = max([0] + [idx for indexes in positions.values() for idx in indexes])
        tokens = [None] * (max_index + 1)
        for word, indexes in positions.items():
            for idx in indexes:
                tokens[idx] = word
        # A slot left unfilled means the index is incomplete (or empty).
        if any(token is None for token in tokens):
            return 'NONE'
        return ' '.join(tokens)
    except Exception:
        # Malformed input yields the sentinel. ``Exception`` rather than a bare
        # ``except`` so that KeyboardInterrupt still stops a long-running loop.
        return 'NONE'

def flush_buffers(buffers, output_dir, topic_id = None):
    """Append buffered rows to per-topic CSV files and empty the flushed buffers.

    Each topic is written to ``<output_dir>/<topic_id>.csv`` with ';' as the
    separator. The header line is written only when the file does not exist
    yet; otherwise the rows are appended to the existing file.

    Parameters
    ----------
    buffers : dict
        Maps topic_id -> list of rows. Modified in place: every flushed topic
        is reset to an empty list.
    output_dir : str
        Folder that receives the CSV files.
    topic_id : optional
        When given, only that topic's buffer is flushed; when ``None``
        (default) all buffers are flushed.

    Returns
    -------
    dict
        The same ``buffers`` object that was passed in.

    Notes
    -----
    Reads the module-level name ``columns`` as the DataFrame schema, so that
    name must be defined before the first call.
    """
    if topic_id is not None:
        # .get() avoids a KeyError on a plain dict and does not insert an
        # empty entry into a defaultdict for a topic that was never buffered.
        buffers_to_flush = {topic_id: buffers.get(topic_id, [])}
    else:
        buffers_to_flush = buffers

    # Iterate over a snapshot because ``buffers`` is reassigned inside the loop.
    for tid, rows in list(buffers_to_flush.items()):
        if not rows:
            continue
        df = pl.DataFrame(rows, orient="row", schema=columns)

        out_path = os.path.join(output_dir, f"{tid}.csv")
        file_exists = os.path.isfile(out_path)
        # Serialise to a string first, then append, so the header is emitted
        # only for a brand-new file.
        csv_str = df.write_csv(separator=';', include_header=not file_exists)
        with open(out_path, 'a', encoding='utf-8') as f:
            f.write(csv_str)
        # Reset the flushed buffer
        buffers[tid] = []
    return buffers

In [2]:
# Folder holding the "works" part of the snapshot, relative to the repository
# root (the working directory set by os.chdir("../") in the first cell).
# Keep the trailing '/': paths are built from it by plain string concatenation.
snapshot_subfolder = "data/openalex-snapshot/data/works/"

In [3]:
# Names of the snapshot partitions: entries whose name contains 'updated', in sorted order
listdir = sorted(
    name
    for name in os.listdir(snapshot_subfolder)
    if 'updated' in name
)
print(f"Found {len(listdir)} subfolders")

Found 424 subfolders


In [4]:
# Peek inside the first partition to see how its files are named
os.listdir(f"{snapshot_subfolder}{listdir[0]}")

['part_000.gz']

In [5]:
# Collect the path of every 'part' file in every partition.
# os.listdir() returns entries in arbitrary order, so the file names are sorted
# to make the processing order (and therefore the CSV row order) reproducible.
files = [
    os.path.join(snapshot_subfolder, subfolder, name)
    for subfolder in listdir
    for name in sorted(os.listdir(os.path.join(snapshot_subfolder, subfolder)))
    if 'part' in name
]
print(f"Found {len(files)} files")

Found 893 files


In [6]:
# Sanity check: show the first path in the list of files to process
print("Example file:", files[0])

Example file: data/openalex-snapshot/data/works/updated_date=2023-05-17/part_000.gz


In [None]:
destination_csv_folder = "data/works2text_by_topic_csv/"
os.makedirs(destination_csv_folder, exist_ok=True)

# Before doing any damage: flush_buffers() appends to existing CSV files, so
# running this cell on a non-empty destination would produce duplicate rows.
existing = os.listdir(destination_csv_folder)  # [] when the folder is empty
if existing:
    raise RuntimeError(
        f"Destination folder '{destination_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

# Column order of the per-topic CSV files (read by flush_buffers as the schema)
columns = ['id', 'title', 'abstract']

# Topic buffers: key = topic_id, value = list of rows (as dicts)
buffers = defaultdict(list)
BUFFER_SIZE = 1000  # flush every N rows per topic

# A work is kept only when all of these fields are present and non-empty
required_fields = ('primary_topic', 'publication_date', 'authorships')

for gzfile in tqdm(files):
    # Explicit encoding: without it, text mode decodes with the locale's
    # default encoding, which is not UTF-8 on every platform.
    with gzip.open(gzfile, 'rt', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            if not all(data.get(k) for k in required_fields):
                continue

            row = {}
            primary_topic = simpleId(data['primary_topic']['id'])
            row['id'] = simpleId(data['id'])

            # ';' is the CSV separator used by flush_buffers, so it is replaced
            # in free text. A missing or non-string title becomes ''.
            title = data.get('title')
            row['title'] = title.replace(';', '.') if isinstance(title, str) else ''

            # rebuild_abstract always returns a string ('NONE' when the index
            # is unusable); only a missing key yields an empty abstract.
            if 'abstract_inverted_index' in data:
                row['abstract'] = rebuild_abstract(data['abstract_inverted_index']).replace(';', '.')
            else:
                row['abstract'] = ''

            buffers[primary_topic].append(row)

            # Flush if buffer is too large
            if len(buffers[primary_topic]) >= BUFFER_SIZE:
                buffers = flush_buffers(buffers, destination_csv_folder, primary_topic)

# Final flush
buffers = flush_buffers(buffers, destination_csv_folder)


 20%|█▉        | 175/893 [00:31<04:07,  2.90it/s] 