In [1]:
import os

# Move from the notebook's folder to the repository root so that the relative
# paths used below ("data/...", "utils") resolve. The guard makes this cell safe
# to re-run: once the working directory already contains "utils", executing the
# cell again no longer climbs one more level up.
# NOTE(review): assumes the notebook's own folder has no "utils" subfolder.
if not os.path.isdir("utils"):
    os.chdir("../")

import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
tqdm.pandas()  # enables the progress_* methods of tqdm on pandas objects

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys

# Make the `utils` folder under the current working directory importable.
# The membership check keeps sys.path free of duplicate entries when this cell
# is executed more than once.
utils_dir = os.path.join(os.getcwd(), 'utils')
if utils_dir not in sys.path:
    sys.path.insert(0, utils_dir)

# NOTE(review): the wildcard import hides which names this notebook takes from
# `read_parquet`; switch to explicit names once the used ones are confirmed.
from read_parquet import *

Using PyArrow strings!


In [3]:
# Output location: per-topic CSV files are written into this folder.
works2citations_by_topic_csv_folder = "data/works2citations_by_topic_csv/"
# Input location: folder holding the per-topic reference parquet files.
works2references_by_topic_parquet_folder = "data/works2references_by_topic_parquet/"

# Create the output folder (including missing parents); an existing folder is fine.
Path(works2citations_by_topic_csv_folder).mkdir(parents=True, exist_ok=True)

In [5]:
# One topic id per "<topic>.parquet" entry of the input folder. Entries without
# that suffix (hidden files, stray folders, ...) are ignored instead of having
# their last 8 characters cut off blindly. Sorting makes the processing order
# reproducible, because os.listdir returns entries in arbitrary order.
PARQUET_SUFFIX = ".parquet"
topics = sorted(
    name[:-len(PARQUET_SUFFIX)]
    for name in os.listdir(works2references_by_topic_parquet_folder)
    if name.endswith(PARQUET_SUFFIX)
)

In [6]:
# Sanity check: number of topic ids collected from the input folder.
len(topics)

4516

In [7]:
def flush_buffers(buffers, output_dir, topic_id=None):
    """Append buffered rows to per-topic CSV files and empty the flushed buffers.

    Parameters
    ----------
    buffers : dict
        Maps a topic id to a list of rows; each row is a sequence of values laid
        out like the notebook-level ``columns`` list.
    output_dir : str
        Folder that receives one ``<topic_id>.csv`` file per flushed topic.
    topic_id : optional
        When given, only that topic's buffer is flushed; otherwise every buffer
        is. A topic id that has no buffer is silently ignored.

    Returns
    -------
    dict
        The same ``buffers`` object, with every flushed entry reset to ``[]``.

    Notes
    -----
    Reads the notebook-level names ``columns`` and ``DELIMITER`` at call time,
    so both must be defined before the first call that has rows to write.
    """
    if topic_id is not None:
        # .get(): an unknown topic is a no-op instead of a KeyError.
        pending = {topic_id: buffers.get(topic_id)}
    else:
        pending = buffers

    for tid, rows in pending.items():
        if not rows:
            continue
        df = pl.DataFrame(rows, orient="row", schema=columns)
        out_path = os.path.join(output_dir, f"{tid}.csv")
        # Write the header only when starting a file; an existing but empty
        # file still needs one.
        needs_header = not (os.path.isfile(out_path) and os.path.getsize(out_path) > 0)
        csv_str = df.write_csv(separator=DELIMITER, include_header=needs_header)
        # newline='' writes the CSV text exactly as produced, without the
        # platform-dependent translation of "\n" done by text mode.
        with open(out_path, 'a', encoding='utf-8', newline='') as f:
            f.write(csv_str)
        # Re-assigning an existing key does not resize the dict, so this is
        # safe even while iterating over `buffers` itself.
        buffers[tid] = []

    return buffers

In [8]:
BUFFER_SIZE = 5000  # rows held per cited topic before they are appended to disk
DELIMITER = ','

# OUTPUT SCHEMA
columns = [
    "work_id", "primary_topic", "publication_date",
    "referenced_work_id", "referenced_primary_topic", "referenced_publication_date"
]
# Positions of the two topic fields inside a row laid out like `columns`.
PRIMARY_TOPIC_IDX = columns.index("primary_topic")
REFERENCED_TOPIC_IDX = columns.index("referenced_primary_topic")

# Before doing any damage, stop if the destination folder already holds files:
# the CSVs are opened in append mode, so a second run would duplicate rows.
existing = os.listdir(works2citations_by_topic_csv_folder)  # [] when empty
if existing:
    raise RuntimeError(
        f"Destination folder '{works2citations_by_topic_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

# Buffers: cited topic -> list of rows (each row a list ordered like `columns`)
buffers = {}

# Process each topic-level reference file and regroup its rows by the topic of
# the *referenced* work.
# NOTE(review): a vectorised variant (filter + partition_by on
# "referenced_primary_topic") would likely be much faster than iterating rows
# in Python, but needs the column dtypes confirmed first.
for topic in tqdm(topics):
    file = os.path.join(works2references_by_topic_parquet_folder, topic + ".parquet")
    # Read only the six output columns; select() pins their order to `columns`
    # so that every row tuple can be stored as-is.
    df = pl.read_parquet(file, columns=columns).select(columns)

    # Plain tuples instead of named=True: no per-row dict has to be built.
    for row in df.iter_rows():
        primary_topic = row[PRIMARY_TOPIC_IDX]
        referenced_primary_topic = row[REFERENCED_TOPIC_IDX]

        # Skip rows where either side of the citation has no topic.
        if not referenced_primary_topic or not primary_topic:
            continue

        topic_rows = buffers.setdefault(referenced_primary_topic, [])
        topic_rows.append(list(row))

        if len(topic_rows) >= BUFFER_SIZE:
            flush_buffers(buffers, output_dir=works2citations_by_topic_csv_folder, topic_id=referenced_primary_topic)

# Final flush
buffers = flush_buffers(buffers, output_dir=works2citations_by_topic_csv_folder)


100%|██████████| 4516/4516 [7:22:09<00:00,  5.87s/it]   


{'T10809': [],
 'T11058': [],
 'T12393': [],
 'T13129': [],
 'T10164': [],
 'T11454': [],
 'T10088': [],
 'T10415': [],
 'T10006': [],
 'T13033': [],
 'T12405': [],
 'T13283': [],
 'T10154': [],
 'T11536': [],
 'T11641': [],
 'T10880': [],
 'T10068': [],
 'T10467': [],
 'T11400': [],
 'T11994': [],
 'T13560': [],
 'T14064': [],
 'T10826': [],
 'T11539': [],
 'T10008': [],
 'T11550': [],
 'T11239': [],
 'T11959': [],
 'T10722': [],
 'T10845': [],
 'T11912': [],
 'T11161': [],
 'T10213': [],
 'T10726': [],
 'T12787': [],
 'T12415': [],
 'T11722': [],
 'T14309': [],
 'T10438': [],
 'T12922': [],
 'T11631': [],
 'T13353': [],
 'T11514': [],
 'T13812': [],
 'T11939': [],
 'T12028': [],
 'T11079': [],
 'T10115': [],
 'T10475': [],
 'T13748': [],
 'T10646': [],
 'T10003': [],
 'T10058': [],
 'T11843': [],
 'T11779': [],
 'T11466': [],
 'T12997': [],
 'T10970': [],
 'T12671': [],
 'T11806': [],
 'T14290': [],
 'T12378': [],
 'T14260': [],
 'T13419': [],
 'T10395': [],
 'T11715': [],
 'T11526':