In [2]:
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
from read_parquet import *

Using PyArrow strings!


In [4]:
works_by_topic_parquet_folder = "data/works_by_topic_parquet/"
works2references_by_topic_parquet_folder = "data/works2references_by_topic_parquet/"
works2citations_by_topic_csv_folder = "data/works2citations_by_topic_csv/"
os.makedirs(works2citations_by_topic_csv_folder, exist_ok=True)

In [5]:
def flush_buffers(buffers, output_dir, topic_id=None):
    if topic_id is not None:
        buffers_to_flush = {topic_id: buffers[topic_id]}
    else:
        buffers_to_flush = buffers

    for topic_id, rows in buffers_to_flush.items():
        if not rows:
            continue
        df = pl.DataFrame(rows, orient="row", schema=columns)
        out_path = os.path.join(output_dir, f"{topic_id}.csv")
        file_exists = os.path.isfile(out_path)
        csv_str = df.write_csv(separator=DELIMITER, include_header=not file_exists)
        with open(out_path, 'a', encoding='utf-8') as f:
            f.write(csv_str)
        buffers[topic_id] = []

    return buffers

In [7]:
num_topics = len(os.listdir(works_by_topic_parquet_folder))
num_topics

4516

In [9]:
topics = [topic[:-8] for topic in os.listdir(works2references_by_topic_parquet_folder)]

print("currently we have", len(topics), "topics processed for references")

currently we have 109 topics processed for references


In [10]:
BUFFER_SIZE = 5000
DELIMITER = ','

# OUTPUT SCHEMA
columns = [
    "work_id", "primary_topic", "publication_date",
    "referenced_work_id", "referenced_primary_topic", "referenced_publication_date"
]

# before doing any damage, check if there are files in the destination folder and stop if there are, 
# because if there are already csv, it will append and might cause many duplicates
if os.listdir(works2citations_by_topic_csv_folder):  # listdir returns [] if empty
    existing = os.listdir(works2citations_by_topic_csv_folder)
    raise RuntimeError(
        f"Destination folder '{works2citations_by_topic_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

# Buffers: cited_topic â†’ list of dicts
buffers = {}

processed_topics = []
# Process each topic-level reference file
while len(processed_topics) < num_topics: # this while is useful if references are being processed at the same time as citations
    for topic in tqdm(topics):
        if topic not in processed_topics:
            file = works2references_by_topic_parquet_folder+topic+".parquet"
            df = pl.read_parquet(file)
        
            for row in df.iter_rows(named=True):
                work_id = row["work_id"]
                primary_topic = row["primary_topic"]
                publication_date = row["publication_date"]
                referenced_work_id = row["referenced_work_id"]
                referenced_primary_topic = row["referenced_primary_topic"]
                referenced_publication_date = row["referenced_publication_date"]
        
                # Skip if reference target is unknown
                if not referenced_primary_topic or not primary_topic:
                    continue
        
                rec = [work_id,primary_topic,publication_date,referenced_work_id,referenced_primary_topic,referenced_publication_date]
        
                buffers.setdefault(referenced_primary_topic, []).append(rec)
        
                if len(buffers[referenced_primary_topic]) >= BUFFER_SIZE:
                    flush_buffers(buffers, output_dir = works2citations_by_topic_csv_folder, topic_id=referenced_primary_topic)
            processed_topics.append(topic)
    topics = [topic[:-8] for topic in os.listdir(works2references_by_topic_parquet_folder)]
    print("currently we have", len(topics), "topics processed for references")
# Final flush
buffers = flush_buffers(buffers, output_dir = works2citations_by_topic_csv_folder)


  0%|          | 0/109 [00:00<?, ?it/s]

currently we have 357 topics processed for references


  0%|          | 0/357 [00:00<?, ?it/s]

currently we have 1004 topics processed for references


  0%|          | 0/1004 [00:00<?, ?it/s]

currently we have 2535 topics processed for references


  0%|          | 0/2535 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



currently we have 4516 topics processed for references
