In [1]:
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
tqdm.pandas()

In [7]:
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
from read_parquet import *

Using PyArrow strings!


In [2]:
works_by_topic_parquet_folder = "data/works_by_topic_parquet/"
works2references_by_topic_parquet_folder = "data/works2references_by_topic_parquet/"
os.makedirs(works2references_by_topic_parquet_folder, exist_ok=True)

In [3]:
topics = [topic[:-8] for topic in os.listdir(works_by_topic_parquet_folder)]

In [4]:
len(topics)

4516

In [8]:
all_works2primary_topic_df = read_parquet("data/all_works2primary_topic_parquet/all_works2primary_topic.parquet")


Reading 'all_works2primary_topic' from 'data/all_works2primary_topic_parquet/all_works2primary_topic.parquet' using engine='pyarrow'
Read 306,133,392 rows from 'all_works2primary_topic' in 33.00 sec.
Converting dtypes took 0.04 sec. Size before: 14.52GB, after: 14.52GB


Unnamed: 0,id,date,primary_topic
0,W7126435799,2026-02-01,T14064
1,W7126372280,2026-02-01,T14064
2,W7126447841,2026-01-31,T14064


In [9]:
all_works2primary_topic_df.rename(
    columns={"id": "referenced_work_id", "date": "referenced_publication_date", "primary_topic": "referenced_primary_topic"},
    inplace = True
)

In [16]:
def generate_reference_expansion(origin_works_parquet_file_path, all_works2primary_topic_df):
    works_df = read_parquet(origin_works_parquet_file_path, columns = ["id", "date", "references", "primary_topic"], quiet = True)
    
    # Step 1: Explode references are exploded into rows
    works_df = works_df[works_df["references"].apply(lambda x : len(x) > 0)]
    works_df["references"] = works_df["references"].apply(lambda x : x.split(";"))
    works_df = works_df.explode("references").rename(columns={"id": "work_id", "references": "referenced_work_id"})

    # Step 2: Join with metadata of the referenced works
    works_df = works_df.merge(all_works2primary_topic_df, on="referenced_work_id", how="inner")

    # Step 3: Select and reorder the final columns
    works_df = works_df[[
        "work_id", "date", "primary_topic",
        "referenced_work_id", "referenced_publication_date", "referenced_primary_topic"
    ]].rename(columns={"date": "publication_date"})

    return works_df
    
def create_works2references_df(origin_works_parquet_file_path, destination_topic_parquet_file_path, all_works2primary_topic_df, compression='brotli', do_peek = True, do_print = True):
    works2references_df = generate_reference_expansion(origin_works_parquet_file_path, all_works2primary_topic_df)
    works2references_df.to_parquet(destination_topic_parquet_file_path, compression=compression)
    if do_print:
        print(f"Successfully generated {destination_topic_parquet_file_path} using {origin_works_parquet_file_path}.")
    if do_peek:
        if do_print:
            print("Here's a peek.")
        peek_parquet(destination_topic_parquet_file_path)

In [15]:
for topic in tqdm(topics):
    origin_works_parquet_file_path = works_by_topic_parquet_folder+topic+".parquet"
    destination_topic_parquet_file_path = works2references_by_topic_parquet_folder+topic+".parquet"
    create_works2references_df(origin_works_parquet_file_path, destination_topic_parquet_file_path, all_works2primary_topic_df, do_peek = False, do_print = False)

  0%|          | 0/4516 [00:00<?, ?it/s]

In [12]:
peek_parquet(destination_topic_parquet_file_path)

Name: 'T10091'
Path: 'data/works2references_by_topic_parquet/T10091.parquet'
Files: 1
Rows: 2,415,324
Schema:
    work_id: large_string
    publication_date: large_string
    primary_topic: large_string
    referenced_work_id: large_string
    referenced_publication_date: large_string
    referenced_primary_topic: large_string
    -- schema metadata --
    pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1069
5 random rows:


Unnamed: 0,work_id,publication_date,primary_topic,referenced_work_id,referenced_publication_date,referenced_primary_topic
0,W7126386361,2026-02-01,T10091,W2043990618,2007-03-20,T10206
1,W7126386361,2026-02-01,T10091,W2161374186,2015-01-01,T10206
2,W7126386361,2026-02-01,T10091,W2199236045,2016-01-04,T10091
3,W7126386361,2026-02-01,T10091,W2299228959,2016-03-17,T10166
4,W7126386361,2026-02-01,T10091,W2507959777,2016-09-02,T10091
