In [1]:
import os
# Every "data/..." and "utils" path below is resolved relative to the working
# directory set here. Re-running this cell moves up one more level each time,
# which breaks those paths; restart the kernel if that happens.
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
# Registers tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys

# Make the repository's utils/ folder importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
utils_dir = os.path.join(os.getcwd(), "utils")
if utils_dir not in sys.path:
    sys.path.insert(0, utils_dir)

# Import only the helpers this notebook calls instead of `import *`, so it is
# obvious where each name comes from and nothing else gets shadowed.
from read_parquet import peek_parquet, read_parquet

Using PyArrow strings!


In [3]:
# Input folder (read per topic) and output folder (written per topic).
# Both keep their trailing "/" because the per-topic file paths are built
# further down by plain string concatenation: folder + topic + ".parquet".
works_by_topic_parquet_folder = "data/works_by_topic_parquet/"
works2references_by_topic_parquet_folder = "data/works2references_by_topic_parquet/"
# Create the output folder up front; exist_ok=True makes re-running harmless.
os.makedirs(works2references_by_topic_parquet_folder, exist_ok=True)

In [4]:
# Derive the topic names from the "<topic>.parquet" files in the input folder.
# Entries without that extension (hidden files, checkpoint folders, ...) are
# skipped: blindly slicing off the last 8 characters would turn them into
# bogus topic names whose input file does not exist.
# Sorting makes the processing order reproducible, since os.listdir returns
# entries in arbitrary order.
parquet_suffix = ".parquet"
topics = sorted(
    entry[: -len(parquet_suffix)]
    for entry in os.listdir(works_by_topic_parquet_folder)
    if entry.endswith(parquet_suffix)
)

In [5]:
# Sanity check: number of topic names derived from the input folder listing.
len(topics)

4516

In [16]:
# Load the work -> primary-topic lookup that is joined onto every reference.
# Per the recorded output this is ~211M rows / ~10 GB in memory, so it is read
# once here and handed to the per-topic function instead of being reloaded.
all_works2primary_topic_df = read_parquet("data/all_works2primary_topic_parquet/all_works2primary_topic.parquet")


Reading 'all_works2primary_topic' from 'data/all_works2primary_topic_parquet/all_works2primary_topic.parquet' using engine='pyarrow'
Read 210,864,615 rows from 'all_works2primary_topic' in 10.08 sec.
Converting dtypes took 0.04 sec. Size before: 10.00GB, after: 10.00GB


Unnamed: 0,id,date,primary_topic
0,W4410793892,2025-05-28,T14064
1,W4410791727,2025-05-27,T14064
2,W4410789294,2025-05-27,T14064


In [17]:
# Prefix the lookup's columns with "referenced_" so that, once joined onto a
# topic's works, they cannot be confused with the citing work's own columns.
# The rename is done in place on purpose: the frame is large and rebinding a
# renamed copy could temporarily duplicate it in memory.
referenced_column_names = {
    "id": "referenced_work_id",
    "date": "referenced_publication_date",
    "primary_topic": "referenced_primary_topic",
}
all_works2primary_topic_df.rename(columns=referenced_column_names, inplace=True)

In [30]:
def generate_reference_expansion(origin_works_parquet_file_path, all_works2primary_topic_df):
    """Build one row per (work, referenced work) pair for a single works file.

    Parameters
    ----------
    origin_works_parquet_file_path : str
        Parquet file providing the columns "id", "date", "references" and
        "primary_topic". "references" is split on ";" into referenced ids.
    all_works2primary_topic_df : pandas.DataFrame
        Lookup of referenced works. It must already expose the columns
        "referenced_work_id", "referenced_publication_date" and
        "referenced_primary_topic".

    Returns
    -------
    pandas.DataFrame
        Columns "work_id", "publication_date", "primary_topic",
        "referenced_work_id", "referenced_publication_date" and
        "referenced_primary_topic". References that are absent from the
        lookup are dropped, because the join is an inner join.
    """
    works_df = read_parquet(origin_works_parquet_file_path, columns = ["id", "date", "references", "primary_topic"], quiet = True)

    # Step 1: keep only works that have references, then explode them into rows.
    # A missing value is treated like an empty string, so null entries are
    # filtered out instead of raising on len(None). The comparison is
    # vectorised, which also yields a proper boolean mask for an empty frame.
    has_references = works_df["references"].fillna("").ne("")
    exploded_df = (
        works_df[has_references]
        # .assign returns a new frame, avoiding a write into a filtered slice
        # (the source of SettingWithCopyWarning).
        .assign(references=lambda frame: frame["references"].apply(lambda refs: refs.split(";")))
        .explode("references")
        .rename(columns={"id": "work_id", "references": "referenced_work_id"})
    )

    # Step 2: join with the metadata of the referenced works.
    joined_df = exploded_df.merge(all_works2primary_topic_df, on="referenced_work_id", how="inner")

    # Step 3: select and reorder the final columns.
    final_columns = [
        "work_id", "date", "primary_topic",
        "referenced_work_id", "referenced_publication_date", "referenced_primary_topic"
    ]
    return joined_df[final_columns].rename(columns={"date": "publication_date"})
    
def create_works2references_df(origin_works_parquet_file_path, destination_topic_parquet_file_path, all_works2primary_topic_df, compression='brotli', do_peek = True, do_print = True):
    """Expand one works file into work -> reference rows and save it as parquet.

    The expansion is produced by generate_reference_expansion and written to
    destination_topic_parquet_file_path with the given compression codec.
    With do_print a confirmation message is printed; with do_peek the written
    file is passed to peek_parquet (announced first when do_print is also set).
    Returns None.
    """
    expanded_df = generate_reference_expansion(origin_works_parquet_file_path, all_works2primary_topic_df)
    expanded_df.to_parquet(destination_topic_parquet_file_path, compression=compression)

    if do_print:
        print(f"Successfully generated {destination_topic_parquet_file_path} using {origin_works_parquet_file_path}.")

    # Nothing more to do unless a peek at the result was requested.
    if not do_peek:
        return

    if do_print:
        print("Here's a peek.")
    peek_parquet(destination_topic_parquet_file_path)

In [None]:
# Build the references table for every topic, one parquet file in -> one out.
# Printing and peeking are switched off so that only the progress bar shows.
for topic in tqdm(topics):
    origin_works_parquet_file_path = f"{works_by_topic_parquet_folder}{topic}.parquet"
    destination_topic_parquet_file_path = f"{works2references_by_topic_parquet_folder}{topic}.parquet"
    create_works2references_df(
        origin_works_parquet_file_path=origin_works_parquet_file_path,
        destination_topic_parquet_file_path=destination_topic_parquet_file_path,
        all_works2primary_topic_df=all_works2primary_topic_df,
        do_peek=False,
        do_print=False,
    )

In [None]:
# Peek at the most recently written output file.
# destination_topic_parquet_file_path is the variable left over from the last
# iteration of the per-topic loop, so this cell only works after that loop ran.
peek_parquet(destination_topic_parquet_file_path)