In [1]:
# Imports for the whole notebook, plus one-time working-directory setup.
import os
# NOTE: the relative chdir below is not idempotent — re-running this cell
# moves the working directory up one more level each time.
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
# Enables tqdm's progress-bar variants of pandas apply (e.g. `progress_apply`).
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
# Add utils directory in the list of directories to look for packages to import.
# The path is built from the current working directory, so this relies on the
# chdir performed in the first cell having been run exactly once.
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
# NOTE(review): star import — presumably the source of `read_parquet` and
# `peek_parquet` used in later cells; confirm, and consider importing the
# needed names explicitly so their origin is visible.
from read_parquet import *

Using PyArrow strings!


In [3]:
# Input folder: one works parquet per topic (read by the cells below).
works_by_topic_parquet_folder = "data/works_by_topic_parquet/"
# Output folder: one author-to-work parquet per topic (written by the conversion loop).
author2work_by_topic_parquet_folder = "data/author2work_by_topic_parquet/"
# Create the output folder up front; exist_ok makes this cell safe to re-run.
os.makedirs(author2work_by_topic_parquet_folder, exist_ok=True)

In [4]:
# Topic ids are the entry names of the input folder with the ".parquet" suffix
# removed. Entries without that suffix (e.g. ".DS_Store", ".ipynb_checkpoints")
# are skipped instead of being truncated into bogus topic ids.
topics = [
    entry_name[: -len(".parquet")]
    for entry_name in os.listdir(works_by_topic_parquet_folder)
    if entry_name.endswith(".parquet")
]

In [5]:
# Sanity check: number of topics found in the input folder.
len(topics)

4516

In [6]:
# Inspect the schema and a few rows of the first listed works file.
peek_parquet(f"{works_by_topic_parquet_folder}{topics[0]}.parquet")

Name: 'T14064'
Path: 'data/works_by_topic_parquet/T14064.parquet'
Files: 1
Rows: 22,138
Schema:
    id: large_string
    date: large_string
    type: large_string
    language: large_string
    journal: large_string
    doi: large_string
    authors: large_string
    topics: large_string
    references: large_string
    sdg: large_string
    keywords: large_string
    grants: large_string
    primary_topic: large_string
5 random rows:


Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,grants,primary_topic
0,W4410793892,2025-05-28,article,en,S4387290976,10.60036/jbm.607,A5099299656__F;A5066899211__F;A5083022923__F,T14064_0.8632;T12028_0.769;T14356_0.7352,,,,,T14064
1,W4410791727,2025-05-27,article,en,S4210237825,10.1108/ijoes-08-2024-0267,A5117718680__F;A5077408226__F,T14064_0.9906;T14356_0.9839;T14436_0.9804,W1979538122;W2010295782;W2014049235;W202817926...,,Ethical Leadership_0.83126736,,T14064
2,W4410789294,2025-05-27,article,en,S4210190284,10.1002/cnma.202400645,A5100328291_I1308199089_F;A5068787248_I4210091...,T14064_0.8519;T10210_0.7928;T14249_0.7786,,,Strontium titanate_0.7952415;Degradation_0.698...,,T14064
3,W4410786447,2025-05-27,article,en,S4387292547,10.52783/jier.v5i2.2824,A5117716867__T,T14064_0.9122,,,,,T14064
4,W4410778805,2025-05-27,article,en,S93195727,10.1504/ijmed.2026.10071505,A5112681361__F;A5109320604__F,T14064_0.9697,,8_0.68,,,T14064


In [7]:
def flattenList(l):
    """Flatten one level of nesting: return a single list with the items of every sub-iterable in `l`."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def _parse_author_entry(entry):
    """Parse one encoded author token into (author_id, primary_affiliation, is_corresponding).

    Tokens are underscore-separated, e.g. ``A5100328291_I1308199089_F`` or
    ``A5117716867__T``: author id, institution block (institutions separated
    by ``|``; only the first is kept), and a corresponding-author flag
    (``"T"`` means True). Non-string or underscore-free tokens yield
    ``("", "", False)``; a missing flag field is treated as False.
    """
    if not isinstance(entry, str) or "_" not in entry:
        return "", "", False
    parts = entry.split("_")
    primary_affiliation = parts[1].split("|")[0]
    # Guard against tokens with a single underscore (no flag field).
    is_corresponding = len(parts) > 2 and parts[2] == "T"
    return parts[0], primary_affiliation, is_corresponding


def generate_author2work_df(parquet_file_path):
    """Build a one-row-per-(author, work) DataFrame from a works parquet file.

    Parameters
    ----------
    parquet_file_path : str
        Path passed to `read_parquet`; only the "id", "date" and "authors"
        columns are requested.

    Returns
    -------
    pandas.DataFrame
        Columns "author_id", "work_id", "date", "primary_affiliation",
        "is_corresponding", with a fresh 0..n-1 index. A work whose "authors"
        value is missing or empty contributes a single row with an empty
        author_id.
    """
    works_df = read_parquet(parquet_file_path, columns = ["id", "date", "authors"], quiet = True)

    # Step 1: Split 'authors' into lists. Missing values become [""] here,
    # before any string method is called on them.
    works_df["authors_list"] = works_df["authors"].apply(
        lambda x: x.split(";") if isinstance(x, str) else [""]
    )

    # Step 2: One row per author token. explode() repeats the source row label
    # for every token, so rebuild a unique range index.
    exploded_df = works_df.explode("authors_list").reset_index(drop=True)

    # Step 3: Parse fields (safe for empty or malformed tokens)
    tokens = exploded_df["authors_list"]
    exploded_df["author_id"] = tokens.apply(lambda t: _parse_author_entry(t)[0])
    exploded_df["primary_affiliation"] = tokens.apply(lambda t: _parse_author_entry(t)[1])
    exploded_df["is_corresponding"] = tokens.apply(lambda t: _parse_author_entry(t)[2])

    # Step 4: Select final columns
    author_df = exploded_df[["author_id", "id", "date", "primary_affiliation","is_corresponding"]].rename(columns={"id": "work_id"})

    return author_df

def create_author2work_df(origin_works_parquet_file_path, destination_topic_parquet_file_path, compression='brotli', do_peek = True, do_print = True):
    """Convert one works parquet file into an author-to-work parquet file.

    The frame returned by `generate_author2work_df` for the origin path is
    written to the destination path with the given parquet `compression`.
    With `do_print` a confirmation line is printed; with `do_peek` the written
    file is passed to `peek_parquet` (preceded by a short announcement when
    `do_print` is also set). Returns None.
    """
    generate_author2work_df(origin_works_parquet_file_path).to_parquet(
        destination_topic_parquet_file_path,
        compression=compression,
    )

    if do_print:
        print(f"Successfully generated {destination_topic_parquet_file_path} using {origin_works_parquet_file_path}.")

    # Nothing more to do unless a preview was requested.
    if not do_peek:
        return

    if do_print:
        print("Here's a peek.")
    peek_parquet(destination_topic_parquet_file_path)

In [8]:
import pandas as pd

# NOTE(review): this redefines the `flattenList` helper already defined in
# cell In [7] with the same behavior; the later definition shadows the earlier one.
def flattenList(l):
    """Return a flat list containing, in order, every item of every sub-iterable of `l`."""
    flattened = []
    for inner in l:
        for element in inner:
            flattened.append(element)
    return flattened

def slow_generate_author2work_df(parquet_file_path): # slower but lower memory consumption
    """Row-by-row variant that builds the author-to-work table from a works parquet file.

    Parameters
    ----------
    parquet_file_path : str
        Path passed to `read_parquet`; only the "id", "date" and "authors"
        columns are requested.

    Returns
    -------
    pandas.DataFrame
        Columns "author_id", "work_id", "date", "primary_affiliation",
        "is_corresponding", one row per author token. Works whose "authors"
        value is not a string are skipped. Tokens without an underscore
        (empty or malformed) produce a row with empty author fields. The
        columns are present even when no rows are produced.
    """
    works_df = read_parquet(parquet_file_path, columns=["id", "date", "authors"], quiet=True)

    output_columns = ["author_id", "work_id", "date", "primary_affiliation", "is_corresponding"]
    records = []

    # Iterate the three columns in lockstep; this avoids building a Series per
    # row as iterrows() does.
    for work_id, date, authors_str in zip(works_df["id"], works_df["date"], works_df["authors"]):
        if not isinstance(authors_str, str):
            continue

        for a in authors_str.split(";"):
            if "_" not in a:
                # empty or malformed
                records.append({
                    "author_id": "",
                    "work_id": work_id,
                    "date": date,
                    "primary_affiliation": "",
                    "is_corresponding": False
                })
                continue

            # An underscore is present, so there are at least two parts.
            parts = a.split("_")
            author_id = parts[0]
            inst_block = parts[1]
            corr_flag = parts[2] if len(parts) > 2 else "F"

            # Handle multiple institutions and get first
            primary_affiliation = inst_block.split("|")[0] if inst_block else ""

            records.append({
                "author_id": author_id,
                "work_id": work_id,
                "date": date,
                "primary_affiliation": primary_affiliation,
                "is_corresponding": corr_flag == "T"
            })

    # Passing the column list keeps the schema stable when `records` is empty.
    author_df = pd.DataFrame(records, columns=output_columns)
    return author_df


In [10]:
# Convert every topic's works file into its author-to-work counterpart,
# silently (no per-file message, no preview), with a progress bar.
for topic in tqdm(topics):
    origin_works_parquet_file_path = f"{works_by_topic_parquet_folder}{topic}.parquet"
    destination_topic_parquet_file_path = f"{author2work_by_topic_parquet_folder}{topic}.parquet"
    create_author2work_df(
        origin_works_parquet_file_path,
        destination_topic_parquet_file_path,
        do_peek = False,
        do_print = False,
    )

In [11]:
# Preview the output written for the last topic. The path is rebuilt from
# `topics` rather than read from a variable left over by the conversion loop,
# so this cell does not depend on that loop's leftover state.
peek_parquet(author2work_by_topic_parquet_folder + topics[-1] + ".parquet")

Name: 'T10048'
Path: 'data/author2work_by_topic_parquet/T10048.parquet'
Files: 1
Rows: 12,494,250
Schema:
    author_id: large_string
    work_id: large_string
    date: large_string
    primary_affiliation: large_string
    is_corresponding: bool
    -- schema metadata --
    pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 867
5 random rows:


Unnamed: 0,author_id,work_id,date,primary_affiliation,is_corresponding
0,A5106618628,W4410780194,2025-05-27,,False
1,A5090085448,W4410780194,2025-05-27,,False
2,A5092418453,W4410780194,2025-05-27,,False
3,A5100637283,W4410780194,2025-05-27,,False
4,A5062215212,W4410780194,2025-05-27,,False
