In [1]:
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
from read_parquet import *

Using PyArrow strings!


In [3]:
works_by_topic_parquet_folder = "data/works_by_topic_parquet/"
author2work_by_topic_csv_folder = "data/author2work_by_topic_csv/"
os.makedirs(author2work_by_topic_csv_folder, exist_ok=True)

In [4]:
topics = [topic[:-8] for topic in os.listdir(works_by_topic_parquet_folder)]

In [5]:
len(topics)

4516

In [8]:
def generate_author2work_df(works_path, out_path, batch_size=1000, do_print=True):
    # Build author2work by processing works in batches to limit memory.
    # Append to CSV (header only on first write).
    df = read_parquet(works_path, columns=['id', 'date', 'authors'], quiet=True)
    total = len(df)

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    first_write = True

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch = df.iloc[start:end].copy()

        batch = batch[batch['authors'].apply(lambda x: len(x) > 0)]
        batch['authors'] = batch['authors'].apply(lambda x: x.split(';'))
        batch = batch.explode('authors')
    
        def parse_author(x):
            parts = x.split('_')
            # primary_affiliation = ''
            affs = ''
            author_id = ''
            is_corr = 'F'
            if len(parts) > 2:
                author_id = "_".join(parts[:-2]) if len(parts) > 2 else ''
                affs = parts[-2]#.split('|') if parts[1] else []
                # if len(affs) > 0:
                #     primary_affiliation = affs[0]
                is_corr = parts[-1]
            return author_id, affs, is_corr

        parsed = batch['authors'].apply(parse_author)
        batch['author_id'] = parsed.apply(lambda x: x[0])
        # batch['primary_affiliation'] = parsed.apply(lambda x: x[1])
        batch['affiliations'] = parsed.apply(lambda x: x[1])
        batch['is_corresponding'] = parsed.apply(lambda x: x[2])

        batch = batch.rename(columns={'id': 'work_id'})
        batch = batch[['author_id', 'work_id', 'date', 'affiliations', 'is_corresponding']]

        batch.to_csv(out_path, mode='a', header=first_write, index=False)
        first_write = False

        if do_print:
            print(f'Processed rows {start}-{end} / {total}')

In [17]:
if os.listdir(author2work_by_topic_csv_folder):  # listdir returns [] if empty
    existing = os.listdir(author2work_by_topic_csv_folder)
    raise RuntimeError(
        f"Destination folder '{author2work_by_topic_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

for topic in tqdm(topics):
    origin_works_parquet_file_path = works_by_topic_parquet_folder+topic+".parquet"
    destination_topic_csv_file_path = author2work_by_topic_csv_folder+topic+".csv"
    generate_author2work_df(origin_works_parquet_file_path, destination_topic_csv_file_path, batch_size = 10000, do_print = False)

  0%|          | 0/4516 [00:00<?, ?it/s]

In [14]:
def convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator=',', compression='brotli', do_peek = True, do_print = True):
    os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)
    df = pl.read_csv(csv_file_path, separator=separator, infer_schema_length=1000)
    if sort_by is not None:
        # sort first by first column name in title, than second if equal, etc.
        df = df.sort(sort_by, descending = True)
    df.write_parquet(parquet_file_path, compression=compression)
    if do_print:
        print(f"Successfully converted {csv_file_path} to {parquet_file_path}.")
    if do_peek:
        if do_print:
            print("Here's a peek.")
        peek_parquet(parquet_file_path)

In [15]:
author2work_by_topic_parquet_folder = "data/author2work_by_topic_parquet/"


In [18]:
for topic in tqdm(topics):
    csv_file_path = author2work_by_topic_csv_folder+topic+".csv"
    parquet_file_path = author2work_by_topic_parquet_folder+topic+".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator = ',', do_peek = False, do_print = False)
peek_parquet(parquet_file_path)

  0%|          | 0/4516 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [19]:
peek_parquet(parquet_file_path)

Name: 'T10091'
Path: 'data/author2work_by_topic_parquet/T10091.parquet'
Files: 1
Rows: 899,151
Schema:
    author_id: large_string
    work_id: large_string
    date: large_string
    affiliations: large_string
    is_corresponding: large_string
5 random rows:


Unnamed: 0,author_id,work_id,date,affiliations,is_corresponding
0,"Shi, Lining",W7083686573,2030-01-01,,T
1,"Pinthong, Nattapon",W6963238420,2028-01-01,,T
2,Klaus Reuter,W6929729901,2028-01-01,I4210132734,T
3,Cyril Hanus,W6929729901,2028-01-01,I154526488|I4210130152,F
4,Mateusz Sikora,W6929729901,2028-01-01,I126596746,F
