# Library

In [1]:
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
from read_parquet import *

Using PyArrow strings!


## Functions to convert

In [3]:
def convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator=',', compression='brotli', do_peek = True, do_print = True):
    os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)
    df = pl.read_csv(csv_file_path, separator=separator, infer_schema_length=1000)
    if sort_by is not None:
        # sort first by first column name in title, than second if equal, etc.
        df = df.sort(sort_by, descending = True)
    df.write_parquet(parquet_file_path, compression=compression)
    if do_print:
        print(f"Successfully converted {csv_file_path} to {parquet_file_path}.")
    if do_peek:
        if do_print:
            print("Here's a peek.")
        peek_parquet(parquet_file_path)

# Conversion

## Authors

In [4]:
csv_file_path = "data/authors/authors.csv"
parquet_file_path = "data/authors/authors.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["h_index", "cited_by_count", "works_count"], separator = ',')

Successfully converted data/authors/authors.csv to data/authors/authors.parquet.
Here's a peek.
Name: 'authors'
Path: 'data/authors/authors.parquet'
Files: 1
Rows: 103,480,180
Schema:
    id: large_string
    display_name: large_string
    orcid: large_string
    works_count: int64
    h_index: int64
    cited_by_count: int64
    topics: large_string
    affiliations: large_string
    works_count_by_year: large_string
    cited_by_count_by_year: large_string
5 random rows:


Unnamed: 0,id,display_name,orcid,works_count,h_index,cited_by_count,topics,affiliations,works_count_by_year,cited_by_count_by_year
0,A5083991449,Walter C. Willett,0000-0003-1458-7597,2729,308,385164,T10866_1187;T10010_435;T12267_392;T12103_358;T...,I1283280774_US_2025_2024_2023_2022_2021_2020_2...,23_2025;83_2024;160_2023;67_2022;119_2021;89_2...,18803_2025;55033_2024;60817_2023;62246_2022;64...
1,A5100376569,Zhong Lin Wang,0000-0002-5530-0380,4052,304,428474,T10338_2014;T10660_1239;T11230_470;T10914_419;...,I130701444_US_2025_2024_2023_2022_2021_2020_20...,128_2025;287_2024;315_2023;288_2022;262_2021;2...,49817_2025;139444_2024;145407_2023;131862_2022...
2,A5057810294,Ronald C. Kessler,0000-0003-4831-2305,1661,276,337223,T10272_462;T10182_351;T10376_235;T10242_231;T1...,I2801851002_US_2025_2024_2023_2022_2021_2020_2...,21_2025;63_2024;77_2023;69_2022;96_2021;74_202...,8560_2025;25002_2024;27708_2023;27828_2022;306...
3,A5109160404,Michaël Grätzel,,1906,273,380943,T10024_765;T10078_673;T10247_522;T10321_391;T1...,I5124864_CH_2025_2024_2023_2022_2021_2020_2019...,6_2025;26_2024;55_2023;48_2022;64_2021;45_2020...,7561_2025;26600_2024;29672_2023;33115_2022;400...
4,A5075296559,Guido Kroemer,0000-0002-9334-4405,2263,272,337227,T10587_390;T10294_338;T10580_319;T10158_311;T1...,I204730241_FR_2025_2024_2023_2022_2021_2020_20...,31_2025;71_2024;393_2023;95_2022;102_2021;111_...,71897_2025;199824_2024;199503_2023;180567_2022...


## Institutions

In [31]:
csv_file_path = "data/institutions/institutions.csv"
parquet_file_path = "data/institutions/institutions.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = "works_count", separator = ';')

Successfully converted data/institutions/institutions.csv to data/institutions/institutions.parquet. Here's a peek.
Name: 'institutions'
Path: 'data/institutions/institutions.parquet'
Files: 1
Rows: 114,883
Schema:
    id: large_string
    display_name: large_string
    type: large_string
    country_code: large_string
    country: large_string
    city: large_string
    coord: large_string
    works_count: int64
5 random rows:


Unnamed: 0,id,display_name,type,country_code,country,city,coord,works_count
0,I1294671590,Centre National de la Recherche Scientifique,government,FR,France,Paris,48.85341_2.3488,1099006
1,I27837315,University of Michigan,funder,US,United States,Ann Arbor,42.27756_-83.74088,931869
2,I19820366,Chinese Academy of Sciences,government,CN,China,Beijing,39.9075_116.39723,853262
3,I136199984,Harvard University,funder,US,United States,Cambridge,42.3751_-71.10561,678421
4,I185261750,University of Toronto,funder,CA,Canada,Toronto,43.70643_-79.39864,496751


## Funders

In [32]:
csv_file_path = "data/funders/funders.csv"
parquet_file_path = "data/funders/funders.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ['grants_count','works_count'], separator = ';')

Successfully converted data/funders/funders.csv to data/funders/funders.parquet. Here's a peek.
Name: 'funders'
Path: 'data/funders/funders.parquet'
Files: 1
Rows: 32,437
Schema:
    id: large_string
    display_name: large_string
    country_code: large_string
    grants_count: int64
    works_count: int64
5 random rows:


Unnamed: 0,id,display_name,country_code,grants_count,works_count
0,F4320321001,National Natural Science Foundation of China,CN,717144,2131044
1,F4320332161,National Institutes of Health,US,272709,385181
2,F4320306076,National Science Foundation,US,208672,393437
3,F4320334764,Japan Society for the Promotion of Science,JP,158347,217751
4,F4320320879,Deutsche Forschungsgemeinschaft,DE,124074,241608


## Topics

In [33]:
csv_file_path = "data/topics/topics.csv"
parquet_file_path = "data/topics/topics.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["works_count", "cited_by_count", ], separator = ';')

Successfully converted data/topics/topics.csv to data/topics/topics.parquet. Here's a peek.
Name: 'topics'
Path: 'data/topics/topics.parquet'
Files: 1
Rows: 4,516
Schema:
    topic_id: large_string
    display_name: large_string
    description: large_string
    keywords: large_string
    wikipedia: large_string
    subfield_id: int64
    subfield_name: large_string
    field_id: int64
    field_name: large_string
    domain_id: int64
    domain_name: large_string
    sibling_ids: large_string
    works_count: int64
    cited_by_count: int64
5 random rows:


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
0,T11881,Crystallization and Solubility Studies,This cluster of papers focuses on the crystall...,Crystallization_Nucleation_Solubility_Polymorp...,https://en.wikipedia.org/wiki/Crystallization,2505,Materials Chemistry,25,Materials Science,3,Physical Sciences,T10275_T13889_T12302_T10440_T10311_T12340_T114...,975044,784318
1,T11475,French Urban and Social Studies,This cluster of papers explores the intersecti...,Territorial Governance_Environmental Participa...,https://en.wikipedia.org/wiki/Territorial_gove...,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences,T10927_T13843_T13162_T12635_T13428_T12432_T112...,652021,699274
2,T13445,American Constitutional Law and Politics,This cluster of papers explores the developmen...,American founding_Constitutional government_Re...,https://en.wikipedia.org/wiki/American_politic...,3320,Political Science and International Relations,33,Social Sciences,2,Social Sciences,T14181_T12986_T13504_T13643_T12956_T13057_T138...,465559,1814934
3,T10165,Classical Antiquity Studies,This cluster of papers explores various aspect...,Ancient Rome_Greek Literature_Economic History...,https://en.wikipedia.org/wiki/Ancient_Mediterr...,3314,Anthropology,33,Social Sciences,2,Social Sciences,T14395_T13019_T10149_T14289_T13939_T13705_T140...,393389,1606573
4,T10362,Biblical Studies and Interpretation,This cluster of papers focuses on biblical stu...,Biblical Monotheism_Jewish Society_Christian I...,https://en.wikipedia.org/wiki/Biblical_studies,1212,Religious studies,12,Arts and Humanities,2,Social Sciences,T12470_T13347_T13072_T13818_T14063_T13356_T143...,340922,1051055


## Works by topic

### Check all topics present

In [34]:
topics_df = read_parquet("data/topics/topics.parquet")


Reading 'topics' from 'data/topics/topics.parquet' using engine='pyarrow'
Read 4,516 rows from 'topics' in 0.03 sec.
Converting dtypes took 0.01 sec. Size before: 0.01GB, after: 0.01GB


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
0,T11881,Crystallization and Solubility Studies,This cluster of papers focuses on the crystall...,Crystallization_Nucleation_Solubility_Polymorp...,https://en.wikipedia.org/wiki/Crystallization,2505,Materials Chemistry,25,Materials Science,3,Physical Sciences,T10275_T13889_T12302_T10440_T10311_T12340_T114...,975044,784318
1,T11475,French Urban and Social Studies,This cluster of papers explores the intersecti...,Territorial Governance_Environmental Participa...,https://en.wikipedia.org/wiki/Territorial_gove...,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences,T10927_T13843_T13162_T12635_T13428_T12432_T112...,652021,699274
2,T13445,American Constitutional Law and Politics,This cluster of papers explores the developmen...,American founding_Constitutional government_Re...,https://en.wikipedia.org/wiki/American_politic...,3320,Political Science and International Relations,33,Social Sciences,2,Social Sciences,T14181_T12986_T13504_T13643_T12956_T13057_T138...,465559,1814934


In [35]:
csv_folder = "data/works_by_topic_csv/"
files = os.listdir(csv_folder)

In [36]:
len(files)

4516

In [37]:
topics_in_folder = [file[:-4] for file in files]

In [38]:
topics_in_df = topics_df.topic_id.to_list()

In [39]:
topics_in_folder_not_in_df = [topic for topic in topics_in_folder if topic not in topics_in_df]
topics_in_df_not_in_folder = [topic for topic in topics_in_df if topic not in topics_in_folder]
len(topics_in_folder_not_in_df), len(topics_in_df_not_in_folder)

(0, 0)

### Transform to parquet

In [10]:
csv_folder = "data/works_by_topic_csv/"
parquet_folder = "data/works_by_topic_parquet/"

In [54]:
topics = [topic[:-4] for topic in os.listdir(csv_folder)]
len(topics)

4516

In [47]:
for topic in tqdm(topics):
    csv_file_path = csv_folder + topic + ".csv"
    parquet_file_path = parquet_folder + topic + ".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["date", "id", ], separator = ',', do_peek = False, do_print = False)

100%|██████████| 4516/4516 [20:28<00:00,  3.68it/s] 


In [60]:
peek_parquet(parquet_file_path)

Name: 'T10091'
Path: 'data/works_by_topic_parquet/T10091.parquet'
Files: 1
Rows: 98,177
Schema:
    id: large_string
    date: large_string
    type: large_string
    language: large_string
    journal: large_string
    doi: large_string
    authors: large_string
    topics: large_string
    references: large_string
    sdg: large_string
    keywords: large_string
    grants: large_string
    primary_topic: large_string
5 random rows:


Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,grants,primary_topic
0,W4410794208,2025-05-28,article,en,S124275759,10.1093/jac/dkaf141,A5037958817_I11701301|I4210118299_F;A501492437...,T10091_0.9985;T12649_0.9983;T12497_0.9928,W1969503178;W1971147414;W1976977157;W198304349...,3_0.85,Mitochondrial respiratory chain_0.42389745,INV-039628_F4320306137;1R01AI152533_F432030613...,T10091
1,W4410795146,2025-05-27,peer-review,en,,10.7554/elife.103047.3.sa3,A5000164074_I4210087127|I19894307_F;A502629686...,T10091_0.9142;T10166_0.9105,,2_0.72,Variation (astronomy)_0.6521116,,T10091
2,W4410780759,2025-05-27,article,en,S4210167893,10.1038/s43856-025-00905-8,A5005536355__F;A5049005726__F;A5007388521__F;A...,T10091_0.9967;T10166_0.9919,W1499342446;W1919347439;W1971112779;W199314404...,3_0.44,,NSTC 113-2636-B-007-006_F4320322795,T10091
3,W4410779100,2025-05-27,article,en,S47215897,10.1128/aac.00162-25,A5067425574_I2613432_F;A5117714511_I2613432_F;...,T10091_0.9999;T10211_0.9932;T10695_0.976,W1969503178;W1993347313;W2020251171;W204071355...,14_0.69,,U19AI181593_F4320332161,T10091
4,W4410772738,2025-05-27,article,en,S1336409049,10.7554/elife.103047.3,A5000164074_I154526488|I19894307|I1294671590|I...,T10091_0.9967;T10166_0.99;T12047_0.9599,W1499342446;W1604550862;W1755904813;W196897028...,,Variation (astronomy)_0.5012531,INDIE OPP1173572_F4320306137;MRC/LSHTM fellows...,T10091


In [11]:
parquet_file_path = parquet_folder + topic + ".parquet"
df = read_parquet(parquet_file_path)


Reading 'T14028' from 'data/works_by_topic_parquet/T14028.parquet' using engine='pyarrow'
Read 74,452 rows from 'T14028' in 0.19 sec.
Converting dtypes took 0.01 sec. Size before: 0.02GB, after: 0.02GB


Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,grants,primary_topic
0,W4410784842,2025-05-27,article,,S4210228897,10.1024/1861-6186/a000873,A5117716422__T,T14028_0.1716,,,,,T14028
1,W4410775830,2025-05-27,preprint,en,S4306402567,10.1101/2025.05.22.655489,A5101674706__F;A5114040975__F;A5043550279__F;A...,T14028_0.9697;T10423_0.9608,,,BETA (programming language)_0.55140626,,T14028
2,W4410775507,2025-05-27,article,en,S157347057,10.21873/anticanres.17621,A5046763596_I138006243_F;A5005875633_I13800624...,T14028_0.9318,,,Myxofibrosarcoma_0.7983122;Neoadjuvant Therapy...,,T14028


In [15]:
print(df.type.unique().to_list())

['article', 'preprint', 'letter', 'review', 'other', 'erratum', 'report', 'peer-review', 'dataset', 'book-chapter', 'dissertation', 'book', 'editorial', 'retraction', 'paratext', 'reference-entry', 'libguides']


  print(df.type.unique().to_list())


## Works2text by topic

In [6]:
csv_folder = "data/works2text_by_topic_csv/"
parquet_folder = "data/works2text_by_topic_parquet/"

In [7]:
topics = [topic[:-4] for topic in os.listdir(csv_folder)]
len(topics)

4516

In [8]:
for topic in tqdm(topics):
    csv_file_path = csv_folder + topic + ".csv"
    parquet_file_path = parquet_folder + topic + ".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["date", "id", ], separator = ';', do_peek = False, do_print = False)

100%|██████████| 4516/4516 [32:05<00:00,  2.34it/s] 


In [9]:
peek_parquet(parquet_file_path)

Name: 'T14028'
Path: 'data/works2text_by_topic_parquet/T14028.parquet'
Files: 1
Rows: 74,452
Schema:
    id: large_string
    date: large_string
    title: large_string
    abstract: large_string
5 random rows:


Unnamed: 0,id,date,title,abstract
0,W4410784842,2025-05-27,Heldin,NONE
1,W4410775830,2025-05-27,Drosophila Metaxin-2 controls beta-barrel biog...,Metaxin-2 (Mtx2) is an evolutionarily conserve...
2,W4410775507,2025-05-27,Pathologic Complete Response (pCR) in Patient ...,Many patients with cancer actively explore com...
3,W4410770537,2025-05-27,Distribution Analysis for Diagnostics and Ther...,NONE
4,W4410766772,2025-05-27,Mechanism of SK2 channel gating and its modula...,Small-conductance calcium-activated potassium ...


## Works2citations by topic

In [11]:
csv_folder = "data/works2citations_by_topic_csv/"
parquet_folder = "data/works2citations_by_topic_parquet/"

In [12]:
topics = [topic[:-4] for topic in os.listdir(csv_folder)]
len(topics)

4516

In [14]:
for topic in tqdm(topics):
    csv_file_path = csv_folder + topic + ".csv"
    parquet_file_path = parquet_folder + topic + ".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator = ',', do_peek = False, do_print = False)

100%|██████████| 4516/4516 [13:26<00:00,  5.60it/s]


In [15]:
peek_parquet(parquet_file_path)

Name: 'T14028'
Path: 'data/works2citations_by_topic_parquet/T14028.parquet'
Files: 1
Rows: 206,795
Schema:
    work_id: large_string
    primary_topic: large_string
    publication_date: large_string
    referenced_work_id: large_string
    referenced_primary_topic: large_string
    referenced_publication_date: large_string
5 random rows:


Unnamed: 0,work_id,primary_topic,publication_date,referenced_work_id,referenced_primary_topic,referenced_publication_date
0,W4403431318,T11177,2024-10-15,W27222542,T14028,1996-01-01
1,W4392854270,T11177,2024-03-15,W1971556202,T14028,1994-10-16
2,W4212810480,T11177,2022-02-17,W1971556202,T14028,1994-10-16
3,W3128463426,T11177,2021-02-12,W2048172946,T14028,1997-04-01
4,W2951012447,T11177,2019-01-01,W1501032637,T14028,1982-06-01


# Peek other files

In [6]:
peek_parquet("data/all_works2primary_topic_parquet/all_works2primary_topic.parquet")

Name: 'all_works2primary_topic'
Path: 'data/all_works2primary_topic_parquet/all_works2primary_topic.parquet'
Files: 1
Rows: 210,864,615
Schema:
    id: large_string
    date: large_string
    primary_topic: large_string
    -- schema metadata --
    pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 602
5 random rows:


Unnamed: 0,id,date,primary_topic
0,W4410793892,2025-05-28,T14064
1,W4410791727,2025-05-27,T14064
2,W4410789294,2025-05-27,T14064
3,W4410786447,2025-05-27,T14064
4,W4410778805,2025-05-27,T14064


In [7]:
all_works2primary_topic_df = read_parquet("data/all_works2primary_topic_parquet/all_works2primary_topic.parquet")


Reading 'all_works2primary_topic' from 'data/all_works2primary_topic_parquet/all_works2primary_topic.parquet' using engine='pyarrow'
Read 210,864,615 rows from 'all_works2primary_topic' in 10.52 sec.
Converting dtypes took 0.14 sec. Size before: 10.00GB, after: 10.00GB


Unnamed: 0,id,date,primary_topic
0,W4410793892,2025-05-28,T14064
1,W4410791727,2025-05-27,T14064
2,W4410789294,2025-05-27,T14064


In [8]:
all_works2primary_topic_df.primary_topic.nunique()

4516

In [9]:
all_works2primary_topic_df.date.min(), all_works2primary_topic_df.date.max()

('0004-07-12', '2025-12-31')

In [10]:
all_works2primary_topic_df.date.apply(lambda x : x if x > "1" else None).min()

'1007-04-01'