# Library

In [1]:
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
from read_parquet import *

Using PyArrow strings!


## Functions to convert

In [3]:
def convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator=',', compression='brotli', do_peek = True, do_print = True):
    os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)
    df = pl.read_csv(csv_file_path, separator=separator, infer_schema_length=1000)
    if sort_by is not None:
        # sort first by first column name in title, than second if equal, etc.
        df = df.sort(sort_by, descending = True)
    df.write_parquet(parquet_file_path, compression=compression)
    if do_print:
        print(f"Successfully converted {csv_file_path} to {parquet_file_path}.")
    if do_peek:
        if do_print:
            print("Here's a peek.")
        peek_parquet(parquet_file_path)

# Conversion

## Authors

In [8]:
csv_file_path = "data/authors/authors.csv"
parquet_file_path = "data/authors/authors.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["h_index", "cited_by_count", "works_count"], separator = ',')

Successfully converted data/authors/authors.csv to data/authors/authors.parquet.
Here's a peek.
Name: 'authors'
Path: 'data/authors/authors.parquet'
Files: 1
Rows: 107,541,376
Schema:
    id: large_string
    display_name: large_string
    orcid: large_string
    works_count: int64
    h_index: int64
    cited_by_count: int64
    topics: large_string
    affiliations: large_string
    works_count_by_year: large_string
    cited_by_count_by_year: large_string
5 random rows:


Unnamed: 0,id,display_name,orcid,works_count,h_index,cited_by_count,topics,affiliations,works_count_by_year,cited_by_count_by_year
0,A5100376569,Zhong Lin Wang,0000-0002-5530-0380,3868,318,458687,T10338_2059;T10660_1274;T11230_475;T10914_432;...,I63966007_US,1_1979;1_1984;1_1986;8_1987;5_1988;11_1989;8_1...,5_1984;9_1986;240_1987;98_1988;264_1989;116_19...
1,A5083991449,Walter C. Willett,0000-0003-1458-7597,2715,312,398222,T10866_1195;T10010_438;T12267_402;T12103_356;T...,I1283280774_US;I136199984_US;I2801851002_US;I4...,1_1976;1_1977;3_1980;7_1981;1_1982;5_1983;10_1...,21_1977;78_1980;356_1981;6_1982;833_1983;1959_...
2,A5075296559,Guido Kroemer,0000-0002-9334-4405,2303,281,364156,T10587_392;T10294_332;T10158_318;T10580_317;T1...,I1294671590_FR;I154526488_FR;I185839726_FR;I20...,1_1979;1_1982;1_1986;3_1987;8_1988;5_1989;11_1...,4_1979;50_1987;78_1988;340_1989;358_1990;369_1...
3,A5109160404,Michaël Grätzel,,1896,280,391876,T10024_762;T10078_670;T10247_519;T10321_390;T1...,I105140100_ES_2012_2010;I107639228_US_1975_197...,6_1973;3_1974;6_1975;8_1976;6_1977;10_1978;15_...,194_1973;466_1974;266_1975;123_1976;92_1977;86...
4,A5057810294,Ronald C. Kessler,0000-0003-4831-2305,1649,276,350337,T10272_472;T10182_357;T10376_239;T10242_231;T1...,I136199984_US;I2801851002_US;I36258959_US,1_1963;2_1970;1_1975;2_1976;4_1977;2_1978;9_19...,6_1970;8_1975;181_1976;415_1977;762_1978;983_1...


## Institutions

In [9]:
csv_file_path = "data/institutions/institutions.csv"
parquet_file_path = "data/institutions/institutions.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = "works_count", separator = ';')

Successfully converted data/institutions/institutions.csv to data/institutions/institutions.parquet.
Here's a peek.
Name: 'institutions'
Path: 'data/institutions/institutions.parquet'
Files: 1
Rows: 120,658
Schema:
    id: large_string
    ror_id: large_string
    display_name: large_string
    type: large_string
    country_code: large_string
    country: large_string
    city: large_string
    coord: large_string
    works_count: int64
5 random rows:


Unnamed: 0,id,ror_id,display_name,type,country_code,country,city,coord,works_count
0,I100005738,047rhhm47,Brigham Young University,education,US,United States,Provo,40.23384094238281_-111.65853118896484,0
1,I100019715,04m1j1t24,Australian Institute of Criminology,government,AU,Australia,Canberra,-35.314842224121094_149.13067626953125,0
2,I100051649,00apdsa62,Privolzhsky Research Medical University,education,RU,Russia,Nizhny Novgorod,56.32921600341797_44.009483337402344,0
3,I100063501,01qz7fr76,Brighton and Sussex Medical School,education,GB,United Kingdom,Brighton,50.8650016784668_-0.08500000089406967,0
4,I100066346,04vnq7t77,University of Stuttgart,education,DE,Germany,Stuttgart,48.782318115234375_9.177020072937012,0


## Sources

In [10]:
csv_file_path = "data/sources/sources.csv"
parquet_file_path = "data/sources/sources.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = "works_count", separator = ';')

Successfully converted data/sources/sources.csv to data/sources/sources.parquet.
Here's a peek.
Name: 'sources'
Path: 'data/sources/sources.parquet'
Files: 1
Rows: 255,250
Schema:
    id: large_string
    display_name: large_string
    issn: large_string
    type: large_string
    publisher: large_string
    host_organization: large_string
    country_code: large_string
    apc_usd: large_string
    works_count: int64
    cited_by_count: int64
    h_index: int64
    i10_index: int64
    2yr_h_index: large_string
    2yr_i10_index: large_string
    2yr_mean_citedness: double
    is_oa: large_string
    is_in_doaj: large_string
    is_core: large_string
    is_indexed_in_scopus: large_string
5 random rows:


Unnamed: 0,id,display_name,issn,type,publisher,host_organization,country_code,apc_usd,works_count,cited_by_count,h_index,i10_index,2yr_h_index,2yr_i10_index,2yr_mean_citedness,is_oa,is_in_doaj,is_core,is_indexed_in_scopus
0,S7407052672,NIFS,,repository,,National Institute for Fusion Science,,,28552162,0,0,0,,,0.0,F,F,F,
1,S4306400562,Zenodo (CERN European Organization for Nuclear...,,repository,,CERN European Organization for Nuclear Research,,,10147031,1250513,221,21500,,,0.063284,T,F,F,
2,S4306525036,PubMed,,repository,,,US,,7891251,46362456,939,965613,,,1.275151,F,F,F,
3,S4377196541,Internet Archive (Internet Archive),,repository,,,,,7808691,785301,305,11526,,,0.018623,F,F,F,
4,S4306400572,OPAL (Open@LaTrobe) (La Trobe University),,repository,,La Trobe University,,,6743036,591312,197,8617,,,0.03426,F,F,F,


## Publishers

In [11]:
csv_file_path = "data/publishers/publishers.csv"
parquet_file_path = "data/publishers/publishers.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = "works_count", separator = ';')

Successfully converted data/publishers/publishers.csv to data/publishers/publishers.parquet.
Here's a peek.
Name: 'publishers'
Path: 'data/publishers/publishers.parquet'
Files: 1
Rows: 10,703
Schema:
    id: large_string
    display_name: large_string
    parent_publisher: large_string
    works_count: int64
    cited_by_count: int64
5 random rows:


Unnamed: 0,id,display_name,parent_publisher,works_count,cited_by_count
0,P4310320990,Elsevier BV,,23598446,622938130
1,P4310320595,Wiley,,10838394,242850675
2,P4310319900,Springer Science+Business Media,P4310319965,8123358,153833249
3,P4310311648,Oxford University Press,P4310311647,4797691,106429269
4,P4310320547,Taylor & Francis,,4793689,76694442


## Funders

In [12]:
csv_file_path = "data/funders/funders.csv"
parquet_file_path = "data/funders/funders.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ['grants_count','works_count'], separator = ';')

Successfully converted data/funders/funders.csv to data/funders/funders.parquet.
Here's a peek.
Name: 'funders'
Path: 'data/funders/funders.parquet'
Files: 1
Rows: 32,437
Schema:
    id: large_string
    display_name: large_string
    country_code: large_string
    grants_count: large_string
    works_count: int64
5 random rows:


Unnamed: 0,id,display_name,country_code,grants_count,works_count
0,F4320321001,National Natural Science Foundation of China,CN,,3567548
1,F4320306076,National Science Foundation,US,,1483056
2,F4320332161,National Institutes of Health,US,,1434393
3,F4320320879,Deutsche Forschungsgemeinschaft,DE,,602570
4,F4320320300,European Commission,,,585308


## Awards

In [4]:
csv_file_path = "data/awards/awards.csv"
parquet_file_path = "data/awards/awards.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ['start_date','end_date'], separator = ';')

Successfully converted data/awards/awards.csv to data/awards/awards.parquet.
Here's a peek.
Name: 'awards'
Path: 'data/awards/awards.parquet'
Files: 1
Rows: 11,746,894
Schema:
    id: large_string
    display_name: large_string
    description: large_string
    funder_id: large_string
    funding_type: large_string
    funded_outputs_count: int64
    funded_outputs: large_string
    start_date: large_string
    end_date: large_string
    doi: large_string
5 random rows:


Unnamed: 0,id,display_name,description,funder_id,funding_type,funded_outputs_count,funded_outputs,start_date,end_date,doi
0,G4053403331,Exploring the architecture of interfaces and b...,Despite the significant architectural and plan...,F4320334609,studentship,0,,2031-03-30,2031-03-30,
1,G1977985897,Predicting Mosquito-Borne Disease Risks In Sco...,"Food Security, Mosquito. Livestock. Trypanosom...",F4320334629,studentship,0,,2028-10-06,2028-10-06,
2,G855121204,Reflections on DNA - how do topoisomerases dis...,DNA is a long double-helical molecule that bec...,F4320334629,studentship,0,,2028-10-06,2028-10-06,
3,G6183439496,Photography and Seafaring: Making Visible Mart...,This project aims to produce new visual interp...,F4320334609,studentship,0,,2028-09-29,2028-11-09,
4,G6934949649,"The history, significance and interpretative v...","In 2018, National Museums Northern Ireland (NM...",F4320334609,studentship,0,,2028-09-29,2028-09-29,


In [5]:
awards_df = read_parquet("data/awards/awards.parquet")


Reading 'awards' from 'data/awards/awards.parquet' using engine='pyarrow'
Read 11,746,894 rows from 'awards' in 38.09 sec.
Converting dtypes took 0.07 sec. Size before: 4.89GB, after: 4.89GB


Unnamed: 0,id,display_name,description,funder_id,funding_type,funded_outputs_count,funded_outputs,start_date,end_date,doi
0,G4053403331,Exploring the architecture of interfaces and b...,Despite the significant architectural and plan...,F4320334609,studentship,0,,2031-03-30,2031-03-30,
1,G1977985897,Predicting Mosquito-Borne Disease Risks In Sco...,"Food Security, Mosquito. Livestock. Trypanosom...",F4320334629,studentship,0,,2028-10-06,2028-10-06,
2,G855121204,Reflections on DNA - how do topoisomerases dis...,DNA is a long double-helical molecule that bec...,F4320334629,studentship,0,,2028-10-06,2028-10-06,


In [6]:
awards_df.funded_outputs_count.value_counts()

funded_outputs_count
0       4982301
1       4486971
2        937659
3        412332
4        235924
         ...   
319           1
733           1
676           1
399           1
3835          1
Name: count, Length: 464, dtype: int64

## Topics

In [13]:
csv_file_path = "data/topics/topics.csv"
parquet_file_path = "data/topics/topics.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["works_count", "cited_by_count", ], separator = ';')

Successfully converted data/topics/topics.csv to data/topics/topics.parquet.
Here's a peek.
Name: 'topics'
Path: 'data/topics/topics.parquet'
Files: 1
Rows: 4,516
Schema:
    topic_id: large_string
    display_name: large_string
    description: large_string
    keywords: large_string
    wikipedia: large_string
    subfield_id: int64
    subfield_name: large_string
    field_id: int64
    field_name: large_string
    domain_id: int64
    domain_name: large_string
    sibling_ids: large_string
    works_count: int64
    cited_by_count: int64
5 random rows:


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
0,T10346,Magnetic confinement fusion research,This cluster of papers covers a wide range of ...,Turbulence_Tokamak_Transport_MHD Stability_Edg...,https://en.wikipedia.org/wiki/Plasma_physics,3106,Nuclear and High Energy Physics,31,Physics and Astronomy,3,Physical Sciences,T10025_T10384_T11044_T13458_T10921_T10093_T100...,9270869,2538117
1,T12157,Geochemistry and Geologic Mapping,This cluster of papers focuses on the applicat...,Machine Learning_Mineral Prospectivity_Remote ...,https://en.wikipedia.org/wiki/Mineral_prospecting,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,3952624,1640349
2,T10451,Mycorrhizal Fungi and Plant Interactions,This cluster of papers explores the diverse in...,Mycorrhizal Fungi_Fungal Diversity_Plant Inter...,https://en.wikipedia.org/wiki/Mycorrhiza,1110,Plant Science,11,Agricultural and Biological Sciences,1,Life Sciences,T12253_T11796_T11578_T13422_T11613_T12657_T106...,3202968,1949520
3,T13370,Diverse Scientific and Economic Studies,This cluster of papers covers topics related t...,Financial Analysis_Monetary Policy_Asset Prici...,,2002,Economics and Econometrics,20,"Economics, Econometrics and Finance",2,Social Sciences,T13547_T12446_T11270_T10471_T14143_T12394_T106...,2968659,239038
4,T14423,Military Technology and Strategies,This cluster of papers covers various aspects ...,Air Force_Modernization_Warfare_Unmanned Aeria...,https://en.wikipedia.org/wiki/Air_force_modern...,2202,Aerospace Engineering,22,Engineering,3,Physical Sciences,T10944_T12560_T12696_T10069_T13200_T12158_T106...,2202490,27852


In [11]:
topics_df = read_parquet("data/topics/topics.parquet")
AI_topics_df = topics_df[topics_df.subfield_name == "Artificial Intelligence"]
AI_topics_df


Reading 'topics' from 'data/topics/topics.parquet' using engine='pyarrow'
Read 4,516 rows from 'topics' in 0.07 sec.
Converting dtypes took 0.00 sec. Size before: 0.01GB, after: 0.01GB


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
0,T10346,Magnetic confinement fusion research,This cluster of papers covers a wide range of ...,Turbulence_Tokamak_Transport_MHD Stability_Edg...,https://en.wikipedia.org/wiki/Plasma_physics,3106,Nuclear and High Energy Physics,31,Physics and Astronomy,3,Physical Sciences,T10025_T10384_T11044_T13458_T10921_T10093_T100...,9270869,2538117
1,T12157,Geochemistry and Geologic Mapping,This cluster of papers focuses on the applicat...,Machine Learning_Mineral Prospectivity_Remote ...,https://en.wikipedia.org/wiki/Mineral_prospecting,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,3952624,1640349
2,T10451,Mycorrhizal Fungi and Plant Interactions,This cluster of papers explores the diverse in...,Mycorrhizal Fungi_Fungal Diversity_Plant Inter...,https://en.wikipedia.org/wiki/Mycorrhiza,1110,Plant Science,11,Agricultural and Biological Sciences,1,Life Sciences,T12253_T11796_T11578_T13422_T11613_T12657_T106...,3202968,1949520


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
1,T12157,Geochemistry and Geologic Mapping,This cluster of papers focuses on the applicat...,Machine Learning_Mineral Prospectivity_Remote ...,https://en.wikipedia.org/wiki/Mineral_prospecting,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,3952624,1640349
37,T13650,Computational Physics and Python Applications,This cluster of papers covers a wide range of ...,Python_Scientific Computing_Statistical Modeli...,https://en.wikipedia.org/wiki/Python_(programm...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T14175_T12611_T101...,409306,622443
88,T10181,Natural Language Processing Techniques,This cluster of papers focuses on statistical ...,Statistical Machine Translation_Neural Machine...,https://en.wikipedia.org/wiki/Statistical_mach...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,283826,2999722
121,T10320,Neural Networks and Applications,This cluster of papers covers a wide range of ...,Neural Networks_Self-Organizing Maps_Backpropa...,https://en.wikipedia.org/wiki/Artificial_neura...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,247207,4351608
151,T13398,Data Analysis with R,This cluster of papers focuses on statistical ...,R language_statistical analysis_data visualiza...,https://en.wikipedia.org/wiki/R_(programming_l...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,224954,568812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4096,T13935,Mathematical Control Systems and Analysis,This cluster of papers focuses on the applicat...,Fuzzy Computing_Intelligent Systems_Soft Compu...,https://en.wikipedia.org/wiki/Fuzzy_computing,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,14815,66482
4163,T14413,Advanced Technologies in Various Fields,This cluster of papers focuses on the developm...,Graph Embedding_Visual Question Answering_Sema...,https://en.wikipedia.org/wiki/Graph_embedding,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,13107,24789
4335,T14175,Intuitionistic Fuzzy Systems Applications,This cluster of papers focuses on enhancing e-...,Intelligent Agent_E-Learning_InterCriteria Ana...,https://en.wikipedia.org/wiki/E-learning,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T12611_T101...,7863,33271
4444,T13567,AI and Multimedia in Education,This cluster of papers focuses on the optimiza...,Big Data Scheduling_Fractal Encoding_Visual Tr...,https://en.wikipedia.org/wiki/Big_data,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,4110,7024


## Works by topic

### Check all topics present

In [38]:
topics_df = read_parquet("data/topics/topics.parquet")


Reading 'topics' from 'data/topics/topics.parquet' using engine='pyarrow'
Read 4,516 rows from 'topics' in 0.02 sec.
Converting dtypes took 0.00 sec. Size before: 0.01GB, after: 0.01GB


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
0,T10346,Magnetic confinement fusion research,This cluster of papers covers a wide range of ...,Turbulence_Tokamak_Transport_MHD Stability_Edg...,https://en.wikipedia.org/wiki/Plasma_physics,3106,Nuclear and High Energy Physics,31,Physics and Astronomy,3,Physical Sciences,T10025_T10384_T11044_T13458_T10921_T10093_T100...,9270869,2538117
1,T12157,Geochemistry and Geologic Mapping,This cluster of papers focuses on the applicat...,Machine Learning_Mineral Prospectivity_Remote ...,https://en.wikipedia.org/wiki/Mineral_prospecting,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T13674_T12380_T10951_T11010_T13650_T14175_T126...,3952624,1640349
2,T10451,Mycorrhizal Fungi and Plant Interactions,This cluster of papers explores the diverse in...,Mycorrhizal Fungi_Fungal Diversity_Plant Inter...,https://en.wikipedia.org/wiki/Mycorrhiza,1110,Plant Science,11,Agricultural and Biological Sciences,1,Life Sciences,T12253_T11796_T11578_T13422_T11613_T12657_T106...,3202968,1949520


In [39]:
csv_folder = "data/works_by_topic_csv/"
files = os.listdir(csv_folder)

In [40]:
len(files)

4516

In [41]:
topics_in_folder = [file[:-4] for file in files]

In [42]:
topics_in_df = topics_df.topic_id.to_list()

In [43]:
topics_in_folder_not_in_df = [topic for topic in topics_in_folder if topic not in topics_in_df]
topics_in_df_not_in_folder = [topic for topic in topics_in_df if topic not in topics_in_folder]
len(topics_in_folder_not_in_df), len(topics_in_df_not_in_folder)

(0, 0)

### Transform to parquet

In [12]:
csv_folder = "data/works_by_topic_csv/"
parquet_folder = "data/works_by_topic_parquet/"

In [13]:
topics = [topic[:-4] for topic in os.listdir(csv_folder)]
len(topics)

4516

In [14]:
for topic in tqdm(topics):
    csv_file_path = csv_folder + topic + ".csv"
    parquet_file_path = parquet_folder + topic + ".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["date", "id", ], separator = ',', do_peek = False, do_print = False)

  0%|          | 0/4516 [00:00<?, ?it/s]

In [15]:
topic = topics[0]
parquet_file_path = parquet_folder + topic + ".parquet"
peek_parquet(parquet_file_path)

Name: 'T13871'
Path: 'data/works_by_topic_parquet/T13871.parquet'
Files: 1
Rows: 5,623
Schema:
    id: large_string
    date: large_string
    type: large_string
    language: large_string
    journal: large_string
    doi: large_string
    authors: large_string
    topics: large_string
    references: large_string
    sdg: large_string
    keywords: large_string
    awards: large_string
    primary_topic: large_string
    related_works: large_string
5 random rows:


Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,awards,primary_topic,related_works
0,W7126411087,2026-01-29,article,,S5407051178,10.63363/aijfr.2026.v07i01.3147,A5124672088__T;A5124652656__F;A5124485050__F,T13871_0.3333;T12302_0.1083;T10180_0.1036,,,Nanoparticle_0.626;Sensitivity (control system...,,T13871,
1,W7125952890,2026-01-28,article,en,S2765077160,10.1016/j.jics.2026.102451,A5100700062_I303593345_F;A5069329739_I30359334...,T13871_0.9644;T13325_0.0064;T11393_0.0059,W226416011;W1562902293;W1973730610;W1979083021...,2_0.41488516330718994,Melamine_0.9064;Silver nanoparticle_0.8041;Cop...,,T13871,
2,W7115807244,2026-01-19,dissertation,en,S4306402616,,"Kourkopoulos, Thanos__T",T13871_0.1439;T10686_0.1415;T12388_0.1083,,2_0.7618747353553772,Hazardous waste_0.8117;Prioritization_0.7344;I...,,T13871,
3,W7124706820,2026-01-17,article,en,S2737566431,10.3390/antiox15010122,A5016617714_I102064193_T;A5085248982_I10206419...,T13871_0.99;T10116_0.0025;T11107_0.0008,W1520897616;W1527262979;W1529168908;W187018469...,,Capacitation_0.8387;Sperm_0.7311;Oxidative str...,,T13871,
4,W7124573560,2026-01-17,article,en,S87018847,10.1007/s00204-025-04289-5,A5123279779_I4210109381_F;A5077061484_I4210109...,T13871_0.9977;T13325_0.0002;T12302_0.0001,W1556817153;W1605332309;W1965634284;W196979691...,,Synaptic plasticity_0.7328;Morris water naviga...,,T13871,


In [16]:
parquet_file_path = parquet_folder + topic + ".parquet"
df = read_parquet(parquet_file_path)


Reading 'T13871' from 'data/works_by_topic_parquet/T13871.parquet' using engine='pyarrow'
Read 5,623 rows from 'T13871' in 0.02 sec.
Converting dtypes took 0.08 sec. Size before: 0.00GB, after: 0.00GB


Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,awards,primary_topic,related_works
0,W7126411087,2026-01-29,article,,S5407051178,10.63363/aijfr.2026.v07i01.3147,A5124672088__T;A5124652656__F;A5124485050__F,T13871_0.3333;T12302_0.1083;T10180_0.1036,,,Nanoparticle_0.626;Sensitivity (control system...,,T13871,
1,W7125952890,2026-01-28,article,en,S2765077160,10.1016/j.jics.2026.102451,A5100700062_I303593345_F;A5069329739_I30359334...,T13871_0.9644;T13325_0.0064;T11393_0.0059,W226416011;W1562902293;W1973730610;W1979083021...,2_0.41488516330718994,Melamine_0.9064;Silver nanoparticle_0.8041;Cop...,,T13871,
2,W7115807244,2026-01-19,dissertation,en,S4306402616,,"Kourkopoulos, Thanos__T",T13871_0.1439;T10686_0.1415;T12388_0.1083,,2_0.7618747353553772,Hazardous waste_0.8117;Prioritization_0.7344;I...,,T13871,


In [17]:
df.awards.apply(lambda x : len(x)).value_counts()

awards
0      5255
23      162
47       77
71       27
22       23
46       17
95        8
94        7
70        7
119       5
142       4
69        4
117       4
118       3
93        3
166       3
143       2
236       1
141       1
191       1
238       1
92        1
167       1
45        1
164       1
165       1
212       1
189       1
91        1
Name: count, dtype: int64

In [18]:
parquet_folder = "data/works2citations_by_topic_parquet/"
parquet_file_path = parquet_folder + topic + ".parquet"
citations_df = read_parquet(parquet_file_path)


Reading 'T14028' from 'data/works2citations_by_topic_parquet/T14028.parquet' using engine='pyarrow'
Read 213,531 rows from 'T14028' in 0.09 sec.
Converting dtypes took 0.00 sec. Size before: 0.02GB, after: 0.02GB


Unnamed: 0,work_id,primary_topic,publication_date,referenced_work_id,referenced_primary_topic,referenced_publication_date
0,W4412028003,T11177,2025-07-04,W3179432328,T14028,2021-07-13
1,W4403431318,T11177,2024-10-15,W27222542,T14028,1996-01-01
2,W4392854270,T11177,2024-03-15,W1971556202,T14028,1994-10-16


In [19]:
citation_counts = citations_df.groupby("referenced_work_id").size().reset_index(name="cited_by_count")
citation_counts

Unnamed: 0,referenced_work_id,cited_by_count
0,W1000874774,1
1,W1001958261,3
2,W100421077,23
3,W1006562173,1
4,W1007648502,1
...,...,...
21284,W994834425,3
21285,W99668335,1
21286,W998146579,1
21287,W999339260,38


In [20]:
df

Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,grants,primary_topic
0,W7125826661,2026-01-27,article,en,S4210234263,10.21608/eajbse.2026.480905,,T14028_0.9744;T11386_0.0054;T12750_0.001,,,Pathogenic bacteria_0.8014;Antibacterial activ...,,T14028
1,W7125814211,2026-01-27,article,en,S205866209,10.1159/000550719,,T14028_0.9366;T10634_0.0445;T11353_0.0022,,,Sensitization_0.8468;Venom_0.7916;Immunotherap...,,T14028
2,W7125385546,2026-01-22,article,en,S127393932,10.1007/s10753-025-02444-9,,T14028_0.7949;T11103_0.0408;T11386_0.0156,W135218290;W1992450378;W2057443193;W2061473393...,,Oxidative stress_0.6667;Nitric oxide_0.6094;Gl...,,T14028
3,W7125816570,2026-01-21,article,en,S4210174320,10.21608/ejchem.2026.436382.12534,,T14028_0.4668;T11386_0.1703;T10702_0.0878,,,Seasonality_0.7431;Variation (astronomy)_0.364...,,T14028
4,W7124896153,2026-01-18,article,en,S149937609,10.1016/j.cej.2026.173056,,T14028_0.3546;T12689_0.0759;T11966_0.0645,W1480260780;W2030141189;W2110341717;W214732948...,,Melittin_0.8755;Breast cancer_0.6223;Chitosan_...,,T14028
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76921,W6903235804,1753-01-01,article,en,S4306400572,10.1021/acs.jmedchem.0c02237.s002,,T14028_0.2024;T11353_0.1372;T12477_0.1051,,,Cytotoxicity_0.7601;Oncolytic virus_0.7316;Mel...,,T14028
76922,W7037216328,1736-01-01,article,en,,,,T14028_0.2063;T11353_0.1424;T12388_0.1042,,,Power (physics)_0.2283;Identity (music)_0.207;...,,T14028
76923,W2496461194,1651-01-01,book-chapter,en,S4306463708,10.1093/oseo/instance.00011369,,T14028_0.2156,,,Environmental science_0.28023863,,T14028
76924,W7037026684,1560-01-01,article,en,,,,T14028_0.3924;T11103_0.1501;T11353_0.1348,,,Field (mathematics)_0.3673;Order (exchange)_0....,,T14028


In [37]:
works_parquet_folder = "data/works_by_topic_parquet/"
works_parquet_folder2 = "data/works_by_topic_parquet2/"
os.makedirs(works_parquet_folder2, exist_ok = True)
citations_parquet_folder = "data/works2citations_by_topic_parquet/"
references_parquet_folder = "data/works2references_by_topic_parquet/"

for topic in tqdm(topics):
    # --- load works for this topic ---
    works_path = os.path.join(works_parquet_folder, f"{topic}.parquet")
    works_df = pd.read_parquet(works_path)

    # --- load citations for this topic ---
    citations_path = os.path.join(citations_parquet_folder, f"{topic}.parquet")
    citations_df = pd.read_parquet(citations_path)

    # --- load references for this topic ---
    references_path = os.path.join(references_parquet_folder, f"{topic}.parquet")
    references_df = pd.read_parquet(references_path)

    # =========================
    # 1. CITATIONS RECEIVED
    # count rows per referenced_work_id
    # =========================
    citation_counts = (
        citations_df.groupby("referenced_work_id")
        .size()
        .reset_index(name="cited_by_count_computed")
    )

    works_df = works_df.merge(
        citation_counts,
        left_on="id",
        right_on="referenced_work_id",
        how="left"
    ).drop(columns=["referenced_work_id"])

    works_df["cited_by_count_computed"] = (
        works_df["cited_by_count_computed"].fillna(0).astype("int64")
    )

    # =========================
    # 2. REFERENCES MADE
    # count rows per work_id
    # =========================
    reference_counts = (
        references_df.groupby("work_id")
        .size()
        .reset_index(name="references_count_computed")
    )

    works_df = works_df.merge(
        reference_counts,
        left_on="id",
        right_on="work_id",
        how="left"
    ).drop(columns=["work_id"])

    works_df["references_count_computed"] = (
        works_df["references_count_computed"].fillna(0).astype("int64")
    )

    # (optional) save back
    out_path = os.path.join(works_parquet_folder2, f"{topic}.parquet")
    works_df.to_parquet(out_path, index=False)

  0%|          | 0/4516 [00:00<?, ?it/s]

In [None]:
for topic in ai_topics:
    
    columns = ["id", "date", "type", "language", "doi", "topics", "keywords", "cited_by_count_computed"]

In [36]:
works_df.sort_values("cited_by_count_computed")

Unnamed: 0,id,date,type,language,journal,doi,authors,topics,references,sdg,keywords,grants,primary_topic,cited_by_count_computed,references_count_computed
2310,W7097069474,2014-01-22,article,en,,,,T13871_0.1426;T10875_0.0898;T13325_0.0527,,,Scope (computer science)_0.539,,T13871,0,0
2257,W1528778344,2014-05-12,dissertation,en,,,,T13871_1.0;T10819_0.9841;T13325_0.968,W113515681;W1489571934;W1765688430;W1966156295...,,High-performance liquid chromatography_0.83484...,,T13871,0,47
2275,W2072779915,2014-03-30,article,en,S4210178289,10.13103/jfhs.2014.29.1.021,,T13871_0.9989;T13325_0.9948;T10180_0.9671,W1973602973;W2052894441;W2075066186;W213161989...,,Formaldehyde_0.9158947;Phenol_0.84947884;Chemi...,,T13871,0,5
2276,W3143150319,2014-03-25,article,en,S2764679879,10.7506/spkx1002-6630-201406037,,T13871_0.9992;T13325_0.9182;T12302_0.905,,,Melamine_0.86162937;Chromatography_0.6294806;H...,,T13871,0,0
2280,W4214543131,2014-03-15,article,en,S4210199966,10.1007/s00240-014-0648-1,,T13871_0.9946,,,Melamine_0.9099543;Composition (language)_0.62...,,T13871,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4289,W2168505600,2008-08-09,article,en,S186282544,10.1093/toxsci/kfn160,,T13871_0.9999;T10819_0.9819;T12388_0.9116,W44356197;W1499772973;W1564619006;W1755391355;...,,Cyanuric acid_0.9853083;Melamine_0.9117625;Che...,,T13871,449,14
4407,W2120530839,2007-09-01,article,en,S206734468,10.1177/104063870701900510,,T13871_0.9998;T12388_0.9805;T10800_0.9779,W17199429;W1597429012;W1969028305;W2006440916;...,,Azotemia_0.8790613;Lethargy_0.76694936;Outbrea...,,T13871,518,10
3910,W1985043634,2009-06-18,article,en,S111155417,10.1021/ja9037017,,T13871_0.9999;T12388_0.9914;T10819_0.9878,W1489710234;W1988214044;W1997380582;W201915723...,,Melamine_0.98524386;Chemistry_0.90361035;Cyanu...,,T13871,588,11
4454,W1601956008,2007-01-01,book,en,,10.1016/c2016-0-01687-x,,T13871_0.6736,,,Veterinary Drugs_0.52392966;Clinical toxicolog...,,T13871,660,0


In [16]:
print(df.authors.apply(lambda x : len(x)).value_counts())

authors
0    76926
Name: count, dtype: int64


In [17]:
print(df.keywords.apply(lambda x : len(x)).value_counts())

keywords
27     785
19     730
18     554
0      434
20     367
      ... 
721      1
15       1
531      1
617      1
620      1
Name: count, Length: 596, dtype: int64


## Works2text by topic

In [26]:
csv_folder = "data/works2text_by_topic_csv/"
parquet_folder = "data/works2text_by_topic_parquet/"

In [27]:
topics = [topic[:-4] for topic in os.listdir(csv_folder)]
len(topics)

4516

In [28]:
for topic in tqdm(topics):
    csv_file_path = csv_folder + topic + ".csv"
    parquet_file_path = parquet_folder + topic + ".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = ["date", "id", ], separator = ';', do_peek = False, do_print = False)

  0%|          | 0/4516 [00:00<?, ?it/s]

In [29]:
peek_parquet(parquet_file_path)

Name: 'T14028'
Path: 'data/works2text_by_topic_parquet/T14028.parquet'
Files: 1
Rows: 76,926
Schema:
    id: large_string
    date: large_string
    title: large_string
    abstract: large_string
5 random rows:


Unnamed: 0,id,date,title,abstract
0,W7125826661,2026-01-27,Antibacterial Activity of Chitosan Nanoparticl...,
1,W7125814211,2026-01-27,The Clinical Importance of Component Based Dia...,Introduction Venom immunotherapy (VIT) is the ...
2,W7125385546,2026-01-22,A Melittin-Derived Lead Compound Ameliorates S...,Abstract Severe acute pancreatitis (SAP) is a ...
3,W7125816570,2026-01-21,Biochemical Effects of Seasonal Variation and ...,
4,W7124896153,2026-01-18,Melittin nanoparticle-based bidirectional spra...,


## Works2citations by topic

In [7]:
csv_folder = "data/works2citations_by_topic_csv/"
parquet_folder = "data/works2citations_by_topic_parquet/"

In [8]:
topics = [topic[:-4] for topic in os.listdir(csv_folder)]
len(topics)

4516

In [9]:
for topic in tqdm(topics):
    csv_file_path = csv_folder + topic + ".csv"
    parquet_file_path = parquet_folder + topic + ".parquet"
    convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator = ',', do_peek = False, do_print = False)

  0%|          | 0/4516 [00:00<?, ?it/s]

In [10]:
peek_parquet(parquet_file_path)

Name: 'T14028'
Path: 'data/works2citations_by_topic_parquet/T14028.parquet'
Files: 1
Rows: 213,531
Schema:
    work_id: large_string
    primary_topic: large_string
    publication_date: large_string
    referenced_work_id: large_string
    referenced_primary_topic: large_string
    referenced_publication_date: large_string
5 random rows:


Unnamed: 0,work_id,primary_topic,publication_date,referenced_work_id,referenced_primary_topic,referenced_publication_date
0,W4412028003,T11177,2025-07-04,W3179432328,T14028,2021-07-13
1,W4403431318,T11177,2024-10-15,W27222542,T14028,1996-01-01
2,W4392854270,T11177,2024-03-15,W1971556202,T14028,1994-10-16
3,W4212810480,T11177,2022-02-17,W1971556202,T14028,1994-10-16
4,W3128463426,T11177,2021-02-12,W2048172946,T14028,1997-04-01


# Peek other files

## all_works2primary_topic

In [35]:
peek_parquet("data/all_works2primary_topic_parquet/all_works2primary_topic.parquet")

Name: 'all_works2primary_topic'
Path: 'data/all_works2primary_topic_parquet/all_works2primary_topic.parquet'
Files: 1
Rows: 306,133,392
Schema:
    id: large_string
    date: large_string
    primary_topic: large_string
    -- schema metadata --
    pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 602
5 random rows:


Unnamed: 0,id,date,primary_topic
0,W7126435799,2026-02-01,T14064
1,W7126372280,2026-02-01,T14064
2,W7126447841,2026-01-31,T14064
3,W7126389308,2026-01-31,T14064
4,W7126248061,2026-01-30,T14064


In [7]:
all_works2primary_topic_df = read_parquet("data/all_works2primary_topic_parquet/all_works2primary_topic.parquet")


Reading 'all_works2primary_topic' from 'data/all_works2primary_topic_parquet/all_works2primary_topic.parquet' using engine='pyarrow'
Read 210,864,615 rows from 'all_works2primary_topic' in 10.52 sec.
Converting dtypes took 0.14 sec. Size before: 10.00GB, after: 10.00GB


Unnamed: 0,id,date,primary_topic
0,W4410793892,2025-05-28,T14064
1,W4410791727,2025-05-27,T14064
2,W4410789294,2025-05-27,T14064


In [8]:
all_works2primary_topic_df.primary_topic.nunique()

4516

In [9]:
all_works2primary_topic_df.date.min(), all_works2primary_topic_df.date.max()

('0004-07-12', '2025-12-31')

In [10]:
all_works2primary_topic_df.date.apply(lambda x : x if x > "1" else None).min()

'1007-04-01'

## Topics

In [4]:
parquet_file_path = "data/topics/topics.parquet"
topics_df = read_parquet(parquet_file_path)


Reading 'topics' from 'data/topics/topics.parquet' using engine='pyarrow'
Read 4,516 rows from 'topics' in 0.03 sec.
Converting dtypes took 0.00 sec. Size before: 0.01GB, after: 0.01GB


Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
0,T11881,Crystallization and Solubility Studies,This cluster of papers focuses on the crystall...,Crystallization_Nucleation_Solubility_Polymorp...,https://en.wikipedia.org/wiki/Crystallization,2505,Materials Chemistry,25,Materials Science,3,Physical Sciences,T10275_T13889_T12302_T10440_T10311_T12340_T114...,975044,784318
1,T11475,French Urban and Social Studies,This cluster of papers explores the intersecti...,Territorial Governance_Environmental Participa...,https://en.wikipedia.org/wiki/Territorial_gove...,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences,T10927_T13843_T13162_T12635_T13428_T12432_T112...,652021,699274
2,T13445,American Constitutional Law and Politics,This cluster of papers explores the developmen...,American founding_Constitutional government_Re...,https://en.wikipedia.org/wiki/American_politic...,3320,Political Science and International Relations,33,Social Sciences,2,Social Sciences,T14181_T12986_T13504_T13643_T12956_T13057_T138...,465559,1814934


In [10]:
topics_df[topics_df.subfield_name.apply(lambda x : "artificial" in x.lower())]

Unnamed: 0,topic_id,display_name,description,keywords,wikipedia,subfield_id,subfield_name,field_id,field_name,domain_id,domain_name,sibling_ids,works_count,cited_by_count
29,T10181,Natural Language Processing Techniques,This cluster of papers focuses on statistical ...,Statistical Machine Translation_Neural Machine...,https://en.wikipedia.org/wiki/Statistical_mach...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,233196,3194394
54,T10320,Neural Networks and Applications,This cluster of papers covers a wide range of ...,Neural Networks_Self-Organizing Maps_Backpropa...,https://en.wikipedia.org/wiki/Artificial_neura...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,205089,4257999
80,T13734,Advanced Computational Techniques and Applicat...,This cluster of papers covers a wide range of ...,Expert Systems_Wavelet Analysis_Machine Learni...,https://en.wikipedia.org/wiki/Artificial_intel...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T11273_T10639_T14413_T13083_T11689_T135...,179068,192945
98,T12157,Geochemistry and Geologic Mapping,This cluster of papers focuses on the applicat...,Machine Learning_Mineral Prospectivity_Remote ...,https://en.wikipedia.org/wiki/Mineral_prospecting,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,164104,1500551
125,T10215,Semantic Web and Ontologies,This cluster of papers focuses on the Semantic...,Semantic Web_Ontology_Linked Data_RDF_OWL_Sche...,https://en.wikipedia.org/wiki/Semantic_Web,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,154257,1470833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4303,T14175,Intuitionistic Fuzzy Systems Applications,This cluster of papers focuses on enhancing e-...,Intelligent Agent_E-Learning_InterCriteria Ana...,https://en.wikipedia.org/wiki/E-learning,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,3213,31320
4347,T13567,AI and Multimedia in Education,This cluster of papers focuses on the optimiza...,Big Data Scheduling_Fractal Encoding_Visual Tr...,https://en.wikipedia.org/wiki/Big_data,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,2569,5596
4371,T13904,Artificial Intelligence Applications,This cluster of papers covers a wide range of ...,Artificial Intelligence_Neural Networks_Deep L...,https://en.wikipedia.org/wiki/Artificial_intel...,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,2175,4082
4393,T13898,Diverse Interdisciplinary Research Studies,This cluster of papers covers a wide range of ...,Cybernetics_Information Theory_Risk Management...,https://en.wikipedia.org/wiki/Cybernetics,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,T10637_T13734_T11273_T10639_T14413_T13083_T116...,1710,16600
