In [65]:
import json
import os
import sys

import boto3
from dotenv import load_dotenv
import pandas as pd
from tqdm.notebook import tqdm

load_dotenv()

True

In [35]:
def get_chunk_url(i: int, stage2: bool = True):
    if stage2:
        return f"https://sufficiency-library.s3.fr-par.scw.cloud/stage-2/preds/chunk_{i}.parquet"
    else:
        return f"https://sufficiency-library.s3.fr-par.scw.cloud/stage-1/chunk_{i}.parquet"

In [6]:
# you needs lots of RAM for this
# I used the same compute instance used for the predictions, but you can optimize it much more by filtering as you load
res = []
for i in range(11, 25):
    url = get_chunk_url(i)
    df = pd.read_parquet(url)
    res.append(df)
    print(i, 'ok')

11 ok
12 ok
13 ok
14 ok
15 ok
16 ok
17 ok
18 ok
19 ok
20 ok
21 ok
22 ok
23 ok
24 ok


In [11]:
allpreds = pd.concat(res, ignore_index=True)

In [13]:
allpreds

Unnamed: 0,id,title,abstract,language,proba_other,proba_planetary_boundaries,proba_well_being,proba_resources,proba_justice,prescreening_high,prescreening_medium,prescreening_low,pred_class
0,W1000006914,Cardiolipin Profile Changes are Associated to ...,Brain mitochondria are fundamental to maintain...,en,0.853108,0.034522,0.039105,0.038509,0.035241,False,False,False,other
1,W100011079,Economic impact of onchocerciasis through the ...,This note overviews several studies that have ...,en,0.028584,0.491196,0.373877,0.042916,0.017946,True,False,False,planetary_boundaries
2,W1000067844,The Convention on International Trade in Endan...,The 1973 Convention on International Trade in ...,en,0.057553,0.751890,0.029672,0.050781,0.049556,True,True,False,planetary_boundaries
3,W100016882,Investigations and actions taken during 2011 d...,Echinococcus multilocularis is a parasite that...,en,0.784568,0.092691,0.034864,0.027405,0.033613,False,False,False,other
4,W1000090685,Development of a Test Procedure for Driver Ass...,Accidents between right turning trucks and str...,en,0.623844,0.330110,0.031617,0.019972,0.025730,False,False,False,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17324450,W4412403130,Curvature-dependent dynamics of a bacterium co...,We investigate the positional behavior of a si...,en,0.853631,0.033652,0.037252,0.039866,0.036489,False,False,False,other
17324451,W4412538833,Self-Attentive Transformer for Fast and Accura...,Abstract Current postprocessing techniques oft...,en,0.551830,0.438791,0.027694,0.021773,0.023525,False,False,False,other
17324452,W4287812263,Asymptotically Achieving Centralized Rate on t...,We analyze the high-SNR regime of the MxK Netw...,en,0.854295,0.035799,0.038473,0.039041,0.033799,False,False,False,other
17324453,W4411946848,An atlas of photonic and plasmonic materials f...,Abstract Cathodoluminescence (CL) microscopy h...,en,0.853834,0.034404,0.036791,0.039824,0.036168,False,False,False,other


In [15]:
print(allpreds['prescreening_low'].mean())
print(allpreds['prescreening_low'].sum())

0.14487705385248772
2509916


In [16]:
kept = allpreds[allpreds['prescreening_low']]

In [24]:
s3 = boto3.client(
        service_name="s3",
        endpoint_url="https://s3.fr-par.scw.cloud",
        aws_access_key_id=os.environ["S3_ACCESS_KEY"],
        aws_secret_access_key=os.environ["S3_SECRET_KEY"],
    )

In [25]:
kept.to_parquet('screened_preds.parquet', index=False)

In [27]:
s3.upload_file(
        Filename="screened_preds.parquet",
        Bucket="sufficiency-library",
        Key=f"stage-2/all_screened_with_preds.parquet",
    )

In [33]:
with open("screened_ids.txt", 'w', encoding='utf-8') as f:
    for i, row in tqdm(kept.iterrows(), total=len(kept)):
        value = row['id']
        f.write(str(value) + '\n')

  0%|          | 0/2509916 [00:00<?, ?it/s]

In [34]:
s3.upload_file(
        Filename="screened_ids.txt",
        Bucket="sufficiency-library",
        Key=f"stage-2/all_screened_ids.txt",
    )

# Now form final prescreening DB by taking all stage 1 data for kept ids

In [36]:
# again you can optimize this a lot more, I'm just enjoing my 189 GB of RAM
res = []
for i in range(25):
    url = get_chunk_url(i, stage2=False)
    df = pd.read_parquet(url)
    res.append(df)
    print(i, 'ok')

0 ok
1 ok
2 ok
3 ok
4 ok
5 ok
6 ok
7 ok
8 ok
9 ok
10 ok
11 ok
12 ok
13 ok
14 ok
15 ok
16 ok
17 ok
18 ok
19 ok
20 ok
21 ok
22 ok
23 ok
24 ok


In [41]:
alldata = pd.concat(res, ignore_index=True)

In [45]:
df = alldata[alldata.id.isin(kept.id)]

In [54]:
df.isna().sum()

id                       0
doi                 429268
title                    0
abstract                 0
language                 0
publication_date      1341
type                     0
open_access              0
best_oa_location    768671
has_fulltext             0
fwci                170140
created_at               0
dtype: int64

In [59]:
sys.getsizeof(df) / 1024**2

8619.808549880981

In [72]:
df.head()

Unnamed: 0,id,doi,title,abstract,language,publication_date,type,open_access,best_oa_location,has_fulltext,fwci,created_at
20,W1000221716,,Integrated management of leafy spurge,Figure 1. Leafy spurge plant in flowering grow...,en,1995-01-01,article,"{""is_oa"": true, ""oa_status"": ""green"", ""oa_url""...","{""id"": ""pmh:oai:library.ndsu.edu:10365/17688"",...",1.0,1.44102389,2025-12-01 18:23:11
21,W100006350,,Farm Consolidation in the Northern and Central...,"During the past half century, American agricul...",en,1987-01-01,article,"{""is_oa"": true, ""oa_status"": ""green"", ""oa_url""...","{""id"": ""pmh:oai:digitalcommons.unl.edu:greatpl...",1.0,0.0,2025-12-01 18:23:11
33,W1000233381,https://doi.org/10.12736/issn.2300-3022.2013212,The lower Vistula in the aspect of the E40 and...,Throughout the history of Europe and the world...,en,2013-06-03,article,"{""is_oa"": false, ""oa_status"": ""closed"", ""oa_ur...",,0.0,0.48269519,2025-12-01 18:23:11
44,W1000028557,https://doi.org/10.3133/ofr93101,"Occurrence of pesticides, nitrite plus nitrate...","During 1988, pesticides were detected in 4 of ...",en,1993-01-01,article,"{""is_oa"": true, ""oa_status"": ""bronze"", ""oa_url...","{""id"": ""doi:10.3133/ofr93101"", ""is_oa"": true, ...",1.0,0.0,2025-12-01 18:23:11
48,W1000134164,https://doi.org/10.3133/sir20055079,Feasibility of using benthic invertebrates as ...,Macroinvertebrates were collected from 19 site...,en,2005-01-01,article,"{""is_oa"": false, ""oa_status"": ""closed"", ""oa_ur...",,0.0,0.0,2025-12-01 18:23:11


In [67]:
oa = pd.json_normalize(df.open_access.apply(json.loads))

In [68]:
oa

Unnamed: 0,is_oa,oa_status,oa_url,any_repository_has_fulltext
0,True,green,http://hdl.handle.net/10365/17688,False
1,True,green,http://digitalcommons.unl.edu/cgi/viewcontent....,False
2,False,closed,,False
3,True,bronze,https://pubs.usgs.gov/of/1993/0101/report.pdf,False
4,False,closed,,False
...,...,...,...,...
2509911,False,closed,,False
2509912,False,closed,,False
2509913,False,closed,,False
2509914,True,green,https://doi.org/10.20944/preprints202412.1402.v1,False


In [69]:
oa.is_oa.sum()

np.int64(1741245)

In [70]:
df.has_fulltext.sum()

np.float64(1177999.0)

In [71]:
oa.any_repository_has_fulltext.sum()

np.int64(0)

In [73]:
boa = pd.json_normalize(df.best_oa_location.apply(lambda x: json.loads(x) if x else None))

In [75]:
boa.head()

Unnamed: 0,id,is_oa,landing_page_url,pdf_url,license,license_id,version,is_accepted,is_published,raw_source_name,...,source.issn,source.is_oa,source.is_in_doaj,source.is_core,source.host_organization,source.host_organization_name,source.host_organization_lineage,source.host_organization_lineage_names,source.type,source
0,pmh:oai:library.ndsu.edu:10365/17688,True,http://hdl.handle.net/10365/17688,http://hdl.handle.net/10365/17688,,,submittedVersion,False,False,NDSU Extension 866,...,,False,False,False,https://openalex.org/I57328836,North Dakota State University,[https://openalex.org/I57328836],[],repository,
1,pmh:oai:digitalcommons.unl.edu:greatplainsquar...,True,http://digitalcommons.unl.edu/greatplainsquart...,http://digitalcommons.unl.edu/cgi/viewcontent....,,,submittedVersion,False,False,Great Plains Quarterly,...,,False,False,False,https://openalex.org/I114395901,University of Nebraska–Lincoln,[https://openalex.org/I114395901],[],repository,
2,,,,,,,,,,,...,,,,,,,,,,
3,doi:10.3133/ofr93101,True,https://doi.org/10.3133/ofr93101,https://pubs.usgs.gov/of/1993/0101/report.pdf,,,publishedVersion,True,True,Open-File Report,...,"[0196-1497, 2331-1258, 2332-4899]",False,False,True,https://openalex.org/P4310316088,United States Department of the Interior,[https://openalex.org/P4310316088],[],journal,
4,,,,,,,,,,,...,,,,,,,,,,


In [79]:
df.best_oa_location.notna().sum()

np.int64(1741245)

In [76]:
boa.is_oa.sum()

1741245

In [85]:
boa.is_oa.dropna().value_counts()

is_oa
True    1741245
Name: count, dtype: int64

In [80]:
boa.columns

Index(['id', 'is_oa', 'landing_page_url', 'pdf_url', 'license', 'license_id',
       'version', 'is_accepted', 'is_published', 'raw_source_name', 'raw_type',
       'source.id', 'source.display_name', 'source.issn_l', 'source.issn',
       'source.is_oa', 'source.is_in_doaj', 'source.is_core',
       'source.host_organization', 'source.host_organization_name',
       'source.host_organization_lineage',
       'source.host_organization_lineage_names', 'source.type', 'source'],
      dtype='object')

In [88]:
(oa.is_oa == boa.is_oa.fillna(False)).mean()

  (oa.is_oa == boa.is_oa.fillna(False)).mean()


np.float64(1.0)

In [90]:
(oa.oa_url == boa.pdf_url).mean()

np.float64(0.5557492760713905)

In [95]:
(oa.oa_url == boa.landing_page_url).mean()

np.float64(0.19706316864787507)

In [91]:
oa.oa_url.notna().sum()

np.int64(1738030)

In [92]:
boa.pdf_url.notna().sum()

np.int64(1394884)

In [94]:
boa.landing_page_url.notna().sum()

np.int64(1714634)

In [106]:
m = (oa.oa_url == boa.pdf_url) | (oa.oa_url == boa.landing_page_url)
oa[~m].oa_url.notna().sum()  # 0 means oa.oa_url is always boa.pdf_url or boa.landing_page when not null

np.int64(0)

In [105]:
m.sum()

np.int64(1738030)

best_oa_location is here exactly when oa.is_oa is True, and then oa.oa_url is always either boa.pdf_url or boa.landing_page_url

In [111]:
oadf = pd.concat([oa[['is_oa', 'oa_status']], boa[['landing_page_url', 'pdf_url']]], axis=1)

In [112]:
oadf

Unnamed: 0,is_oa,oa_status,landing_page_url,pdf_url
0,True,green,http://hdl.handle.net/10365/17688,http://hdl.handle.net/10365/17688
1,True,green,http://digitalcommons.unl.edu/greatplainsquart...,http://digitalcommons.unl.edu/cgi/viewcontent....
2,False,closed,,
3,True,bronze,https://doi.org/10.3133/ofr93101,https://pubs.usgs.gov/of/1993/0101/report.pdf
4,False,closed,,
...,...,...,...,...
2509911,False,closed,,
2509912,False,closed,,
2509913,False,closed,,
2509914,True,green,https://doi.org/10.20944/preprints202412.1402.v1,


In [115]:
oadf.iloc[-2]['landing_page_url']

'https://doi.org/10.20944/preprints202412.1402.v1'

In [118]:
oadf

Unnamed: 0,is_oa,oa_status,landing_page_url,pdf_url
0,True,green,http://hdl.handle.net/10365/17688,http://hdl.handle.net/10365/17688
1,True,green,http://digitalcommons.unl.edu/greatplainsquart...,http://digitalcommons.unl.edu/cgi/viewcontent....
2,False,closed,,
3,True,bronze,https://doi.org/10.3133/ofr93101,https://pubs.usgs.gov/of/1993/0101/report.pdf
4,False,closed,,
...,...,...,...,...
2509911,False,closed,,
2509912,False,closed,,
2509913,False,closed,,
2509914,True,green,https://doi.org/10.20944/preprints202412.1402.v1,


In [122]:
a = df.drop(columns=['open_access', 'best_oa_location', 'created_at'])
final = pd.concat((a.reset_index(drop=True), oadf), axis=1)

In [124]:
final

Unnamed: 0,id,doi,title,abstract,language,publication_date,type,has_fulltext,fwci,is_oa,oa_status,landing_page_url,pdf_url
0,W1000221716,,Integrated management of leafy spurge,Figure 1. Leafy spurge plant in flowering grow...,en,1995-01-01,article,1.0,1.44102389,True,green,http://hdl.handle.net/10365/17688,http://hdl.handle.net/10365/17688
1,W100006350,,Farm Consolidation in the Northern and Central...,"During the past half century, American agricul...",en,1987-01-01,article,1.0,0.0,True,green,http://digitalcommons.unl.edu/greatplainsquart...,http://digitalcommons.unl.edu/cgi/viewcontent....
2,W1000233381,https://doi.org/10.12736/issn.2300-3022.2013212,The lower Vistula in the aspect of the E40 and...,Throughout the history of Europe and the world...,en,2013-06-03,article,0.0,0.48269519,False,closed,,
3,W1000028557,https://doi.org/10.3133/ofr93101,"Occurrence of pesticides, nitrite plus nitrate...","During 1988, pesticides were detected in 4 of ...",en,1993-01-01,article,1.0,0.0,True,bronze,https://doi.org/10.3133/ofr93101,https://pubs.usgs.gov/of/1993/0101/report.pdf
4,W1000134164,https://doi.org/10.3133/sir20055079,Feasibility of using benthic invertebrates as ...,Macroinvertebrates were collected from 19 site...,en,2005-01-01,article,0.0,0.0,False,closed,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2509911,W2112656034,https://doi.org/10.7325/galemys.2012.a05,Habitat suitability model for red deer (Cervus...,Monitoring population trends is essential in w...,en,2012-12-31,article,0.0,0.2588458,False,closed,,
2509912,W2159214726,,Heavy metal concentrations in the topsoils of ...,"In the Dutch part of the area, heavy metal con...",en,2007-01-01,article,0.0,0.0,False,closed,,
2509913,W266132455,,Determination of the upper threshold value for...,Using reclaimed water for greenbelt irrigation...,en,2009-01-01,article,0.0,0.0,False,closed,,
2509914,W4405528638,https://doi.org/10.20944/preprints202412.1402.v1,UV Map Nowcasting and Comparison with Ground-B...,This study introduces a new method for nowcast...,en,2024-12-18,preprint,0.0,0.0,True,green,https://doi.org/10.20944/preprints202412.1402.v1,


In [126]:
final.to_parquet('screened_alldata.parquet', index=False)

In [130]:
s3.upload_file(
        Filename="screened_alldata.parquet",
        Bucket="sufficiency-library",
        Key="library_v1_2025-12-08.parquet",
    )