In [55]:
import json
import os

import numpy as np
import pandas as pd
import polars as pl
from dotenv import load_dotenv
from tqdm import tqdm

In [7]:
# Load variables from a .env file into the process environment; later cells
# read OA_ROOT from os.environ.
load_dotenv()

True

In [40]:
def parse_id(col, prefix_len=22):
    """Convert a column of prefixed id strings to unsigned 64-bit integers.

    Parameters
    ----------
    col : pandas.Series
        String column whose values carry a fixed-width prefix followed by
        a decimal number.
    prefix_len : int, default 22
        Number of leading characters to discard before parsing. The default
        presumably matches ``https://openalex.org/`` (21 characters) plus a
        one-letter entity tag such as ``W`` or ``S`` -- confirm against the
        data if the id format changes.

    Returns
    -------
    pandas.Series
        The numeric tail of every value, as ``np.uint64``.

    Raises
    ------
    ValueError
        If a remaining tail is not a valid non-negative integer literal.
    """
    # Fixed-width slicing is cheaper than a regex, but relies on every value
    # sharing the same prefix length.
    return col.str[prefix_len:].astype(np.uint64)

In [1]:
def get_best_q_by_year(path="s3://tmp-borza-public-cyx/metascience/q-by-year.csv.gz"):
    """Read the q-by-year CSV into a polars DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the (gzip-compressed) CSV. Defaults to the public S3
        object this notebook has always used; pass a local path to work
        offline or against a cached copy.

    Returns
    -------
    polars.DataFrame
        The parsed table, exactly as ``pl.read_csv`` returns it.
    """
    return pl.read_csv(path)

In [None]:
# NOTE(review): load_sources is not defined in any visible cell and this cell
# has no execution count, so it presumably fails on a fresh kernel -- confirm
# where load_sources comes from, or remove the cell. source_base is not used
# in the visible cells.
source_base = load_sources()

In [4]:
# Load the q-by-year CSV as a polars DataFrame.
qdf = get_best_q_by_year()

In [49]:
# Source id table: read the ids CSV under OA_ROOT, derive a numeric `id` from
# the `openalex` column, and use it as the index.
sodf = (
    pd.read_csv(os.environ["OA_ROOT"] + "/entity-csvs/sources/ids.csv.gz")
    .assign(id=lambda raw: parse_id(raw["openalex"]))
    .set_index("id")
)

In [60]:
# Column name used when reading the works CSV and when aliasing the `year`
# column of the q-by-year table.
puby = "publication_year"

In [50]:
_isc = "issn"

# One row per (id, issn): the `issn` column holds JSON-encoded lists, so decode
# each one and explode it.
_issn_listed = sodf[_isc].dropna().apply(json.loads).explode().reset_index()

# The `issn_l` values, relabelled so they stack under the same column name.
_issn_linking = sodf["issn_l"].dropna().rename(_isc).reset_index()

# Stack both and drop repeated (id, issn) pairs.
_issns = pd.concat([_issn_listed, _issn_linking]).drop_duplicates()

In [51]:
# Preview the (id, issn) table.
_issns

Unnamed: 0,id,issn
0,157242733,1876-214X
1,2765070212,1176-306X
2,2765070212,2324-3740
3,2764771425,0045-7736
4,2736747913,2442-8868
...,...,...
124339,2764951900,0387-1185
124567,4210232958,2297-2633
128100,4306512670,1025-3076
132798,4210212177,1767-4603


In [52]:
# Attach a numeric source `id` to every (issn, year, best_q) row.
# Reuses `qdf`, which already holds the q-by-year table, instead of calling
# get_best_q_by_year() again and downloading the CSV a second time.
q_matched_df = (
    qdf
    .select(
        [
            pl.col(_isc),
            pl.col("year").cast(pl.UInt16).alias(puby),
            # Drop the first character of best_q and keep the rest as a small
            # integer (presumably values look like "Q1".."Q4" -- confirm).
            pl.col("best_q").str.slice(1, None).cast(pl.UInt8),
        ]
    )
    # Inner join: rows whose ISSN has no match in _issns are dropped.
    .join(pl.from_pandas(_issns).select(["id", _isc]), on=_isc)
    .drop(_isc)
)

In [None]:
# Stream the works table in 1M-row chunks, keeping only id and publication
# year, and collect each cleaned chunk as a polars frame.
w_dfs = []
_works_chunks = pd.read_csv(
    os.environ["OA_ROOT"] + "/entity-csvs/works/main.csv.gz",
    chunksize=1_000_000,
    usecols=["id", puby],
)
for wdf in tqdm(_works_chunks):
    # Rows missing either column are discarded before the ids are parsed.
    _complete = wdf.dropna()
    w_dfs.append(
        pl.from_pandas(
            _complete.assign(id=parse_id(_complete["id"])),
            schema_overrides={puby: pl.UInt16},
        )
    )

100it [03:11,  1.98s/it]

In [None]:
# Stack the per-chunk polars frames collected in w_dfs into one frame.
full_ywdf = pl.concat(w_dfs)

In [57]:
# Sample of the locations table: only the first 100_000 rows are needed here.
# Reading them with `nrows` replaces the earlier "iterate in chunks and break
# after the first one" pattern, which left the chunked reader open and wrapped
# a single step in a progress bar.
lodf = pd.read_csv(
    os.environ["OA_ROOT"] + "/entity-csvs/works/locations.csv.gz",
    nrows=100_000,
    usecols=["parent_id", "source"],
)

0it [00:00, ?it/s]


In [48]:
# Sanity check on the sample: drop rows missing either column, then run
# parse_id over each column (DataFrame.apply passes one column at a time).
lodf.dropna().apply(parse_id)

Unnamed: 0,parent_id,source
10,2027675404,4210224251
11,2027678827,186920367
12,2027678827,4306525036
13,2027686561,154037165
14,2027694283,4210203914
...,...,...
99995,3009967627,4210220205
99996,3009992470,59624048
99997,3009992470,4306525036
99998,3010000965,133005937


In [41]:
# NOTE(review): the `_issns` frame built earlier in this notebook has only
# `id` and `issn` columns, so looking up "openalex" presumably raises KeyError
# on a fresh run; the saved output looks like it came from an older `_issns`.
# Confirm, then drop or update this cell.
_issns["openalex"].pipe(parse_id)

0          157242733
1         2765070212
2         2765070212
3         2764771425
4         2736747913
             ...    
124339    2764951900
124567    4210232958
128100    4306512670
132798    4210212177
133258    4220651409
Name: openalex, Length: 210838, dtype: uint64

In [31]:
# Preview the matched frame.
# NOTE(review): the saved output below shows `year` / `openalex` columns, while
# the cell that builds q_matched_df aliases `year` to publication_year and
# joins in `id`. The output looks stale -- re-run to refresh.
q_matched_df

year,best_q,openalex
u16,u8,str
1950,3,"""https://openal…"
1950,3,"""https://openal…"
1950,1,"""https://openal…"
1950,1,"""https://openal…"
1950,4,"""https://openal…"
1950,4,"""https://openal…"
1950,1,"""https://openal…"
1950,1,"""https://openal…"
1950,1,"""https://openal…"
1950,1,"""https://openal…"
