In [None]:
import polars as pl
import plotly.express as px

# Storage options forwarded to every `pl.read_parquet(..., storage_options=ops)`
# call below; they point the S3 client at the Yandex Cloud endpoint.
# NOTE(review): confirm that the storage backend recognises these exact key
# names -- unrecognised keys may be ignored rather than rejected.
ops = {
    "region_name": "ru-central1",
    "endpoint_url": "https://s3.yandexcloud.net",
    "service_name": "s3",
}

In [None]:
# Read every parquet file under the `citations` prefix; the resulting frame is
# the last expression, so it is rendered as the cell output.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/citations/*.parquet",
    storage_options=ops,
)

In [None]:
# Read every parquet file under the `unique_authors` prefix; the resulting
# frame is the last expression, so it is rendered as the cell output.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/unique_authors/*.parquet",
    storage_options=ops,
)

In [None]:
# Read every parquet file under the `unique_works` prefix; the resulting frame
# is the last expression, so it is rendered as the cell output.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/unique_works/*.parquet",
    storage_options=ops,
)

In [None]:
# Read every parquet file under the `unique_teams` prefix; the resulting frame
# is the last expression, so it is rendered as the cell output.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/unique_teams/*.parquet",
    storage_options=ops,
)

In [13]:
# Pie chart of works per concept, written to `works_by_concept.png`.
#
# A concept needs at least this many works to be kept in the chart. The
# caption below is built from the same constant so the two cannot drift apart.
MIN_WORKS_PER_CONCEPT = 75_000

df = pl.read_parquet(
    "s3://openalex/rich-sample-1-stats/works_by_concept/*.parquet",
    storage_options=ops,
)

# Inclusive bound: the caption states that only concepts with *less than* the
# threshold are dropped, so a concept sitting exactly on it has to be kept.
df_filtered = df.filter(pl.col("works") >= MIN_WORKS_PER_CONCEPT)

fig = px.pie(
    df_filtered.to_pandas(),
    names="concept",
    values="works",
    title="Works Distribution by concepts covered",
)

# Caption placed below the plot area (paper coordinates, y < 0).
fig.add_annotation(
    text=(
        f"Concepts with less than {MIN_WORKS_PER_CONCEPT // 1_000}k "
        "publications are dropped"
    ),
    x=0.5,
    y=-0.1,
    showarrow=False,
    xref="paper",
    yref="paper",
    font=dict(size=12),
)

fig.write_image("works_by_concept.png", width=800, height=640)

In [3]:
# Pie chart of works per country, written to `works_by_country.png`.
df = pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/works_by_country/*.parquet",
    storage_options=ops,
)

# Rows below 50k works get the label "other"; all remaining rows keep their
# own country value.
country_or_other = (
    pl.when(pl.col("works") < 50_000)
    .then(pl.lit("other"))
    .otherwise(pl.col("country"))
    .alias("country_grouped")
)

# Sum the works per label so that "other" collapses into a single row.
df_filtered = (
    df.with_columns(country_or_other)
    .group_by("country_grouped")
    .agg(pl.col("works").sum().alias("works"))
)

fig = px.pie(
    data_frame=df_filtered.to_pandas(),
    names="country_grouped",
    values="works",
    title="Works Distribution by Country",
)

# Caption placed below the plot area (paper coordinates, y < 0).
fig.add_annotation(
    text='Countries with less than 50k publications are merged as "other".',
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.1,
    showarrow=False,
    font={"size": 12},
)

fig.write_image("works_by_country.png", width=800, height=640)

In [None]:
# Keep the 20 rows with the highest `works` value, largest first.
df = (
    pl.read_parquet(
        source="s3://openalex/rich-sample-1-stats/works_by_country/*.parquet",
        storage_options=ops,
    )
    .sort(by="works", descending=True)
    .head(n=20)
)

# Last expression of the cell: rendered as the output table.
df