In [1]:
import os
import polars as pl
import plotly.express as px
from dotenv import load_dotenv

# Pull ACCESS_KEY / SECRET_KEY from a local .env file into the process environment.
load_dotenv()


# Storage options handed to every polars S3 read below (Yandex Object Storage).
# Missing credentials fall back to an empty string rather than None.
# NOTE(review): "region_name" and "service_name" look like boto3-style keys;
# confirm that polars actually honours them in `storage_options`.
ops = {
    "region_name": "ru-central1",
    "endpoint_url": "https://s3.yandexcloud.net",
    "aws_access_key_id": os.getenv("ACCESS_KEY", "") or "",
    "aws_secret_access_key": os.getenv("SECRET_KEY", "") or "",
    "service_name": "s3",
}

In [None]:
# Preview the pre-computed citations statistics.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/citations/*.parquet",
    storage_options=ops,
)

In [None]:
# Preview the pre-computed unique-authors statistics.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/unique_authors/*.parquet",
    storage_options=ops,
)

In [None]:
# Preview the pre-computed unique-works statistics.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/unique_works/*.parquet",
    storage_options=ops,
)

In [None]:
# Preview the pre-computed unique-teams statistics.
pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/unique_teams/*.parquet",
    storage_options=ops,
)

In [None]:
# Pie chart of the number of works per concept, saved as works_by_concept.png.
df = pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/works_by_concept/*.parquet",
    storage_options=ops,
)

# Only concepts with more than 75,000 works are kept; the rest are dropped
# entirely (not merged), as the footnote on the figure states.
df_filtered = df.filter(pl.col("works").gt(75000))

fig = px.pie(
    data_frame=df_filtered.to_pandas(),
    names="concept",
    values="works",
    title="Works Distribution by concepts covered",
).add_annotation(
    # Footnote centred just below the plot area (paper coordinates).
    text="Concepts with less than 75k publications are dropped",
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.1,
    showarrow=False,
    font={"size": 12},
)

fig.write_image("works_by_concept.png", width=800, height=640)

In [None]:
# Pie chart of the number of works per country, saved as works_by_country.png.
df = pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/works_by_country/*.parquet",
    storage_options=ops,
)

# Countries below 50k works are relabelled "other" and then summed into a
# single slice, so the total number of works in the chart is preserved.
country_label = (
    pl.when(pl.col("works").lt(50_000))
    .then(pl.lit("other"))
    .otherwise(pl.col("country"))
    .alias("country_grouped")
)

df_filtered = (
    df.with_columns(country_label)
    .group_by("country_grouped")
    .agg(pl.col("works").sum().alias("works"))
)

fig = px.pie(
    data_frame=df_filtered.to_pandas(),
    names="country_grouped",
    values="works",
    title="Works Distribution by Country",
).add_annotation(
    # Footnote centred just below the plot area (paper coordinates).
    text='Countries with less than 50k publications are merged as "other".',
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.1,
    showarrow=False,
    font={"size": 12},
)

fig.write_image("works_by_country.png", width=800, height=640)

In [None]:
# Table of the 20 countries with the most works, largest first.
df = (
    pl.read_parquet(
        source="s3://openalex/rich-sample-1-stats/works_by_country/*.parquet",
        storage_options=ops,
    )
    .sort(by="works", descending=True)
    .head(n=20)
)

df

In [None]:
# Load the citation distribution and keep only the column the histogram needs.
df = pl.read_parquet(
    source="s3://openalex/rich-sample-1-stats/citations_distribution/*.parquet",
    storage_options=ops,
).select(pl.col("citation_count"))

In [None]:
# Histogram of citation counts with a marginal box plot, written to
# citation_distribution.png. The x axis is limited to [-5, 50].
citation_hist = px.histogram(
    data_frame=df.to_pandas(),
    x="citation_count",
    marginal="box",
    range_x=[-5, 50],
)
citation_hist.update_layout(
    title="Citation Counts",
    xaxis_title="Citations",
    yaxis_title="Count",
)
citation_hist.write_image("citation_distribution.png", width=800, height=640)

# 100k edges sample

In [34]:
def save_graph(
    df: pl.DataFrame,
    graph_path: str = "graph.csv",
    metadata_path: str = "metadata.csv",
    work_color: str = "#ffa630",
    author_color: str = "#0474BA",
) -> None:
    """Export an author-work edge list and a node colour table as CSV files.

    Two files are written:

    * ``graph_path`` - one row per edge, with columns ``target`` (taken from
      the ``work`` column) and ``source`` (taken from the ``author`` column).
    * ``metadata_path`` - one row per distinct node, with columns ``id`` and
      ``color``; all work rows come first, followed by all author rows.

    The defaults reproduce the original hard-coded behaviour, so
    ``save_graph(df)`` still writes ``graph.csv`` and ``metadata.csv`` in the
    current working directory. Pass explicit paths to keep several samples
    side by side instead of overwriting the same two files.

    Args:
        df: Frame with an ``author`` and a ``work`` column, one row per edge.
        graph_path: Destination of the edge-list CSV.
        metadata_path: Destination of the node-metadata CSV.
        work_color: Colour string assigned to every work node.
        author_color: Colour string assigned to every author node.
    """
    df.select(
        pl.col("work").alias("target"),
        pl.col("author").alias("source"),
    ).write_csv(graph_path)

    def _nodes(column: str, color: str) -> pl.DataFrame:
        # Distinct ids of one node kind, tagged with that kind's colour.
        # with_columns sizes the literal to the frame's height, so this does
        # not depend on how select() broadcasts a literal next to a column.
        return df.select(pl.col(column).unique().alias("id")).with_columns(
            pl.lit(color).alias("color")
        )

    pl.concat(
        [
            _nodes("work", work_color),
            _nodes("author", author_color),
        ]
    ).write_csv(metadata_path)

In [37]:
# Sample 100k author-work edges from the full edge list and export them.
df = (
    pl.scan_parquet(
        "s3://openalex/rich-sample-1-stats/graph/edges/*.parquet",
        storage_options=ops,
    )
    .collect()
    # Fixed seed: without it every run draws a different sample, so the
    # exported graph could not be regenerated with Restart & Run All.
    .sample(100_000, seed=42)
)

save_graph(df)

# 5k works sample

In [36]:
import networkx as nx

# Sample 5,000 works, keep every author of each sampled work, and export the
# resulting edges.
df = (
    pl.scan_parquet(
        "s3://openalex/rich-sample-1-stats/graph/edges/*.parquet",
        storage_options=ops,
    )
    .select("author", "work")
    # maintain_order ties the group order to the input row order; group_by
    # otherwise gives no ordering guarantee, which would make the seeded
    # sample below pick different works from run to run.
    .group_by("work", maintain_order=True)
    .agg(pl.col("author"))
    .collect()
    # Fixed seed so the same works are drawn on every run (assuming the
    # underlying parquet files are unchanged).
    .sample(5000, seed=42)
    # Back to one row per (work, author) edge, the shape save_graph expects.
    .explode(pl.col("author"))
)

save_graph(df)

In [None]:
import networkx as nx

# NOTE(review): this cell repeats the 5k-works sampling cell directly above
# it; consider deleting one of the two.
# Sample 5,000 works, keep every author of each sampled work, and export the
# resulting edges.
df = (
    pl.scan_parquet(
        "s3://openalex/rich-sample-1-stats/graph/edges/*.parquet",
        storage_options=ops,
    )
    .select("author", "work")
    # maintain_order ties the group order to the input row order; group_by
    # otherwise gives no ordering guarantee, which would make the seeded
    # sample below pick different works from run to run.
    .group_by("work", maintain_order=True)
    .agg(pl.col("author"))
    .collect()
    # Fixed seed so the same works are drawn on every run (assuming the
    # underlying parquet files are unchanged).
    .sample(5000, seed=42)
    # Back to one row per (work, author) edge, the shape save_graph expects.
    .explode(pl.col("author"))
)

save_graph(df)

# 5k most contributing authors

In [41]:
# Edges of the (up to) 5,000 authors with the most works, restricted to
# authors that have more than two works.
works_per_author = pl.col("work").list.len()

df = (
    pl.scan_parquet(
        source="s3://openalex/rich-sample-1-stats/graph/edges/*.parquet",
        storage_options=ops,
    )
    .select(["author", "work"])
    .group_by("author")
    .agg(pl.col("work"))
    .sort(works_per_author, descending=True)
    .filter(works_per_author.gt(2))
    .head(5_000)
    # Back to one row per (author, work) edge before materialising.
    .explode("work")
    .collect()
)

save_graph(df)

In [56]:
# Papers per 1,000 inhabitants for a hand-entered list of countries,
# sorted from the highest ratio to the lowest.
pl.DataFrame(
    [
        ["United States", 947_373, 340.1],
        ["China", 589_904, 1416.1],
        ["France", 247_986, 66.6],
        ["Japan", 220_210, 123.1],
        ["Germany", 208_337, 84.0],
        ["Great Britain", 198_686, 69.6],
        ["Italy", 140_737, 59.1],
        ["Canada", 106_002, 40.1],
        ["Spain", 99_589, 47.9],
        # Fixed: this row used to carry a stray fourth value (1_438), which
        # does not fit the three-column schema below. The float entry is kept
        # because it matches the format of the other population figures.
        # NOTE(review): confirm 1463.9 is the intended population for India.
        ["India", 97_118, 1463.9],
        ["Russia", 61_151, 144.0],
    ],
    schema=["Country", "Number of papers", "Population (mln)"],
    orient="row",
).with_columns(
    # papers / people * 1000, i.e. papers per thousand inhabitants.
    (
        pl.col("Number of papers")
        / (pl.col("Population (mln)") * 1_000_000)
        * 1_000
    ).alias("Papers per person (1e-3)")
).sort("Papers per person (1e-3)", descending=True).to_pandas()

Unnamed: 0,Country,Number of papers,Population (mln),Papers per person (1e-3)
0,France,247986,66.6,3.723514
1,Great Britain,198686,69.6,2.854684
2,United States,947373,340.1,2.785572
3,Canada,106002,40.1,2.643441
4,Germany,208337,84.0,2.480202
5,Italy,140737,59.1,2.381337
6,Spain,99589,47.9,2.079102
7,Japan,220210,123.1,1.788871
8,Russia,61151,144.0,0.42466
9,China,589904,1416.1,0.416569
