In [1]:
from loguru import logger
import psycopg
from psycopg import Connection
from psycopg.sql import SQL
import tomli
from typing import Dict, Optional, Generator, List, TypedDict, TypeVar, Iterable, Callable, Any, Sequence
from pydantic import BaseModel
from pathlib import Path
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
import os

# Location of the TOML config file. The path is relative, so it is resolved
# against the working directory of the running kernel.
CONFIG_PATH = "../database/config.toml"

class DatabaseConfig(BaseModel):
    """Connection settings taken from the ``database`` section of the config.

    The dumped fields are turned into a libpq key/value connection string,
    so the attribute names double as libpq keywords.
    """
    dbname: str
    user: str
    # Defaults to None so a config file may omit the password entirely.
    # Without the default, pydantic v2 treats an Optional field as required
    # and validation fails before any environment fallback can be applied.
    password: Optional[str] = None

class Config(BaseModel):
    """Top-level shape of the parsed config: a single ``database`` section."""
    database: DatabaseConfig

In [2]:
def _quote_conninfo_value(value: object) -> str:
    """Render one value for a libpq ``keyword=value`` connection string.

    Plain values are returned unchanged. Empty values and values containing
    whitespace, single quotes or backslashes are wrapped in single quotes,
    with backslashes and single quotes escaped by a backslash.
    """
    text = str(value)
    needs_quotes = text == "" or any(ch.isspace() or ch in "'\\" for ch in text)
    if not needs_quotes:
        return text
    # Escape backslashes first so the backslashes added for quotes are not doubled.
    escaped = text.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def to_kv_str(d: Dict[str, Optional[str]]) -> str:
    """Convert a dictionary to a libpq key/value connection string.

    Args:
        d: Mapping of connection keyword to value. Entries whose value is
            ``None`` are skipped rather than being written as the text "None".

    Returns:
        Space-separated ``key=value`` pairs in the dictionary's iteration
        order; values are quoted only when libpq requires it.
    """
    return " ".join(
        f"{k}={_quote_conninfo_value(v)}" for k, v in d.items() if v is not None
    )

def postgres_env_password() -> Optional[str]:
    """Return the value of the ``PGPASSWORD`` environment variable.

    Returns:
        The variable's value, or ``None`` when it is not set.
    """
    return os.getenv("PGPASSWORD")

In [3]:
# Parse the TOML config; the file is opened in binary mode as tomli.load expects.
with Path(CONFIG_PATH).open("rb") as f:
    config_dict = tomli.load(f)
config_obj = Config(**config_dict)
# Fall back to the PGPASSWORD environment variable when the file provides no
# password (missing or empty).
if not config_obj.database.password:
    config_obj.database.password = postgres_env_password()
# https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING
# exclude_none drops a still-unset password so the connection string never
# carries the literal text "password=None".
conn_info = to_kv_str(config_obj.database.model_dump(exclude_none=True))

In [4]:
# Query returning a tag name and a post_count for tags linked to artists.
# It joins artists -> artist_tags_assoc -> tags -> view_artist_tag_no_comic.
# post_count is unqualified in the SELECT; presumably it comes from
# view_artist_tag_no_comic (the SQL-commented join shows tag_post_counts as the
# alternative source) -- TODO confirm.
# No aggregate is computed: the GROUP BY over every selected column plus the
# ids only collapses duplicate rows produced by the joins.
artist_post_count_query = """--sql
SELECT t.name    as tag_name,
       post_count
FROM booru.artists
         INNER JOIN booru.artist_tags_assoc ata on artists.id = ata.artist_id
         INNER JOIN booru.tags t on ata.tag_id = t.id
         -- INNER JOIN booru.tag_post_counts tpc on t.id = tpc.tag_id
         INNER JOIN booru.view_artist_tag_no_comic tpc on t.id = tpc.tag_id
GROUP BY t.id, artist_id, t.name, post_count;
"""

In [5]:
def get_df_by_sql(sql: str) -> pl.DataFrame:
    """Run ``sql`` and return the whole result set as a polars DataFrame.

    A new connection is opened from the module-level ``conn_info`` on every
    call and released when the ``with`` block exits.

    Args:
        sql: Statement to execute; it must produce a result set.

    Returns:
        DataFrame with one column per result column, named after the cursor's
        column descriptions, and one row per fetched record.
    """
    with psycopg.connect(conninfo=conn_info) as conn:
        with conn.cursor() as cur:
            cur.execute(sql)
            rows = cur.fetchall()
            assert cur.description is not None
            column_names = [desc[0] for desc in cur.description]
            # fetchall() yields one tuple per record, so state the orientation
            # explicitly. Left to inference, a result whose row count equals
            # its column count can be read column-wise and come out transposed.
            return pl.DataFrame(rows, schema=column_names, orient="row")

In [6]:
# Run the artist post-count query once; the resulting frame feeds the
# post-count histograms that follow.
artist_post_df = get_df_by_sql(artist_post_count_query)
# Summary statistics are the cell's displayed output.
artist_post_df.describe()

describe,tag_name,post_count
str,str,f64
"""count""","""336921""",336921.0
"""null_count""","""0""",0.0
"""mean""",,19.785715
"""std""",,78.637323
"""min""","""!?_(krkrgzgz)""",1.0
"""max""","""zzzzzzzzzzzzzz...",5424.0
"""median""",,3.0


In [7]:
# Low end of the distribution: artists with fewer than 50 posts.
post_count_less_n = artist_post_df.filter(pl.col("post_count") < 50)
fig = px.histogram(post_count_less_n, x="post_count")
fig.update_layout(
    # The title states the filter actually applied (it previously claimed "< 100").
    title="Post Count Distribution (Post Count < 50)",
    xaxis_title="Post Count",
    yaxis_title="Number of Artists",
)
fig.show()

In [8]:
# Middle of the distribution. The lower bound is inclusive so that artists
# with exactly 50 posts, which the `< 50` histogram leaves out, are counted here.
post_count_less_n = artist_post_df.filter(
    (pl.col("post_count") >= 50) & (pl.col("post_count") < 1000)
)
fig = px.histogram(post_count_less_n, x="post_count")
fig.update_layout(
    # The title states the filter actually applied (it previously claimed "< 100").
    title="Post Count Distribution (50 <= Post Count < 1000)",
    xaxis_title="Post Count",
    yaxis_title="Number of Artists",
)
fig.show()

In [9]:
# Tail of the distribution. The lower bound is inclusive so that artists with
# exactly 1000 posts, which the `< 1000` range leaves out, are counted here.
post_count_less_n = artist_post_df.filter(
    (pl.col("post_count") >= 1000) & (pl.col("post_count") < 6000)
)
fig = px.histogram(post_count_less_n, x="post_count")
fig.update_layout(
    # The title states the filter actually applied (it previously claimed "< 100").
    title="Post Count Distribution (1000 <= Post Count < 6000)",
    xaxis_title="Post Count",
    yaxis_title="Number of Artists",
)
fig.show()

In [31]:
# Candidate artist tag names; the analysis below runs on one of them.
artists_name = [
    "hiten_(hitenkei)",
    "hagi_(ame_hagi)",
    "shion_(mirudakemann)",
    "as109",
    "toosaka_asagi",
    "kantoku",
    "niliu_chahui",
    "atdan",
    "himitsu_(hi_mi_tsu_2)",
    "lm7_(op-center)",
]

# Artist under inspection: the last entry of the list.
artist_name = artists_name[-1]
# Compose the statement with psycopg.sql instead of an f-string: Literal
# renders the name as a correctly escaped SQL literal, so a name containing a
# single quote no longer breaks (or alters) the statement. cursor.execute
# accepts the composed object in place of a plain string.
artist_posts_query = SQL("""--sql
WITH a_tag AS (SELECT booru.get_tag_id_by_artist_name({artist_name}) AS id)
SELECT p.id as post_id, p.created_at, p.fav_count, p.score, p.width, p.height
FROM booru.posts_tags_assoc pta
         JOIN booru.posts p ON pta.post_id = p.id
         JOIN a_tag ON pta.tag_id = a_tag.id;
""").format(artist_name=psycopg.sql.Literal(artist_name))

artist_posts_df = get_df_by_sql(artist_posts_query)
artist_posts_df.describe()

describe,post_id,created_at,fav_count,score,width,height
str,f64,str,f64,f64,f64,f64
"""count""",436.0,"""436""",436.0,436.0,436.0,436.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,0.0
"""mean""",2566700.0,,27.738532,17.318807,1680.245413,1735.786697
"""std""",1453800.0,,18.731386,13.80527,1102.820689,1321.3025
"""min""",411561.0,"""2009-02-27 21:...",1.0,0.0,560.0,492.0
"""max""",6636437.0,"""2023-08-29 21:...",129.0,115.0,8000.0,8488.0
"""median""",2650989.0,,23.0,15.0,1400.0,1274.0


In [32]:
# Add the calendar year of each post's creation time as its own column.
artist_df_with_year = artist_posts_df.with_columns(
    pl.col("created_at").dt.year().alias("year")
)
# Histogram of posts per year for the selected artist; update_layout returns
# the same figure, so the calls are chained.
fig = px.histogram(artist_df_with_year, x="year").update_layout(
    title=f"{artist_name} Post Count by Year",
    xaxis_title="Year",
    yaxis_title="Number of Posts",
)
fig.show()

In [33]:
# Bucket boundaries as (numeric width/height ratio, display text) pairs, ascending.
ar_buckets = [
    (9/21, "9/21"),
    (9/16, "9/16"),
    (3/4, "3/4"),
    (1, "1"),
    (4/3, "4/3"),
    (16/9, "16/9"),
    (21/9, "21/9"),
]

# Bucket labels in ascending order; also used as the x-axis category order.
ar_buckets_ranges = [
    "(-inf, 9/21]",
    "(9/21, 9/16]",
    "(9/16, 3/4]",
    "(3/4, 1)",
    "1",
    "(1, 4/3]",
    "(4/3, 16/9]",
    "(16/9, 21/9]",
    "(21/9, inf)",
]

aspect_ratio = pl.col("aspect_ratio")
artist_df_with_ar = artist_posts_df.with_columns(
    (pl.col("width") / pl.col("height")).alias("aspect_ratio")
).with_columns(
    # The first matching `when` wins, so each branch only needs its upper
    # bound; the lower bound is implied by the preceding branches not matching.
    # Labels are wrapped in pl.lit because polars parses a bare string passed
    # to then()/otherwise() as a column name, not as a string value.
    pl.when(aspect_ratio <= ar_buckets[0][0])
    .then(pl.lit(ar_buckets_ranges[0]))
    .when(aspect_ratio <= ar_buckets[1][0])
    .then(pl.lit(ar_buckets_ranges[1]))
    .when(aspect_ratio <= ar_buckets[2][0])
    .then(pl.lit(ar_buckets_ranges[2]))
    .when(aspect_ratio < ar_buckets[3][0])
    .then(pl.lit(ar_buckets_ranges[3]))
    # Exactly square images get their own bucket.
    .when(aspect_ratio == ar_buckets[3][0])
    .then(pl.lit(ar_buckets_ranges[4]))
    .when(aspect_ratio <= ar_buckets[4][0])
    .then(pl.lit(ar_buckets_ranges[5]))
    .when(aspect_ratio <= ar_buckets[5][0])
    .then(pl.lit(ar_buckets_ranges[6]))
    .when(aspect_ratio <= ar_buckets[6][0])
    .then(pl.lit(ar_buckets_ranges[7]))
    .otherwise(pl.lit(ar_buckets_ranges[8]))
    .alias("aspect_ratio_bucket")
)

# category_orders keeps the bars in ascending bucket order rather than in
# order of first appearance in the data.
fig = px.histogram(artist_df_with_ar, x="aspect_ratio_bucket",
                   category_orders={"aspect_ratio_bucket": ar_buckets_ranges})
fig.update_layout(
    title=f"{artist_name} Post Count by Aspect Ratio",
    xaxis_title="Aspect Ratio",
    yaxis_title="Number of Posts",
)