In [1]:
import polars as pl
from lets_plot import *

In [2]:
# Initialise lets-plot rendering for this notebook session.
# NOTE(review): no_js=True presumably requests static (non-JavaScript) plot
# output so figures render in viewers that strip scripts — confirm against
# the lets-plot documentation.
LetsPlot.setup_html(no_js=True)

In [3]:
# Load one headerless CSV per year (1880 through 2010), tag every row with
# the year it came from, and stack the per-year frames into a single table.
years = range(1880, 2011)
bb = [
    pl.read_csv(f"./data/yob{year}.csv", has_header=False).with_columns(year=year)
    for year in years
]

# Headerless reads yield column_1..column_3; give them meaningful names.
column_names = {"column_1": "name", "column_2": "sex", "column_3": "births"}
babynames = pl.concat(bb).rename(column_names)

In [4]:
# Sanity check: (rows, columns) of the combined frame.
babynames.shape

(1690784, 4)

In [5]:
# Preview the first rows to confirm the renamed columns and the added year.
babynames.head()

name,sex,births,year
str,str,i64,i32
"""Mary""","""F""",7065,1880
"""Anna""","""F""",2604,1880
"""Emma""","""F""",2003,1880
"""Elizabeth""","""F""",1939,1880
"""Minnie""","""F""",1746,1880


### Total births by sex and year

In [6]:
# Total number of births for every (year, sex) pair.
#
# group_by() does not guarantee the order of its output rows, so the result
# is sorted explicitly. A stable row order keeps displays reproducible and
# matters for plots that assign colours from a positional list
# (NOTE(review): lets-plot appears to order discrete values by first
# appearance in the data — confirm).
total_births = (
    babynames.group_by(["year", "sex"])
    .agg(count_births=pl.col("births").sum())
    .sort(["year", "sex"])
)

In [7]:
# Line chart of yearly birth totals, one coloured line per sex.
(
    ggplot(
        data=total_births,
        mapping=aes(x="year", y="count_births", color="sex"),
    )
    + geom_line(size=1)
    + labs(title="Total births by sex and year")
    + scale_color_manual(values=["darkgreen", "orange"])
    # NOTE(review): the "0.0f" format presumably prints years as plain
    # integers (no thousands separator) — confirm.
    + scale_x_continuous(format="0.0f")
    # Blank out the y-axis title.
    + scale_y_continuous(name="")
)

* Add a `prop` column: the fraction of babies given each name, relative to the total number of births for the same year and sex.

In [8]:
# prop = a name's births divided by the total births of its (year, sex) group.
babynames = babynames.with_columns(
    (
        pl.col("births") / pl.col("births").sum().over(["year", "sex"])
    ).alias("prop")
)

In [9]:
# Preview again to check the newly added prop column.
babynames.head()

name,sex,births,year,prop
str,str,i64,i32,f64
"""Mary""","""F""",7065,1880,0.077643
"""Anna""","""F""",2604,1880,0.028618
"""Emma""","""F""",2003,1880,0.022013
"""Elizabeth""","""F""",1939,1880,0.021309
"""Minnie""","""F""",1746,1880,0.019188


In [20]:
# Keep the 1000 most frequent names within every (year, sex) group.
#
# method="ordinal" gives each row a distinct rank (ties are broken by row
# order), so a group never keeps more than 1000 rows. The default
# method="average" assigns tied rows a shared, possibly fractional rank, so a
# run of ties straddling rank 1000 is kept or dropped as a whole and the group
# ends up with more or fewer than 1000 names.
top1000 = (
    babynames.with_columns(
        rn=pl.col("births")
        .rank(method="ordinal", descending=True)
        .over(["year", "sex"])
    )
    .filter(pl.col("rn") <= 1000)
    .drop("rn")  # the helper rank column is only needed for the filter
)

In [21]:
# Preview the filtered frame; the columns are unchanged from babynames.
top1000.head()

name,sex,births,year,prop
str,str,i64,i32,f64
"""Mary""","""F""",7065,1880,0.077643
"""Anna""","""F""",2604,1880,0.028618
"""Emma""","""F""",2003,1880,0.022013
"""Elizabeth""","""F""",1939,1880,0.021309
"""Minnie""","""F""",1746,1880,0.019188


In [22]:
# Births per (year, name) among the top-1000 names. Grouping ignores the sex
# column, so a name present for both sexes in a year is summed into one row.
#
# group_by() does not guarantee output row order; sorting makes the frame
# (and anything derived from its row order, such as colour assignment in
# plots) reproducible from run to run.
total_births_top1000 = (
    top1000.group_by(["year", "name"])
    .agg(total_births=pl.col("births").sum())
    .sort(["year", "name"])
)

In [23]:
# Restrict the yearly totals to a handful of names to plot.
selected_names = total_births_top1000.filter(
    pl.col("name").is_in(["John", "Harry", "Mary", "Marilyn"])
)

In [39]:
# One stacked panel per selected name, each with its own y-axis range, showing
# the number of births per year.
(
    ggplot(
        data=selected_names,
        mapping=aes(x="year", y="total_births", color="name"),
    )
    # The legend is hidden: every panel is already labelled by its facet.
    + geom_line(size=1, show_legend=False)
    + facet_wrap(facets="name", ncol=1, scales="free_y")
    + labs(title="Number of births each year", y="")
    + scale_x_continuous(format="0.0f")
    + ggsize(600, 800)
)

* Measuring the increase in naming diversity