## Connect to database

In [2]:
%matplotlib widget
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import psycopg2
from scipy.signal import find_peaks

# Open the PostgreSQL connection shared by the query cells below.
# Database name, user, password and host are read from the environment
# (DB_NAME, DB_USER, DB_PASSWORD, DB_URL) so no credentials live in the
# notebook; a missing variable raises KeyError right here.
db = psycopg2.connect(dbname=os.environ['DB_NAME'],
                      user=os.environ['DB_USER'],
                      password=os.environ['DB_PASSWORD'],
                      host=os.environ['DB_URL'],
                      port=5432)

# The cells visible in this notebook only run SELECTs. In psycopg2's default
# transactional mode a single failed query leaves the connection in an
# aborted transaction, and every later cell then fails until a manual
# rollback. Autocommit makes each query independent, so a typo in one cell
# cannot poison the rest of the session.
db.autocommit = True

## Top names by year

In [45]:
# --- query parameters -------------------------------------------------------
country = 'us'
sex = 'm'
min_year = 1925
max_year = 1940          # a falsy value makes the query run through 2019
sort_by_year=1932        # year column used to rank the names
# Regex matched (case-insensitively, via ~*) against name.dmeta.
# NOTE(review): `|` binds loosest and the match is unanchored, so this pattern
# accepts any dmeta containing j, l, m or n - confirm that is the intent.
dmeta = '.*j|l|m|n.*'

# Births per name and year, both as a raw count and per 1000 births of the
# selected country/sex; names with 500 or fewer births in a year are dropped.
query = f'''
select
  y.year,
  n.name,
  sum(br.births) as number_births,
  sum(br.births) / (y.births_{country.lower()}_{sex.lower()}::float / 1000) as births_per_k
from public.birth_record br
inner join public.name n on n.id = br.name_id
inner join public.year y on y.year = br.year_id
where
  br.sex = '{sex.upper()}' and
  y.year between {int(min_year)} and {int(max_year) if max_year else 2019} and
  br.country = '{country.lower()}' and
  n.dmeta ~* '{dmeta}'
group by 1, 2
having sum(br.births) > 500
;'''

# Reshape to one row per name and one column per year, then rank the names by
# their births-per-1000 in `sort_by_year`, most popular first.
df = (
    pd.read_sql_query(query, db)
    .round({'births_per_k': 2})
    .pivot(index='name', columns='year', values='births_per_k')
    .sort_values(by=[sort_by_year], ascending=False)
)

df.head(50)

year,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
James,45.75,46.45,46.19,46.23,47.06,47.75,47.88,48.33,50.09,51.09,51.43,51.38,51.71,52.0,52.62,52.67
John,49.68,48.98,48.16,47.1,47.05,46.42,45.27,45.25,45.31,44.01,44.41,44.79,44.57,44.45,45.02,46.18
William,46.29,45.33,44.31,43.04,43.16,41.84,40.46,39.2,38.28,37.83,37.59,37.01,37.23,37.33,37.13,37.74
Charles,25.69,25.77,27.49,27.38,27.02,28.22,27.97,28.94,28.42,28.06,28.03,27.33,27.91,27.4,27.17,26.71
Donald,16.67,18.26,20.52,22.52,24.54,25.72,27.21,27.72,27.88,28.64,27.73,26.91,25.48,23.46,21.23,19.48
George,23.07,22.61,22.33,21.38,20.78,20.17,19.34,19.02,18.0,17.42,17.35,17.1,16.85,16.22,15.78,15.44
Joseph,21.38,20.74,20.23,19.55,19.11,18.57,18.07,17.28,16.82,16.11,16.07,16.4,15.84,15.61,15.63,15.35
Thomas,14.49,14.49,14.65,14.53,14.7,15.06,15.75,15.67,16.17,16.36,16.53,17.23,18.08,18.81,19.53,20.22
Paul,12.31,11.92,11.71,11.53,11.22,11.47,11.45,11.23,11.62,11.49,11.74,11.56,11.78,11.75,11.77,11.89
Kenneth,8.81,9.03,9.34,9.79,9.79,10.36,10.81,10.93,11.18,11.11,11.28,11.73,12.08,12.67,12.98,12.76


## Trend over time

In [54]:
# --- query parameters -------------------------------------------------------
country = 'us'
sex = 'f'
min_year = 1930
max_year = None  # falsy -> the query runs through 2019

# Candidate names to chart, keyed by sex. Only the uncommented entries for the
# selected `sex` are queried; the commented ones are kept as a shortlist.
names = {
    'm': [
        # 'liam', # most popular current name
        # 'noah', # most popular current name
        'asa',
        'brooks',
        'gray',
        'jones',
        'julian',
        'reed',
        # 'bruce',
        # 'charlie',
        # 'elliot',
        # 'ellis',
        # 'emmett',
        # 'finn',
        # 'grady',
        # 'griffin',
        # 'henry',
        # 'lincoln',
        # 'maddox',
        # 'nash',
        # 'oliver',
        # 'oscar',
        # 'owen',
        # 'ronan',
        # 'rowan',
    ],
    'f': [
        # 'emma', # most popular current name
        # 'olivia', # most popular current name
        'cassidy',
        'elsie',
        'josie',
        'june',
        'lily',
        'quinn',
        # 'bailey',
        # 'charlotte',
        # 'collette',
        # 'devi',
        # 'esme',
        # 'grace',
        # 'harper',
        # 'iris',
        # 'lane',
        # 'lou',
        # 'luna',
        # 'maia',
        # 'nelle',
        # 'piper',
        # 'rose',
        # 'ruby',
        # 'ruth',
        # 'sadie',
        # 'stevie',
    ]
}

# --- build and run the query ------------------------------------------------
# The query compares n.name against the title-cased form of each entry.
selected_names = tuple(name.title() for name in names[sex])
if not selected_names:
    # An empty tuple would render as `in ()`, which is a SQL syntax error.
    raise ValueError(f'no names selected for sex={sex!r}')
if not (country.isalpha() and sex.isalpha()):
    # country/sex are spliced into a column identifier below, which cannot be
    # sent as a bound parameter, so restrict them to plain letters.
    raise ValueError('country and sex must be alphabetic')

# Values are passed as bound parameters (psycopg2 renders the tuple as an
# IN list) so a name containing an apostrophe cannot break the statement.
query = f'''
select
  y.year,
  n.name,
  sum(br.births) as births,
  sum(br.births) / (y.births_{country.lower()}_{sex.lower()}::float / 1000) as births_per_k
from public.birth_record br
inner join public.name n on n.id = br.name_id
inner join public.year y on y.year = br.year_id
where
  br.country = %(country)s and
  br.sex = %(sex)s and
  y.year between %(min_year)s and %(max_year)s and
  n.name in %(names)s
group by 1, 2
having sum(br.births) > 50
;'''
query_params = {
    'country': country.lower(),
    'sex': sex.upper(),
    'min_year': int(min_year),
    'max_year': int(max_year) if max_year else 2019,
    'names': selected_names,
}

# One row per year, one column per name, values in births per 1000.
df = (
    pd.read_sql_query(query, db, params=query_params)
    .round({'births_per_k': 2})
    .pivot(index='year', columns='name', values='births_per_k')
    .sort_values(by=['year'], ascending=True)
)
# Order the columns by each name's maximum so the biggest series come first.
maxes = df.max().sort_values(ascending=False).index
df = df.loc[:, maxes]

# --- plot -------------------------------------------------------------------
title = f'Births per 1000, by year ({country.upper()}, {sex.upper()})'
max_series_per_plot = 8
num_cols = len(df.columns)
if num_cols == 0:
    # Previously this fell through both plotting branches and died with a
    # NameError on `fig`; fail with a message that says what happened.
    raise ValueError('query returned no rows; nothing to plot')

# Spread the series evenly over as few stacked panels as possible, with at
# most `max_series_per_plot` lines per panel.
chunks = int(np.ceil(num_cols / max_series_per_plot))
chunk_size = int(np.ceil(num_cols / chunks))
fig, axes = plt.subplots(nrows=chunks, ncols=1, figsize=(10, 4 * chunks), squeeze=False)
axes = axes.ravel()  # squeeze=False always yields a 2-D array; flatten it

if chunks == 1:
    # NOTE(review): the single-panel layout hides the x axis (the years),
    # exactly as before - confirm this is wanted for a trend chart.
    axes[0].get_xaxis().set_visible(False)
    df.plot.line(ax=axes[0])
else:
    for panel_ax, start in zip(axes, range(0, num_cols, chunk_size)):
        df.iloc[:, start:start + chunk_size].plot.line(ax=panel_ax)
        panel_ax.legend(loc='upper left')

fig.suptitle(title)
# Hide the interactive widget chrome around the figure canvas.
fig.canvas.toolbar_visible = False
fig.canvas.header_visible = False
fig.canvas.footer_visible = False

plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Find peaks

In [7]:
# --- query parameters -------------------------------------------------------
country = 'us'
sex = 'f'
min_year = 1930
max_year = None  # falsy -> the query runs through 2019
dmeta = '.*j|l|m|n.*'  # regex matched case-insensitively against name.dmeta

# Births per name and year (raw and per 1000 births of the selected
# country/sex); name/year pairs with 50 or fewer births are dropped, which is
# why the pivoted frame below contains NaN for some years.
query = f'''
select
  y.year,
  n.name,
  sum(br.births) as births,
  sum(br.births) / (y.births_{country.lower()}_{sex.lower()}::float / 1000) as births_per_k
from public.birth_record br
inner join public.name n on n.id = br.name_id
inner join public.year y on y.year = br.year_id
where
  br.country = '{country.lower()}' and
  br.sex = '{sex.upper()}' and
  y.year between {int(min_year)} and {int(max_year) if max_year else 2019} and
  n.dmeta ~* ('{dmeta}')
group by 1, 2
having sum(br.births) > 50
;'''

# One row per year, one column per name, values in births per 1000.
df = (
    pd.read_sql_query(query, db)
    .round({'births_per_k': 2})
    .pivot(index='year', columns='name', values='births_per_k')
    .sort_values(by='year', ascending=True)
)
# Rank the name columns by their highest value in any year and keep the
# first 100 of them.
maxes = df.max().sort_values(ascending=False).index
df = df.loc[:, maxes].iloc[:, :100]
df.head(50)

name,Linda,Mary,Jennifer,Shirley,Lisa,Jessica,Ashley,Susan,Carol,Amanda,...,Charlotte,Alyssa,Amelia,Phyllis,Michele,Wendy,Andrea,Brianna,April,Bonnie
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1930,0.42,54.99,,12.67,,,,0.86,3.94,0.17,...,2.92,,0.58,6.66,,,0.12,,,2.8
1931,0.48,54.63,,13.05,,,,0.91,4.43,0.19,...,3.08,,0.54,6.53,,,0.08,,,3.22
1932,0.7,54.12,,13.09,,,,1.11,5.94,0.19,...,3.04,,0.58,6.61,,,0.11,,,3.27
1933,0.75,53.07,,13.69,,,,1.21,7.86,0.19,...,2.98,,0.58,6.8,,,0.15,,,3.22
1934,0.92,52.6,,21.1,,,,1.38,9.49,0.17,...,3.17,,0.51,6.63,,,0.12,,,3.47
1935,1.1,50.67,,38.98,,,,1.66,10.79,0.2,...,3.19,,0.56,6.78,,,0.12,,,3.32
1936,2.26,50.46,,32.63,,,,2.0,13.48,0.18,...,3.3,,0.54,6.84,,0.06,0.14,,,3.13
1937,3.98,50.5,,24.34,0.05,0.07,,2.4,15.73,0.16,...,3.21,,0.59,6.57,,0.07,0.16,,,3.26
1938,6.17,49.25,0.04,20.82,0.07,,,3.11,17.02,0.17,...,3.18,,0.5,6.7,0.05,0.11,0.37,,,3.3
1939,9.45,48.41,0.05,18.03,0.07,0.07,,4.31,17.78,0.16,...,3.3,,0.43,6.69,0.05,0.2,0.57,,0.05,3.43


In [29]:
# Peak detection for a single name: the column at position 4 of `df`.
df2 = df.iloc[:, 4]

# A peak must reach at least the 66th percentile of the years that actually
# have a value for this name.
min_peak_height = np.percentile(df2.dropna(), 66)

# scipy documents that find_peaks may return unexpected results when the data
# contains NaN. The NaNs here are years the query filtered out for having too
# few births, so they are treated as 0 for detection only; df2 itself is left
# untouched so the reported peak values are the real ones.
# prominence/distance are forwarded unchanged (distance is counted in rows,
# i.e. years as long as the index has no gaps).
peaks = find_peaks(df2.fillna(0), prominence=0.05, distance=15, height=min_peak_height)

# peaks[0] holds the positional indices of the peaks; map them back to the
# year-indexed values.
df3 = df2.iloc[peaks[0]]
df3

year
1965    32.98
Name: Lisa, dtype: float64

In [None]:
def _find_peaks(series, prominence=0.05, distance=15, height_percentile=66):
    """Return the values of ``series`` at its detected peaks.

    Written to be handed to ``DataFrame.apply``, which calls it once per
    column with that column as a Series. The defaults are the thresholds used
    by the single-column peak cell earlier in this notebook.

    Parameters
    ----------
    series : pandas.Series
        Values in index order; NaN marks a missing observation.
    prominence : float
        Minimum peak prominence, forwarded to ``scipy.signal.find_peaks``.
    distance : int
        Minimum number of rows between peaks, forwarded to ``find_peaks``.
    height_percentile : float
        A peak must be at least this percentile of the non-missing values.

    Returns
    -------
    pandas.Series
        The entries of ``series`` (original index and values) at the peaks.
        Empty when no peak qualifies or when every value is missing.
    """
    observed = series.dropna()
    if observed.empty:
        # Nothing to take a percentile of; report "no peaks" for this column.
        return series.iloc[0:0]

    min_height = np.percentile(observed, height_percentile)

    # find_peaks is documented to behave unexpectedly on NaN input, so gaps
    # are treated as 0 for detection only; the returned values still come
    # from the untouched input series.
    filled = series.fillna(0).to_numpy(dtype=float)
    peak_positions, _ = find_peaks(filled, prominence=prominence,
                                   distance=distance, height=min_height)
    return series.iloc[peak_positions]

# DataFrame.apply hands each column of df to _find_peaks as a Series; keep the
# combined result under a name and display it as the cell output.
peaks_by_name = df.apply(_find_peaks)
peaks_by_name