# IGR2023 - BABY VISUALISATION <br>Theme 3: Gender

In [1]:
import pandas as pd
import numpy as np

import altair as alt
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
alt.data_transformers.enable('json') # Let Altair/Vega-Lite work with large data sets

from itertools import product
import ipywidgets as widgets
from IPython.display import display, clear_output

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# Load the raw per-departement file and give the columns English names
names = pd.read_csv("../data/dpt2020.csv", sep=";").rename(
    columns={'annais': 'year', 'nombre': 'count', 'sexe': 'gender'}
)

# Rows with an unknown birth year are set aside (before any filtering) for later inspection
pb_years = names[names.year == 'XXXX']

# Keep only identified names with a known year, then make year/gender numeric
is_rare_bucket = names.preusuel == '_PRENOMS_RARES'
is_undated = names.year == 'XXXX'
names = names[~is_rare_bucket & ~is_undated]
names = names.astype({'year': int, 'gender': int})

# Group data on all of France
# births: total count per (year, gender); gender 1 is stored in births_m, gender 2 in births_f
births = (
    names.groupby(['year', 'gender'])['count']
    .sum()
    .reset_index()
    .rename(columns={'count': 'births'})
)
births_m = births[births.gender == 1]
births_f = births[births.gender == 2]

# names_france: per-name yearly count summed over all departements
names_france = names.groupby(['gender', 'preusuel', 'year'])['count'].sum().reset_index()
names_france_m = names_france[names_france.gender == 1]
names_france_f = names_france[names_france.gender == 2]

# 0/ Explore the data

In [None]:

names.head()

Unnamed: 0,gender,preusuel,year,dpt,count
10885,1,AADIL,1983,84,3
10886,1,AADIL,1992,92,3
10888,1,AAHIL,2016,95,3
10892,1,AARON,1962,75,3
10893,1,AARON,1976,75,3


In [None]:
names_france.head()

Unnamed: 0,gender,preusuel,year,count
0,1,AADIL,1983,3
1,1,AADIL,1992,3
2,1,AAHIL,2016,3
3,1,AARON,1962,3
4,1,AARON,1976,3


In [None]:
# Summary statistics and schema of the per-departement and national tables.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly: wrapping it in print() added a stray "None" line to the output.
print(names.describe())
names.info()
print(names_france.describe())
names_france.info()

             gender          year         count
count  3.668274e+06  3.668274e+06  3.668274e+06
mean   1.536004e+00  1.973444e+03  2.079637e+01
std    4.987021e-01  3.393711e+01  5.648752e+01
min    1.000000e+00  1.900000e+03  3.000000e+00
25%    1.000000e+00  1.948000e+03  4.000000e+00
50%    2.000000e+00  1.980000e+03  7.000000e+00
75%    2.000000e+00  2.003000e+03  1.800000e+01
max    2.000000e+00  2.020000e+03  6.310000e+03
<class 'pandas.core.frame.DataFrame'>
Index: 3668274 entries, 10885 to 3727550
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   gender    int32 
 1   preusuel  object
 2   year      int32 
 3   dpt       object
 4   count     int64 
dtypes: int32(2), int64(1), object(2)
memory usage: 139.9+ MB
None
              gender           year          count
count  257346.000000  257346.000000  257346.000000
mean        1.535959    1975.738683     296.436630
std         0.498706      34.159966    1372.815209
min         1.000000    1900.000

In [None]:
births

Unnamed: 0,year,gender,births
0,1900,1,167115
1,1900,2,223390
2,1901,1,185404
3,1901,2,243254
4,1902,1,193775
...,...,...,...
237,2018,2,239049
238,2019,1,267049
239,2019,2,236676
240,2020,1,258143


In [None]:
# Quantify the rows whose birth year is unknown ('XXXX'); they were set aside in pb_years at load time
print(f"{len(pb_years):,} rows with year='XXXX'")
print(f"{len(pb_years['preusuel'].unique()):,} unique names in pb_years")
undated_births = pb_years['count'].sum()
# Share of undated births among (dated births in names_france + undated births).
# NOTE(review): pb_years was taken before the '_PRENOMS_RARES' rows were dropped, whereas
# names_france excludes them, so numerator and denominator are not filtered the same way.
print(f'{undated_births:,} births in non identified years ({100*undated_births/(names_france["count"].sum()+ undated_births):.2f}%)')

# Put the male and female yearly totals side by side, joined on year
births_m_f = births_m.merge(births_f, on='year', suffixes=('_m', '_f'))
births_m_f

37,244 rows with year='XXXX'
35,011 unique names in pb_years
8,667,245 births in non identified years (10.20%)


Unnamed: 0,year,gender_m,births_m,gender_f,births_f
0,1900,1,167115,2,223390
1,1901,1,185404,2,243254
2,1902,1,193775,2,247075
3,1903,1,196698,2,246930
4,1904,1,203344,2,250371
...,...,...,...,...,...
116,2016,1,284287,2,250625
117,2017,1,276384,2,244244
118,2018,1,271621,2,239049
119,2019,1,267049,2,236676


# 1/ Functions to get top names and corresponding data

In [None]:
# Helpers that select the `top` most given names over a period and the matching yearly data

def get_names_top(names,start, end, top):
    """Rank the most given names over the inclusive year range [start, end].

    Parameters
    ----------
    names : DataFrame with at least the columns 'preusuel', 'year' and 'count'.
    start, end : first and last year of the period (both included).
    top : number of names to keep.

    Returns
    -------
    top_list : list of the `top` names, most given first.
    top_count : DataFrame indexed by 'preusuel' with the columns
        'count' (total over the period), 'average_count' (total divided by
        the number of years in the period) and 'label'
        ("<NAME> - <truncated average> births").
    """
    in_period = names.loc[names.year.between(start, end)]
    totals = in_period.groupby(['preusuel'])['count'].sum().reset_index()
    ranking = totals.sort_values(by='count', ascending=False).head(top).reset_index(drop=True)
    top_list = ranking['preusuel'].tolist()

    # Average number of births per year of the period
    n_years = end - start + 1
    ranking['average_count'] = ranking['count'] / n_years

    # The legend label shows the average truncated to an integer, with thousands separators
    whole_average = ranking['average_count'].astype(int).map('{:,}'.format)
    ranking['label'] = ranking['preusuel'] + ' - ' + whole_average + ' births'

    top_count = ranking.set_index('preusuel').sort_values(by='average_count', ascending=False)
    return top_list, top_count

def get_data_top(names, start, end, top, gender = 'all'):
    """Return the full yearly history of the top names of a period.

    The ranking is computed by get_names_top over [start, end], but the rows
    returned cover every year available for the selected names.

    Parameters
    ----------
    names : DataFrame with the columns 'gender', 'preusuel', 'year', 'count'.
    start, end : first and last year (inclusive) used to rank the names.
    top : number of names to keep.
    gender : 'male' (rows with gender == 1), 'female' (rows with gender == 2);
        any other value keeps all rows and prints a notice.

    Returns
    -------
    DataFrame: the rows of the selected names, with two extra columns
    'average_count' and 'label' copied from the ranking, sorted by
    'average_count' in descending order.
    """
    if gender == 'male':
        data = names[names['gender']== 1]
    elif gender == 'female':
        data = names[names['gender']== 2]
    else:
        print("no gender selected, all names are returned (chose 'male' or 'female' to filter on gender)")
        data = names

    # get top names for period
    top_list, top_count = get_names_top(data, start, end, top)

    # Filter on the top names; .copy() gives an independent frame so the column
    # assignments below do not write into a slice of the caller's data
    # (this is what triggered SettingWithCopyWarning).
    data = data[data.preusuel.isin(top_list)].copy()

    # top_count is indexed by name, so a vectorised map replaces the row-by-row lookup
    data['average_count'] = data['preusuel'].map(top_count['average_count'])
    data['label'] = data['preusuel'].map(top_count['label'])
    data = data.sort_values(by=['average_count'], ascending= False)

    return data


# Top 10 male and female names for the period 2010-2020.
# get_names_top returns (list of names, ranking table); the female ranking table is discarded.
top_list_m, top_count_m = get_names_top(names_france_m, 2010, 2020, 10)
print(top_list_m)
top_list_f, _ = get_names_top(names_france_f, 2010, 2020, 10)
print(top_list_f)

# Yearly data for the top 10 male names of 2010-2020.
# get_data_top filters on the names only, so the rows span all years, not just the ranking period.
data_m_top = get_data_top(names_france, 2010,2020, 10,'male')
data_m_top.head()

['GABRIEL', 'LUCAS', 'LOUIS', 'JULES', 'HUGO', 'NATHAN', 'LÉO', 'ADAM', 'ARTHUR', 'RAPHAËL']
['EMMA', 'JADE', 'LOUISE', 'CHLOÉ', 'MANON', 'LÉA', 'LINA', 'LOLA', 'ALICE', 'CAMILLE']


Unnamed: 0,gender,preusuel,year,count,average_count,label
39152,1,GABRIEL,1955,560,5156.454545,"GABRIEL - 5,156 births"
39126,1,GABRIEL,1929,1521,5156.454545,"GABRIEL - 5,156 births"
39128,1,GABRIEL,1931,1484,5156.454545,"GABRIEL - 5,156 births"
39129,1,GABRIEL,1932,1600,5156.454545,"GABRIEL - 5,156 births"
39130,1,GABRIEL,1933,1430,5156.454545,"GABRIEL - 5,156 births"


In [None]:
# top_count_m is already indexed by name (get_names_top calls set_index('preusuel')),
# so the total of a given name can be read directly with .loc
print(top_count_m.loc['GABRIEL', 'count'])
top_count_m

56721


Unnamed: 0_level_0,count,average_count,label
preusuel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GABRIEL,56721,5156.454545,"GABRIEL - 5,156 births"
LUCAS,54512,4955.636364,"LUCAS - 4,955 births"
LOUIS,50634,4603.090909,"LOUIS - 4,603 births"
JULES,47078,4279.818182,"JULES - 4,279 births"
HUGO,46546,4231.454545,"HUGO - 4,231 births"
NATHAN,46117,4192.454545,"NATHAN - 4,192 births"
LÉO,45319,4119.909091,"LÉO - 4,119 births"
ADAM,44384,4034.909091,"ADAM - 4,034 births"
ARTHUR,42633,3875.727273,"ARTHUR - 3,875 births"
RAPHAËL,42052,3822.909091,"RAPHAËL - 3,822 births"


# 2/ Visualisation of the top names by gender <br>1ER RENDU 

In [None]:
# Plot the top names of one selected period (the parameters below select 1900-1910):
# one line chart per gender, stacked vertically, with the period highlighted as a band.

# select parameters and get data
start_year, end_year, top = 1900 , 1910, 10
# one-row frame holding the period bounds; it feeds the highlighted band (mark_rect) of each chart
period = pd.DataFrame({'start': [start_year], 'end': [end_year]}, index=[0])
# fixed upper bound of the y axis, shared by both charts
max_count = 60000
# colors_f = ['fuchsia', 'purple', 'pink', 'salmon', 'red', 'maroon', 'orange', 'yellow', 'gold', 'peach']
# colors_m = ['blue', 'green', 'lightblue', 'teal', 'navy', 'turquoise', 'lime', 'olive', 'cyan', 'aqua']
colors_f = ['fuchsia', 'purple', 'red', 'coral', 'orange', 'yellow', 'pink' ,'gray', 'darkgray', 'brown' ]
colors_m = ['blue', 'green','navy',  'skyblue', 'limegreen', 'olive', 'green', 'teal', 'cyan', 'yellow', 'lightgray', 'gray', 'darkgray', 'black']
# dash patterns; only referenced by the commented-out strokeDash encoding of the boys chart
line_styles = [[0],[0],[0],[0], [0],[5, 2],[5, 2],[5, 2],[5, 2],[5, 2]]


# create the selection
# ------------------------------------------------
# Selection for clicking or dragging
# NOTE: `single` and `interval` are also read as globals by display_chart_count further down
single = alt.selection_point(encodings=['x'])
interval = alt.selection_interval(encodings=['x'])


# GIRLS
#------------------------------------------------
# plot the period selected
selection_f = alt.Chart(period).mark_rect(color='fuchsia', opacity=0.2).encode(
    x='start:Q',
    x2='end:Q',
    # title='Period selected'
)

# plot the top names
data_f = get_data_top(names_france, start_year, end_year, top,'female')
top_graph_f = alt.Chart(data_f).mark_line().encode(
    x=alt.X('year:Q', title='Year'), 
    y=alt.Y('count:Q', title='Count', scale=alt.Scale(domain=(0, max_count))), 
    # one colour per legend label; the scale domain is the labels in the order they appear in data_f
    color= alt.Color('label:N', 
                     legend=alt.Legend(title="Names"),
                     sort=alt.EncodingSortField(field='average_count', order='descending'),
                     scale=alt.Scale(domain=data_f['label'].unique(),range=colors_f)
                     #scale=alt.Scale(domain=list(colors_f.keys()),range=list(colors_f.values()))
                     ),                 
    tooltip=['year', 'preusuel', 'count'] # fields shown when hovering a line
).properties(
    title=[f'MOST POPULAR NAMES FROM {start_year} TO {end_year}','(select period by clicking on graph)','',f'GIRLS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
    width=600,
    height=250
).add_params(
    single, interval
)


# BOYS
#------------------------------------------------
# plot the period selected
selection_m = alt.Chart(period).mark_rect(color='blue', opacity=0.1).encode(
    x='start:Q',
    x2='end:Q',
    # title='Period selected'
)
# plot the top names
data_m = get_data_top(names_france, start_year, end_year, top,'male')
top_graph_m = alt.Chart(data_m).mark_line().encode(
    x=alt.X('year:Q', title='Year'), 
    y=alt.Y('count:Q', title='Count',scale=alt.Scale(domain=(0, max_count))),  
    color= alt.Color('label:N', 
                     legend=alt.Legend(title="Names"),
                     sort=alt.EncodingSortField(field='average_count', order='descending'),
                     scale=alt.Scale(domain=data_m['label'].unique() , range = colors_m),
                     ),    
    # strokeDash = alt.StrokeDash('label:N', scale=alt.Scale(domain=data_m['label'].unique(), range = line_styles)),            
    tooltip=['year', 'preusuel', 'count'] # fields shown when hovering a line
).properties(
    title=[f'BOYS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
    width=600,
    height=250
).add_params(
    single, interval
)


# DISPLAY FINAL CHART
#--------------------

# each gender chart is layered with its period band, then the two are stacked vertically
final_chart = alt.vconcat(
    top_graph_f + selection_f,
    top_graph_m + selection_m
).resolve_scale(
    color='independent'  # Each chart will use its own color scale
).resolve_legend(
    color='independent'  # Each chart will have its own color legend
)

final_chart


In [None]:
# FUNCTION TO DISPLAY THE CHART
# ------------------------------
def display_chart(start_year=2010, end_year=2020, top=10):
    """Display the evolution of the top names of a period, girls above boys.

    For each gender the `top` names of [start_year, end_year] are drawn as
    one line per name (yearly count), with the period highlighted as a
    translucent band. The two charts are stacked vertically, each with its
    own colour scale and legend, and shown with Chart.display().

    Reads the module-level `names_france` table and calls `get_data_top`.
    Returns nothing.
    """
    period = pd.DataFrame({'start': [start_year], 'end': [end_year]}, index=[0])
    max_count = 60000  # fixed upper bound of the y axis
    colors_f = ['fuchsia', 'purple', 'red', 'coral', 'orange', 'yellow', 'pink' ,'gray', 'darkgray', 'brown' ]
    colors_m = ['blue', 'green','navy',  'skyblue', 'limegreen', 'olive', 'green', 'teal', 'cyan', 'yellow', 'lightgray', 'gray', 'darkgray', 'black']

    # Click and drag selections on the x axis, attached to both line charts
    single = alt.selection_point(encodings=['x'])
    interval = alt.selection_interval(encodings=['x'])

    def panel(gender, palette, band_color, band_opacity, legend_title, title_lines):
        """Build one gender chart: the name lines layered with the period band."""
        band = alt.Chart(period).mark_rect(color=band_color, opacity=band_opacity).encode(
            x='start:Q',
            x2='end:Q',
        )
        data = get_data_top(names_france, start_year, end_year, top, gender)
        color = alt.Color(
            'label:N',
            legend=alt.Legend(title=legend_title),
            sort=alt.EncodingSortField(field='average_count', order='descending'),
            scale=alt.Scale(domain=data['label'].unique(), range=palette),
        )
        lines = alt.Chart(data).mark_line().encode(
            x=alt.X('year:Q', title='Year'),
            y=alt.Y('count:Q', title='Count', scale=alt.Scale(domain=(0, max_count))),
            color=color,
            tooltip=['year', 'preusuel', 'count'],  # fields shown when hovering a line
        ).properties(
            title=title_lines,
            width=600,
            height=250,
        ).add_params(
            single, interval
        )
        return lines + band

    girls = panel(
        'female', colors_f, 'fuchsia', 0.2, "Top girl names",
        [f'MOST POPULAR NAMES FROM {start_year} TO {end_year}','(select period by clicking on graph)','',f'GIRLS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
    )
    boys = panel(
        'male', colors_m, 'blue', 0.1, "Top boy names",
        [f'BOYS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
    )

    # Stack the two charts; colour scales and legends stay independent per chart
    stacked = alt.vconcat(girls, boys)
    stacked = stacked.resolve_scale(color='independent')
    stacked = stacked.resolve_legend(color='independent')

    stacked.display()

In [None]:
# Animate the top names decade by decade by redrawing the chart in place

import time

# First frame (1900-1910) stays on screen a little longer
display_chart(1900, 1910, 10)
time.sleep(3)

# Following frames: 1910-1920 up to 2010-2020, one per second
for decade_start in range(1910, 2020, 10):
    clear_output(wait=True)
    display_chart(decade_start, decade_start + 10, 10)
    time.sleep(1)

# 3/ Visualisation of the top names by ratio (% of births) <br>RENDU FINAL

## 3.1 Adding maximum births by year (for reference)

In [None]:
# Helpers for the ratio view: top-N names of a period, their yearly data, yearly maxima and label positions

def get_top_names(names, births,start, end, top):
    """Rank the most given names of [start, end], by count and by share of births.

    Parameters
    ----------
    names : DataFrame with at least 'preusuel', 'year' and 'count'.
    births : DataFrame with at least 'year' and 'births'.
    start, end : first and last year of the period (both included).
    top : number of names to keep.

    Returns
    -------
    top_list : list of the `top` names, most given first.
    top_count : DataFrame indexed by 'preusuel' with the columns 'count',
        'average_count', 'label', 'average_ratio' (name total divided by the
        sum of `births` over the period), 'average_ratio%' and 'label_ratio'.
    """
    year_span = end - start + 1
    period_births = births.loc[births.year.between(start, end), 'births'].sum()

    ranking = (
        names.loc[names.year.between(start, end)]
        .groupby(['preusuel'])['count'].sum()
        .reset_index()
        .sort_values(by='count', ascending=False)
        .head(top)
        .reset_index(drop=True)
    )
    top_list = ranking['preusuel'].tolist()

    # average births per year for the period, and the matching legend label
    ranking['average_count'] = ranking['count'] / year_span
    truncated_average = ranking['average_count'].astype(int).map('{:,}'.format)
    ranking['label'] = ranking['preusuel'] + ' - ' + truncated_average + ' births'
    ranking = ranking.sort_values(by='average_count', ascending=False)

    # share of the period's births taken by each name, and the matching legend label
    ranking['average_ratio'] = ranking['count'] / period_births
    ranking['average_ratio%'] = (ranking['average_ratio'] * 100).round(1).astype(str) + '%'
    ranking['label_ratio'] = ranking['preusuel'] + ' - ' + ranking['average_ratio%'] + ' of births'

    # names become the index so callers can look a name up with .loc
    return top_list, ranking.set_index('preusuel')

def get_top_data(names, births, start, end, top, gender = 'all'):
    """Return the yearly history of the top names of a period, with ratios.

    The ranking is computed by get_top_names over [start, end]; the rows
    returned cover every year available for the selected names.

    Parameters
    ----------
    names : DataFrame with the columns 'gender', 'preusuel', 'year', 'count'.
    births : DataFrame with the columns 'year', 'gender', 'births'.
    start, end : first and last year (inclusive) used to rank the names.
    top : number of names to keep.
    gender : 'male' (gender == 1) or 'female' (gender == 2) restricts both
        `names` and `births`; any other value keeps all names, prints a
        notice and compares them with the total births of each year.

    Returns
    -------
    DataFrame with the rows of the selected names plus:
    'average_count', 'label', 'average_ratio', 'label_ratio' (from the ranking),
    'ratio' (yearly count / births of that year), 'ratio%' (formatted ratio),
    'births' (overwritten with the name's own count formatted with thousands
    separators, as used by the chart tooltips) and 'name' (copy of 'preusuel').
    When a gender is selected, the merge with `births` leaves the gender
    columns of both frames as 'gender_x' / 'gender_y'.
    """

    # process data for gender
    if gender == 'male':
        data = names[names['gender']== 1]
        births = births[births['gender']== 1]
    elif gender == 'female':
        data = names[names['gender']== 2]
        births = births[births['gender']== 2]
    else:
        print("no gender selected, all names are returned (chose 'male' or 'female' to filter on gender)")
        data = names
        # One total per year: merging on the per-gender rows would duplicate
        # every name row and divide its count by a single gender's births.
        births = births.groupby('year', as_index=False)['births'].sum()

    # get top names for period
    top_list, top_count = get_top_names(data, births, start, end, top)

    # Filter on the top names; .copy() gives an independent frame so the column
    # assignments below do not write into a slice of the caller's data.
    data = data[data.preusuel.isin(top_list)].copy()

    # top_count is indexed by name: a vectorised map replaces the row-by-row lookups
    for column in ('average_count', 'label', 'average_ratio', 'label_ratio'):
        data[column] = data['preusuel'].map(top_count[column])
    data = data.sort_values(by=['average_count'], ascending= False)

    # add a ratio column to data
    data = data.merge(births, on='year')
    data['ratio'] = data['count'] / data['births']
    data['ratio%'] = (data['ratio']*100).round(1).astype(str) + '%'
    # from here on 'births' is display text (the name's count), no longer the yearly total
    data['births'] = data['count'].map('{:,}'.format)
    data['name'] = data['preusuel']

    return data

def get_max_birth(df,births, gender='all'):
    """Return, for every year, the highest count reached by a single name.

    Parameters
    ----------
    df : DataFrame with the columns 'year', 'gender' and 'count'
        (one row per name, gender and year).
    births : DataFrame with the columns 'year', 'gender' and 'births'.
    gender : 'male' (gender == 1) or 'female' (gender == 2) restricts both
        frames to that gender; any other value uses all rows of `df` and the
        total births of each year.

    Returns
    -------
    DataFrame with one row per year present in both `df` and `births`:
    'year', 'max_count' (largest 'count' of that year), the columns of
    `births` ('gender' only when a gender is selected, 'births') and
    'max_ratio' (max_count / births).
    """
    gender_codes = {'male': 1, 'female': 2}
    if gender in gender_codes:
        code = gender_codes[gender]
        df = df[df.gender == code]
        births = births[births['gender'] == code]
    else:
        # one total per year, otherwise the merge below yields one row per gender
        births = births.groupby('year', as_index=False)['births'].sum()

    # A groupby replaces the year-by-year loop over a hard-coded range(1900, 2020),
    # which silently left out the last year of the data (2020).
    max_births = df.groupby('year')['count'].max().rename('max_count').reset_index()

    max_births = max_births.merge(births, on='year')
    max_births['max_ratio'] = max_births['max_count'] / max_births['births']

    return max_births

def get_annotations(df, list):
    """Locate the peak of each name, once by count and once by ratio.

    Parameters
    ----------
    df : DataFrame with the columns 'name', 'year', 'count' and 'ratio'.
    list : names to annotate; each one must be present in df['name'].

    Returns
    -------
    (count_annotations, ratio_annotations): two DataFrames with one row per
    name, holding 'name', the 'year' of the peak and the peak value
    ('count' in the first frame, 'ratio' in the second).
    """
    def peak_points(column):
        # year and value of the row where `column` is highest, for each name in turn
        years, peaks = [], []
        for name in list:
            peak_row = df[df['name'] == name][column].idxmax()
            years.append(df.loc[peak_row, 'year'])
            peaks.append(df.loc[peak_row, column])
        return years, peaks

    count_years, count_peaks = peak_points('count')
    ratio_years, ratio_peaks = peak_points('ratio')

    count_annotations = pd.DataFrame({'name': list, 'year': count_years, 'count': count_peaks})
    ratio_annotations = pd.DataFrame({'name': list, 'year': ratio_years, 'ratio': ratio_peaks})

    return count_annotations, ratio_annotations


In [None]:
# TOP NAMES OVER TIME (FOR REFERENCE)
# Builds, for each gender, the history of a hand-picked list of names and the
# position of the text label shown at the peak of each name.

def _shift_annotations(annotations, column, offsets, step):
    """Move the label of each name in `offsets` by offsets[name] * step along `column` (in place)."""
    for name in offsets:
        row = annotations[annotations['name'] == name].index[0]
        annotations.loc[row, column] += offsets[name] * step

# Size of one vertical shift unit on the count charts and on the ratio charts
step_count = 5000
step_ratio = 0.008

# GIRLS
max_f = get_top_data(names_france, births, 1900 , 2020, 300,'female')
list_f = ['MARIE','JEANNE','LOUISE','MARGUERITE','GERMAINE','SUZANNE','MARTINE','BRIGITTE','JACQUELINE','MONIQUE','NICOLE','FRANÇOISE','CHANTAL','CATHERINE','LÉA','EMMA','ELODIE','MANON','NATHALIE','SYLVIE','STÉPHANIE','CÉLINE','AURÉLIE','CHLOÉ']
max_f = max_f[max_f['name'].isin(list_f)]
# Label positions: peak of each name, then manual shifts (in step units) for selected names
annot_f_count, annot_f_ratio = get_annotations(max_f, list_f)
adjust_f_count = { 'MARGUERITE': 0.5, 'GERMAINE': 0.3,  'FRANÇOISE': 0.2, 'CATHERINE': 0, 'AURÉLIE': 0.75, 'MONIQUE':1.5,'JACQUELINE':0, 'NICOLE':0.5, 'CHANTAL':0.15}
adjust_f_ratio = { 'MARGUERITE': 1, 'GERMAINE': 0.5,  'FRANÇOISE': 0.25, 'CATHERINE': 0.5, 'AURÉLIE': 0.75, 'MONIQUE':0.25,'JACQUELINE':0, 'NICOLE':0.75, 'CHANTAL':-0.25}
_shift_annotations(annot_f_count, 'count', adjust_f_count, step_count)
_shift_annotations(annot_f_ratio, 'ratio', adjust_f_ratio, step_ratio)

# BOYS
max_m = get_top_data(names_france, births, 1900 , 2020, 300,'male')
list_m = ['JEAN','LOUIS','PAUL','PIERRE','ANDRÉ','CLAUDE','BERNARD','NICOLAS','STÉPHANE','MICHEL','ALAIN','PATRICK','GABRIEL','THIERRY','ENZO','LUCAS','SÉBASTIEN','JULIEN','KEVIN','THOMAS','PHILIPPE','CHRISTOPHE']
max_m = max_m[max_m['name'].isin(list_m)]
# Label positions: peak of each name, then manual shifts (in step units) for selected names
annot_m_count, annot_m_ratio = get_annotations(max_m, list_m)
adjust_m_count = {'PHILIPPE': 0.5, 'THIERRY': 0.25, 'CHRISTOPHE':0.25, 'STÉPHANE':0.5, 'THOMAS':0.5, 'LUCAS':0.25}
adjust_m_ratio = {'PHILIPPE': 1.25, 'THIERRY': 0.25, 'CHRISTOPHE':1.5, 'THOMAS':2, 'LUCAS':1, 'SÉBASTIEN':0.5, 'NICOLAS':1.5}
_shift_annotations(annot_m_count, 'count', adjust_m_count, step_count)
_shift_annotations(annot_m_ratio, 'ratio', adjust_m_ratio, step_ratio)

In [None]:
# -----------------------------
# MAXIMUM BIRTHS (COUNT)
# -----------------------------
# Grey reference layers (yearly maximum area, history of the reference names,
# name labels) for each gender, also reused by display_chart_count.

max_count = 60000  # fixed upper bound of the y axis


def _count_reference_layers(top_data, max_births, annotations, y_max):
    """Build the three grey reference layers of a count chart.

    top_data    : yearly rows of the reference names ('year', 'count', 'label', 'name', 'births', 'ratio%')
    max_births  : output of get_max_birth ('year', 'max_count')
    annotations : label positions ('name', 'year', 'count')
    y_max       : upper bound of the y axis

    Returns (area, lines, labels) as three Altair charts.
    """
    # Area under the highest count reached by any name each year
    area = alt.Chart(max_births).mark_area(
        color='lightgrey',  # very light grey area
        opacity=0.3  # make the area slightly transparent
    ).encode(
        x='year:Q',
        y='max_count:Q',
    )

    # One light-grey line per reference name
    lines = alt.Chart(top_data).mark_line(
        color='lightgray',
        size=1
    ).encode(
        x=alt.X('year:Q', title='Year', axis=alt.Axis(format='d')),
        y=alt.Y('count:Q', title='Count', scale=alt.Scale(domain=(0, y_max))),
        color=alt.Color('label:N', legend=None,
                        scale=alt.Scale(domain=top_data['label'].unique(), range=['lightgray', 'lightgray', 'lightgray'])),
        tooltip=['year', 'name', 'births', 'ratio%']
    )

    # Text label next to the maximum value of each name
    labels = alt.Chart(annotations).mark_text(
        align='left',
        baseline='middle',
        dx=0,  # horizontal offset relative to the point
        dy=-5,  # vertical offset relative to the point
        color='gray',
        fontSize=8
    ).encode(
        x='year:Q',
        y='count:Q',
        text='name:N'
    )
    return area, lines, labels


# GIRLS
# ---------------------------------------------------------------
max_birth_f = get_max_birth(names_france, births, 'female')
max_f_area, max_f_graph, text_marks_f = _count_reference_layers(max_f, max_birth_f, annot_f_count, max_count)

# Chart.display() returns None, so the chart is stored first and displayed afterwards
max_f_count_3 = (max_f_area + max_f_graph + text_marks_f).properties(title='Girls maximum births (COUNTS)', width=600, height=250)
max_f_count_3.display()


# BOYS
# ---------------------------------------------------------------
max_birth_m = get_max_birth(names_france, births, 'male')
max_m_area, max_m_graph, text_marks_m = _count_reference_layers(max_m, max_birth_m, annot_m_count, max_count)

max_m_count_3 = (max_m_area + max_m_graph + text_marks_m).properties(title='Boys maximum births (COUNTS)',width=600, height=250)
max_m_count_3.display()

In [None]:

# -----------------------------
# MAXIMUM BIRTHS (RATIO)
# -----------------------------
# Same grey reference layers as the count version, expressed as a share of the
# yearly births of the gender.

max_ratio = 0.2  # fixed upper bound of the y axis


def _ratio_reference_layers(top_data, max_births, annotations, y_max):
    """Build the three grey reference layers of a ratio chart.

    top_data    : yearly rows of the reference names ('year', 'ratio', 'label_ratio', 'name', 'births', 'ratio%')
    max_births  : output of get_max_birth ('year', 'max_ratio')
    annotations : label positions ('name', 'year', 'ratio')
    y_max       : upper bound of the y axis

    Returns (area, lines, labels) as three Altair charts.
    """
    # One light-grey line per reference name.
    # NOTE(review): the y axis is titled 'Count' although it shows a ratio; left
    # unchanged because the title may be combined with other layers elsewhere.
    lines = alt.Chart(top_data).mark_line(
        color='lightgray',
        size=1
    ).encode(
        x=alt.X('year:Q', title='Year', axis=alt.Axis(format='d')),
        y=alt.Y('ratio:Q', title='Count', scale=alt.Scale(domain=(0, y_max))),
        color=alt.Color('label_ratio:N', legend=None,
                        scale=alt.Scale(domain=top_data['label_ratio'].unique(), range=['lightgray', 'lightgray', 'lightgray'])),
        tooltip=['year', 'name', 'births', 'ratio%']
    )

    # Area under the highest ratio reached by any name each year
    area = alt.Chart(max_births).mark_area(
        color='lightgrey',  # very light grey area
        opacity=0.3  # make the area slightly transparent
    ).encode(
        x='year:Q',
        y='max_ratio:Q',
    )

    # Text label next to the maximum value of each name
    labels = alt.Chart(annotations).mark_text(
        align='left',
        baseline='middle',
        dx=0,  # horizontal offset relative to the point
        dy=-5,  # vertical offset relative to the point
        color='gray',
        fontSize=8
    ).encode(
        x='year:Q',
        y='ratio:Q',
        text='name:N'
    )
    return area, lines, labels


# FEMALE MAXIMUM BIRTHS (RATIO)
# -----------------------------
max_birth_f = get_max_birth(names_france, births, 'female')
max_f_area_ratio, max_f_graph_ratio, text_marks_f_ratio = _ratio_reference_layers(max_f, max_birth_f, annot_f_ratio, max_ratio)

# Chart.display() returns None, so the chart is stored first and displayed afterwards
max_f_ratio_3 = (max_f_area_ratio + max_f_graph_ratio + text_marks_f_ratio).properties(title='Girls maximum births (RATIO)',width=600, height=250)
max_f_ratio_3.display()


# BOYS MAXIMUM BIRTHS (RATIO)
# -----------------------------
max_birth_m = get_max_birth(names_france, births, 'male')
max_m_area_ratio, max_m_graph_ratio, text_marks_m_ratio = _ratio_reference_layers(max_m, max_birth_m, annot_m_ratio, max_ratio)

max_m_ratio_3 = (max_m_area_ratio + max_m_graph_ratio + text_marks_m_ratio).properties(title='Boys maximum births (RATIO)',width=600, height=250)
max_m_ratio_3.display()

## 3.2 Improving the visualisation

In [None]:
# FUNCTION TO DISPLAY THE CHART
# ------------------------------

######## COUNT + FINAL ###########

def display_chart_count(start_year=2010, end_year=2020, top=10):
    """Display the top names of a period (yearly counts) over the grey reference layers.

    For each gender, the `top` names of [start_year, end_year] are drawn as
    coloured lines on top of the reference names, their peak labels and the
    yearly-maximum area; the selected period is highlighted as a band.
    Girls are shown above boys, each chart with its own colour scale and legend.

    Reads the module-level `names_france` and `births` tables and the
    reference layers `max_f_graph`, `text_marks_f`, `max_f_area`,
    `max_m_graph`, `text_marks_m`, `max_m_area`; calls `get_top_data`.
    Returns nothing (the chart is shown with Chart.display()).
    """
    period = pd.DataFrame({'start': [start_year], 'end': [end_year]}, index=[0])
    max_count = 60000  # fixed upper bound of the y axis
    colors_f = ['fuchsia', 'purple', 'red', 'coral', 'gold', 'yellow', 'pink' ,'gray', 'darkgray', 'brown' ]
    colors_m = ['blue', 'green','navy',  'skyblue', 'limegreen', 'olive', 'green', 'teal', 'cyan', 'yellow', 'lightgray', 'gray', 'darkgray', 'black']

    # Click and drag selections on the x axis. They are created here so the
    # function no longer depends on globals defined by an earlier cell.
    single = alt.selection_point(encodings=['x'])
    interval = alt.selection_interval(encodings=['x'])

    def panel(gender, palette, band_color, band_opacity, legend_heading, title_lines,
              reference_lines, peak_labels, max_area):
        """Build one gender chart: reference lines, top names, period band, labels and max area."""
        # highlighted band over the selected period
        band = alt.Chart(period).mark_rect(color=band_color, opacity=band_opacity).encode(
            x='start:Q',
            x2='end:Q',
        )

        # coloured lines of the top names
        data = get_top_data(names_france, births, start_year, end_year, top, gender)
        lines = alt.Chart(data).mark_line().encode(
            x=alt.X('year:Q', title='Year', axis=alt.Axis(format='d')),
            y=alt.Y('count:Q', title='Count', scale=alt.Scale(domain=(0, max_count))),
            color=alt.Color('label:N',
                            # the labels show the average yearly births of each name over the period
                            legend=alt.Legend(title=[legend_heading, "(average births per year in period)"]),
                            sort=alt.EncodingSortField(field='average_count', order='descending'),
                            scale=alt.Scale(domain=data['label'].unique(), range=palette),
                            ),
            tooltip=['year', 'name', 'births', 'ratio%']  # fields shown when hovering a line
        ).properties(
            title=title_lines,
            width=600,
            height=250
        ).add_params(
            single, interval
        )

        layered = alt.layer(
            reference_lines,  # reference names in light gray (do not depend on the period)
            lines,            # top names of the period
            band,             # selected period
            peak_labels       # names written next to the reference peaks
        ).properties(width=600, height=250,
        ).resolve_scale(color='independent'  # each layer keeps its own color scale
        )
        return layered + max_area

    # The girls legend used to be titled "Boys top names" (copy-paste slip)
    girls = panel(
        'female', colors_f, 'fuchsia', 0.2, "Girls top names",
        [f'MOST POPULAR NAMES FROM {start_year} TO {end_year} (NUMBER OF BIRTHS)','(select period by clicking on graph)','',f'GIRLS NAMES - Evolution of the {top} most popular names from {start_year} to {end_year}'],
        max_f_graph, text_marks_f, max_f_area,
    )
    boys = panel(
        'male', colors_m, 'blue', 0.1, "Boys top names",
        [f'BOYS NAMES - Evolution of the {top} most popular names from {start_year} to {end_year}'],
        max_m_graph, text_marks_m, max_m_area,
    )

    # DISPLAY FINAL CHART
    #--------------------
    final_chart = alt.vconcat(
        girls,
        boys
    ).resolve_scale(color='independent'  # Each chart will use its own color scale
    ).resolve_legend(color='independent'  # Each chart will have its own color legend
    )

    final_chart.display()

######## COUNT ###########

In [None]:
display_chart_count(1960, 1970, 10)

In [None]:
# RATIO + COUNT : FUNCTION N°2 TO DISPLAY THE CHART
# --------------------------------------------

########### RATIO FINAL ##############

def _ratio_period_band(period, color, opacity):
    """Return a translucent rectangle spanning the selected period on the x axis."""
    return alt.Chart(period).mark_rect(color=color, opacity=opacity).encode(
        x='start:Q',
        x2='end:Q',
    )


def _ratio_top_names_lines(data, y_title, legend_title, colors, title, max_ratio):
    """Return the line layer showing, per year, the share of births of each top name.

    `data` must provide the columns referenced below ('year', 'ratio',
    'label_ratio', 'average_ratio', 'name', 'births', 'ratio%').
    The notebook-level selections `single` and `interval` are attached to the layer.
    """
    return alt.Chart(data).mark_line().encode(
        x=alt.X('year:Q', title='Year', axis=alt.Axis(format='d')),
        y=alt.Y('ratio:Q', title=y_title,
                scale=alt.Scale(domain=(0, max_ratio)),
                axis=alt.Axis(format='%', titlePadding=10)),
        color=alt.Color('label_ratio:N',
                        legend=alt.Legend(title=legend_title),
                        sort=alt.EncodingSortField(field='average_ratio', order='descending'),
                        scale=alt.Scale(domain=data['label_ratio'].unique(), range=colors)),
        tooltip=['year', 'name', 'births', 'ratio%'],  # data shown on hover
    ).properties(
        title=title,
        width=600,
        height=250
    ).add_params(
        single, interval
    )


def _ratio_gender_panel(reference_graph, top_graph, period_band, text_marks):
    """Stack the layers of one gender panel, each layer keeping its own color scale."""
    return alt.layer(
        reference_graph,  # reference chart (does not depend on the selected period)
        top_graph,        # main chart with the top names
        period_band,      # selected period
        text_marks
    ).properties(
        width=600, height=250
    ).resolve_scale(
        color='independent'
    )


def display_chart_ratio(start_year=2010, end_year=2020, top=10):
    """Display the ratio (% of births) charts of the most popular girl and boy names.

    Two vertically stacked panels are rendered (girls on top, boys below). Each one
    shows the yearly share of births of the `top` names returned by `get_top_data`
    for the period, with the period itself highlighted.

    Parameters
    ----------
    start_year, end_year : int
        Bounds of the period used to pick the top names and to draw the highlight.
    top : int
        Number of names to plot per gender.

    Notes
    -----
    Reads the following names from the notebook namespace; they are not created
    here and must exist before the call: `names_france`, `births`, `get_top_data`,
    `single`, `interval`, `max_f_graph_ratio`, `max_m_graph_ratio`,
    `text_marks_f_ratio`, `text_marks_m_ratio`, `max_f_area_ratio`,
    `max_m_area_ratio`.

    Returns nothing; the chart is shown with `.display()`.
    """
    period = pd.DataFrame({'start': [start_year], 'end': [end_year]}, index=[0])
    max_ratio = 0.2  # upper bound of the y axis (20% of births)
    colors_f = ['fuchsia', 'purple', 'red', 'coral', 'gold', 'yellow', 'pink' ,'gray', 'darkgray', 'brown' ]
    # NOTE(review): 'green' appears twice in this palette, so two boys' names can
    # share the same color once at least 7 names are plotted -- confirm whether intended.
    colors_m = ['blue', 'green','navy',  'skyblue', 'limegreen', 'olive', 'green', 'teal', 'cyan', 'yellow', 'lightgray', 'gray', 'darkgray', 'black']

    # ------------------------------------------------------------------
    # GIRLS
    # ------------------------------------------------------------------
    selection_f = _ratio_period_band(period, color='fuchsia', opacity=0.2)
    data_f = get_top_data(names_france, births, start_year, end_year, top,'female')
    top_graph_f = _ratio_top_names_lines(
        data_f,
        y_title='% of girl births',
        legend_title=["Girls top names","(% of births in period)"],
        colors=colors_f,
        # The girls panel is the top one, so it also carries the overall heading.
        title=[f'MOST POPULAR NAMES FROM {start_year} TO {end_year} (% OF BIRTHS)',
               '(select period by clicking on graph)',
               '',
               f'GIRLS NAMES - Evolution of the {top} most popular names from {start_year} to {end_year}'],
        max_ratio=max_ratio,
    )
    chart_f_ratio = _ratio_gender_panel(max_f_graph_ratio, top_graph_f, selection_f, text_marks_f_ratio)

    # ------------------------------------------------------------------
    # BOYS
    # ------------------------------------------------------------------
    selection_m = _ratio_period_band(period, color='blue', opacity=0.1)
    data_m = get_top_data(names_france, births, start_year, end_year, top,'male')
    top_graph_m = _ratio_top_names_lines(
        data_m,
        y_title='% of boy births',
        legend_title=["Boys top names","(% of births in period)"],
        colors=colors_m,
        title=[f'BOYS NAMES - Evolution of the {top} most popular names from {start_year} to {end_year}'],
        max_ratio=max_ratio,
    )
    chart_m_ratio = _ratio_gender_panel(max_m_graph_ratio, top_graph_m, selection_m, text_marks_m_ratio)

    # ------------------------------------------------------------------
    # DISPLAY FINAL CHART
    # ------------------------------------------------------------------
    final_chart_ratio = alt.vconcat(
        chart_f_ratio + max_f_area_ratio,
        chart_m_ratio + max_m_area_ratio
    ).resolve_scale(
        color='independent'   # each panel uses its own color scale
    ).resolve_legend(
        color='independent'   # each panel has its own color legend
    )

    final_chart_ratio.display()

########### RATIO ##############

In [None]:
# Example: render the ratio (% of births) chart for the period 1900-1910 with the top 10 names.
display_chart_ratio(1900, 1910, 10)

In [None]:
# create a video of name evolutions (BY COUNT AND THEN BY RATIO)

import time

def _display_by_decades(display_chart, delay, first_delay, top):
    """Replace the cell output with `display_chart` for each successive decade.

    Frames go from 1900-1910 to 2010-2020 in steps of ten years (12 frames).
    After drawing a frame the function sleeps `first_delay` seconds for the
    first one and `delay` seconds for every other one.

    `display_chart` is called as `display_chart(start_year, end_year, top)`.
    """
    first_year, last_year, step = 1900, 2010, 10
    for start_year in range(first_year, last_year + 1, step):
        clear_output(wait=True)  # wait=True: keep the old frame until the new one is ready
        display_chart(start_year, start_year + step, top)
        time.sleep(first_delay if start_year == first_year else delay)


def display_by_decades_count(delay=3, first_delay=8, top=10):
    """Animate the count-based chart decade by decade (see `_display_by_decades`).

    delay: seconds each frame stays on screen (except the first).
    first_delay: seconds the first frame (1900-1910) stays on screen.
    top: number of names plotted per gender.
    """
    _display_by_decades(display_chart_count, delay, first_delay, top)


def display_by_decades_ratio(delay=3, first_delay=8, top=10):
    """Animate the ratio (% of births) chart decade by decade (see `_display_by_decades`).

    delay: seconds each frame stays on screen (except the first).
    first_delay: seconds the first frame (1900-1910) stays on screen.
    top: number of names plotted per gender.
    """
    _display_by_decades(display_chart_ratio, delay, first_delay, top)

# Play the count animation first (top 7 names, 25 s pause on the first decade),
# then wipe the output and play the ratio animation (top 10 names).
# Note: these calls block the kernel while sleeping -- 25 + 11*2 = 47 s for the
# first one and 5 + 11*2 = 27 s for the second, plus rendering time.
display_by_decades_count(delay=2, first_delay=25, top=7)
clear_output(wait=True)
display_by_decades_ratio(delay=2, first_delay=5, top=10)

# 3/ Interactive visualisation of the top names with ALTAIR

In [None]:
# range = pd.DataFrame({'1900': [1900], '2020': [2020]}, index=[0])
# # create the selection
# # ------------------------------------------------
# # Selection for clicking or dragging
# single = alt.selection_point(encodings=['x'])
# interval = alt.selection_interval(encodings=['x'])

# title_graph_count =alt.Chart(range).mark_rect(color='lightgray', opacity=0.1).encode(
#         x=alt.X('1900:Q',scale=alt.Scale(domain=(1900, 2020)), axis=alt.Axis(format='d'),title=None,),
#         x2='2020:Q',       
#     ).properties(
#         title=[f'MOST POPULAR NAMES FROM {start_year} TO {end_year} (% OF BIRTHS)','(select period by clicking on graph)'],
#         width=600,
#         height=30
#     )#.add_params(single, interval)

# title_graph_count.display()

# 4/ Interactive visualisation with IPYWIDGETS

In [None]:
import ipywidgets as widgets
from ipywidgets import HBox, VBox, Label
from IPython.display import display, clear_output

# Widgets for the UI

# Period selector: a two-handle slider over 1900-2020, initially 2010-2020.
year_range_widget = widgets.IntRangeSlider(
    value=[2010, 2020],
    min=1900,
    max=2020,
    step=1,
    #description='Period:',
    # NOTE(review): per the ipywidgets docs this defers the change event until
    # the handle is released -- confirm against the installed version.
    continuous_update=False,
    layout=widgets.Layout(width='63%')  # Extended width
)
year_range_widget.style.handle_color = 'blue'  # Set handle color to blue

# Number of names to plot. The value is a string (free text is allowed because
# ensure_option=False); on_value_change converts it with int().
top_n_widget = widgets.Combobox(
    value = '10',
    #placeholder="Top",
    # options=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '15', '20', '30', '40', '50', '100'],
    description="Top",
    ensure_option=False, # let the user type any number
    disabled=False,
    layout=widgets.Layout(width='130px')  # Adjust width as needed

)

# Toggle between the count chart (unchecked) and the ratio chart (checked).
type_info_widget = widgets.Checkbox(
    value=False,
    description='Show % of births',
    continuous_update=False,
    disabled=False
)

# Animation control: its value steps from 1900 to 2010 by 10; on_play_change turns
# each value into the ten-year period starting at that year.
# NOTE(review): interval is presumably in milliseconds (5 s per step) -- confirm in the ipywidgets docs.
play = widgets.Play(value=1900, min=1900, max=2010, step=10, interval=5000, description="Press play")

# Functions to update the chart based on the widgets

def on_value_change(change):
    """Redraw the controls and the chart matching the current widget state.

    Used as the `observe` callback of the period slider, the "Show % of births"
    checkbox and the "Top" box. `change` (the traitlets change payload) is not
    used: the current values are read directly from the widgets.

    If the "Top" box does not contain a positive integer, nothing is redrawn and
    the chart currently on screen is kept.
    """
    # The "Top" Combobox accepts free text, so its value may be empty or
    # non-numeric; validate it BEFORE clearing the output so that a bad entry
    # does not wipe the chart and leave a traceback in its place.
    try:
        top = int(top_n_widget.value)
    except (TypeError, ValueError):
        return
    if top < 1:
        return

    clear_output(wait=True)
    # clear_output removed the controls as well: show them again above the chart.
    display(ui2)
    display(ui)
    start_year, end_year = year_range_widget.value  # (start, end) of the range slider
    if type_info_widget.value:
        display_chart_ratio(start_year, end_year, top)
    else:
        display_chart_count(start_year, end_year, top)

def on_play_change(change):
    """Move the period slider to the ten-year window starting at the Play widget's new value."""
    decade_start = change.new
    decade_end = decade_start + 10
    year_range_widget.value = [decade_start, decade_end]


# Re-render the chart whenever the period, the count/ratio toggle or the "Top" box changes.
# The Play widget only moves the period slider (on_play_change), which in turn
# triggers on_value_change.
year_range_widget.observe(on_value_change, names='value')
type_info_widget.observe(on_value_change, names='value')
play.observe(on_play_change, names='value')
top_n_widget.observe(on_value_change, names='value')

# Set up UI containers
ui = widgets.HBox([Label('Period:'),year_range_widget])
ui2 = widgets.HBox([Label('View evolution over time'),play, type_info_widget, top_n_widget,Label('names')])
# No extra margin around either row (CSS order: top, right, bottom, left)
ui.layout.margin = '0 0 0 0'
ui2.layout.margin = '0 0 0 0'

# Display the UI and the initial chart.
# The initial chart is driven by the widgets' own initial values rather than by
# hard-coded literals, so it cannot drift from what the controls show.
display(ui2)
display(ui)
start_year, end_year = year_range_widget.value  # initial period of the range slider
if type_info_widget.value:
    display_chart_ratio(start_year, end_year, int(top_n_widget.value))
else:
    display_chart_count(start_year, end_year, int(top_n_widget.value))


HBox(children=(Label(value='View evolution over time'), Play(value=1960, description='Press play', interval=50…

HBox(children=(Label(value='Period:'), IntRangeSlider(value=(1960, 1970), continuous_update=False, layout=Layo…