# IGR2023 - BABY VISUALISATION <br>Theme 3: Gender

In [1]:
import time
from itertools import product

import numpy as np
import pandas as pd

import altair as alt
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
alt.data_transformers.enable('json') # Let Altair/Vega-Lite work with large data sets

import ipywidgets as widgets
from IPython.display import display, clear_output

In [2]:
# Load the data and give the columns used below English names
names = pd.read_csv("../data/dpt2020.csv", sep=";").rename(
    columns={'annais': 'year', 'nombre': 'count', 'sexe': 'gender'}
)

# Discard the rows that cannot be used: the '_PRENOMS_RARES' bucket, the excluded
# department codes, and the placeholder year 'XXXX'. Then store year and gender as
# integers.
names = names.loc[
    lambda rows: ~(
        (rows.preusuel == '_PRENOMS_RARES')
        | rows.dpt.isin(['XX', '971', '972', '973', '974'])
        | (rows.year == 'XXXX')
    )
].astype({'year': int, 'gender': int})

# Group data on all of France: one row per (gender, name, year) with summed counts
names_france = (
    names
    .groupby(['gender', 'preusuel', 'year'])['count']
    .sum()
    .reset_index()
)
# gender == 1 is used for the "male" subset and gender == 2 for the "female" one
names_france_m = names_france.loc[names_france['gender'] == 1]
names_france_f = names_france.loc[names_france['gender'] == 2]

# 0/ Explore the data

In [3]:

# Preview the first rows of the cleaned per-department table
names.head()

Unnamed: 0,gender,preusuel,year,dpt,count
10885,1,AADIL,1983,84,3
10886,1,AADIL,1992,92,3
10888,1,AAHIL,2016,95,3
10892,1,AARON,1962,75,3
10893,1,AARON,1976,75,3


In [4]:
# Preview the first rows of the table aggregated over all departments
names_france.head()

Unnamed: 0,gender,preusuel,year,count
0,1,AADIL,1983,3
1,1,AADIL,1992,3
2,1,AAHIL,2016,3
3,1,AARON,1962,3
4,1,AARON,1976,3


In [5]:
# Summary statistics and column types of the per-department and nationwide tables.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line to the output.
print(names.describe())
names.info()
print(names_france.describe())
names_france.info()

             gender          year         count
count  3.471087e+06  3.471087e+06  3.471087e+06
mean   1.537662e+00  1.973882e+03  2.128088e+01
std    4.985796e-01  3.389481e+01  5.651373e+01
min    1.000000e+00  1.900000e+03  3.000000e+00
25%    1.000000e+00  1.949000e+03  4.000000e+00
50%    2.000000e+00  1.981000e+03  7.000000e+00
75%    2.000000e+00  2.003000e+03  1.800000e+01
max    2.000000e+00  2.020000e+03  6.310000e+03
<class 'pandas.core.frame.DataFrame'>
Index: 3471087 entries, 10885 to 3727550
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   gender    int32 
 1   preusuel  object
 2   year      int32 
 3   dpt       object
 4   count     int64 
dtypes: int32(2), int64(1), object(2)
memory usage: 132.4+ MB
None
              gender           year          count
count  225045.000000  225045.000000  225045.000000
mean        1.539434    1978.360226     328.235691
std         0.498444      33.704212    1430.666555
min         1.000000    1900.000

# 1/ Functions to get top names and corresponding data

In [6]:
# select the `top` most frequent names of a table for a given period

def get_top_names(names,start, end, top):
    """Rank the names of `names` by total count over the years start..end (inclusive).

    Returns a pair ``(top_list, top_count)``:
    - ``top_list``: the `top` names, most frequent first;
    - ``top_count``: a frame indexed by ``preusuel`` with the total ``count``, the
      ``average_count`` per year of the period, and a display ``label`` of the form
      ``'<NAME> - <average> births'``.
    """
    in_period = names.year.between(start, end)
    totals = (
        names[in_period]
        .groupby(['preusuel'])['count']
        .sum()
        .reset_index()
        .sort_values(by='count', ascending=False)
        .head(top)
    )
    top_list = totals['preusuel'].tolist()

    n_years = end - start + 1
    ranking = totals.reset_index(drop=True)
    ranking = ranking.assign(average_count=ranking['count'] / n_years)
    # The label shows the yearly average truncated to an integer, with thousands separators
    ranking = ranking.assign(
        label=ranking['preusuel'] + ' - ' + ranking['average_count'].astype(int).map('{:,}'.format) + ' births'
    )
    ranking = ranking.set_index('preusuel').sort_values(by='average_count', ascending=False)
    return top_list, ranking

def get_data_top(names, start, end, top, gender = 'all'):
    """Return every row of `names` that belongs to the top names of a period.

    Parameters
    ----------
    names : DataFrame with at least the columns gender, preusuel, year and count.
    start, end : bounds of the period used to rank the names (passed to get_top_names).
    top : number of names to keep.
    gender : 'male' keeps the rows with gender == 1, 'female' those with gender == 2.
        Any other value (including the default 'all') keeps both and prints a notice.

    Returns
    -------
    A new DataFrame (the input is left untouched) holding all the years of the
    selected names, with two extra columns taken from the ranking, `average_count`
    and `label`, sorted by decreasing `average_count`.
    """
    if gender == 'male':
        data = names[names['gender'] == 1]
    elif gender == 'female':
        data = names[names['gender'] == 2]
    else:
        print("no gender selected, all names are returned (chose 'male' or 'female' to filter on gender)")
        data = names

    # get top names for period
    top_list, top_count = get_top_names(data, start, end, top)

    # Keep only the top names and attach their ranking columns. top_count is indexed
    # by name, so Series.map does the lookup in one vectorised pass. assign() builds
    # a new frame instead of writing into a filtered slice, which avoids pandas'
    # SettingWithCopyWarning.
    data = data[data['preusuel'].isin(top_list)].assign(
        average_count=lambda rows: rows['preusuel'].map(top_count['average_count']),
        label=lambda rows: rows['preusuel'].map(top_count['label']),
    )

    return data.sort_values(by=['average_count'], ascending=False)


# Rank the 10 most frequent male and female names over 2010-2020 (nationwide totals)
top_list_m, top_count_m = get_top_names(names_france_m, start=2010, end=2020, top=10)
top_list_f, _ = get_top_names(names_france_f, start=2010, end=2020, top=10)
print(top_list_m, top_list_f, sep='\n')

# All years of data for the male names that are in the 2010-2020 top 10
data_m_top = get_data_top(names_france, start=2010, end=2020, top=10, gender='male')
data_m_top.head()

['GABRIEL', 'LUCAS', 'LOUIS', 'JULES', 'HUGO', 'NATHAN', 'LÉO', 'ADAM', 'ARTHUR', 'RAPHAËL']
['EMMA', 'JADE', 'LOUISE', 'CHLOÉ', 'MANON', 'LINA', 'LOLA', 'LÉA', 'ALICE', 'CAMILLE']


Unnamed: 0,gender,preusuel,year,count,average_count,label
33978,1,GABRIEL,1948,1004,5014.272727,"GABRIEL - 5,014 births"
33959,1,GABRIEL,1929,1491,5014.272727,"GABRIEL - 5,014 births"
33961,1,GABRIEL,1931,1450,5014.272727,"GABRIEL - 5,014 births"
33962,1,GABRIEL,1932,1562,5014.272727,"GABRIEL - 5,014 births"
33963,1,GABRIEL,1933,1377,5014.272727,"GABRIEL - 5,014 births"


In [7]:
# top_count_m is already indexed by name (preusuel), so a single name's total can be read with .loc
print(top_count_m.loc['GABRIEL', 'count'])
top_count_m

55157


Unnamed: 0_level_0,count,average_count,label
preusuel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GABRIEL,55157,5014.272727,"GABRIEL - 5,014 births"
LUCAS,52875,4806.818182,"LUCAS - 4,806 births"
LOUIS,50131,4557.363636,"LOUIS - 4,557 births"
JULES,46622,4238.363636,"JULES - 4,238 births"
HUGO,45864,4169.454545,"HUGO - 4,169 births"
NATHAN,44817,4074.272727,"NATHAN - 4,074 births"
LÉO,44547,4049.727273,"LÉO - 4,049 births"
ADAM,43995,3999.545455,"ADAM - 3,999 births"
ARTHUR,41981,3816.454545,"ARTHUR - 3,816 births"
RAPHAËL,40813,3710.272727,"RAPHAËL - 3,710 births"


# 2/ Visualisation of the top names by gender

In [8]:
# Plot the top names for one period (the period is set by the parameters just below)

# select parameters and get data
start_year, end_year, top = 1900 , 1910, 10
period = pd.DataFrame({'start': [start_year], 'end': [end_year]}, index=[0])
max_count = 60000  # upper bound of the y axis, identical on both panels
colors_f = ['fuchsia', 'purple', 'red', 'coral', 'orange', 'yellow', 'pink' ,'gray', 'darkgray', 'brown' ]
# One colour per ranked name: the entries must all be distinct, otherwise two lines
# of the same panel cannot be told apart.
colors_m = ['blue', 'green','navy',  'skyblue', 'limegreen', 'olive', 'indigo', 'teal', 'cyan', 'yellow', 'lightgray', 'gray', 'darkgray', 'black']
# Dash patterns kept for an optional strokeDash encoding; the charts below do not use them.
line_styles = [[0],[0],[0],[0], [0],[5, 2],[5, 2],[5, 2],[5, 2],[5, 2]]


# create the selection
# ------------------------------------------------
# Selection for clicking or dragging along the x axis (attached to both panels)
single = alt.selection_point(encodings=['x'])
interval = alt.selection_interval(encodings=['x'])


# GIRLS
#------------------------------------------------
# shaded band marking the selected period
selection_f = alt.Chart(period).mark_rect(color='fuchsia', opacity=0.2).encode(
    x='start:Q',
    x2='end:Q',
)

# plot the top names: one line per name, over all the years available for that name
data_f = get_data_top(names_france, start_year, end_year, top,'female')
top_graph_f = alt.Chart(data_f).mark_line().encode(
    x=alt.X('year:Q', title='Year'),
    y=alt.Y('count:Q', title='Count', scale=alt.Scale(domain=(0, max_count))),
    color= alt.Color('label:N',
                     legend=alt.Legend(title="Names"),
                     sort=alt.EncodingSortField(field='average_count', order='descending'),
                     # data_f is sorted by decreasing average_count, so the labels come out in rank order
                     scale=alt.Scale(domain=data_f['label'].unique(),range=colors_f)
                     ),
    tooltip=['year', 'preusuel', 'count'] # data to show on hover (when mouse is over the line)
).properties(
    title=[f'MOST POPULAR NAMES FROM {start_year} TO {end_year}','(select period by clicking on graph)','',f'GIRLS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
    width=600,
    height=250
).add_params(
    single, interval
)


# BOYS
#------------------------------------------------
# shaded band marking the selected period
selection_m = alt.Chart(period).mark_rect(color='blue', opacity=0.1).encode(
    x='start:Q',
    x2='end:Q',
)
# plot the top names: one line per name, over all the years available for that name
data_m = get_data_top(names_france, start_year, end_year, top,'male')
top_graph_m = alt.Chart(data_m).mark_line().encode(
    x=alt.X('year:Q', title='Year'),
    y=alt.Y('count:Q', title='Count',scale=alt.Scale(domain=(0, max_count))),
    color= alt.Color('label:N',
                     legend=alt.Legend(title="Names"),
                     sort=alt.EncodingSortField(field='average_count', order='descending'),
                     # data_m is sorted by decreasing average_count, so the labels come out in rank order
                     scale=alt.Scale(domain=data_m['label'].unique() , range = colors_m),
                     ),
    tooltip=['year', 'preusuel', 'count'] # data to show on hover (when mouse is over the line)
).properties(
    title=[f'BOYS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
    width=600,
    height=250
).add_params(
    single, interval
)


# DISPLAY FINAL CHART
#--------------------
# Girls panel above the boys panel, each with the period band layered on top of its lines

final_chart = alt.vconcat(
    top_graph_f + selection_f,
    top_graph_m + selection_m
).resolve_scale(
    color='independent'  # Each chart will use its own color scale
).resolve_legend(
    color='independent'  # Each chart will have its own color legend
)

final_chart


In [31]:
# FUNCTION TO DISPLAY THE CHART
# ------------------------------
def display_chart(start_year=2010, end_year=2020, top=10, max_count=60000):
    """Display the girls' and boys' most popular names of a period as two stacked line charts.

    Parameters
    ----------
    start_year, end_year : int
        Period used to rank the names; it is also shaded on both panels.
    top : int
        Number of names kept for each gender.
    max_count : int
        Upper bound of the y axis, shared by both panels.

    The combined chart is shown with its ``display()`` method; nothing is returned.
    """
    colors_f = ['fuchsia', 'purple', 'red', 'coral', 'orange', 'yellow', 'pink' ,'gray', 'darkgray', 'brown' ]
    # One colour per ranked name: the entries must all be distinct, otherwise two
    # lines of the same panel cannot be told apart.
    colors_m = ['blue', 'green','navy',  'skyblue', 'limegreen', 'olive', 'indigo', 'teal', 'cyan', 'yellow', 'lightgray', 'gray', 'darkgray', 'black']

    # Selections for clicking or dragging along the x axis, attached to both panels
    single = alt.selection_point(encodings=['x'])
    interval = alt.selection_interval(encodings=['x'])
    selections = (single, interval)

    girls_panel = _top_names_panel(
        'female', start_year, end_year, top, max_count,
        palette=colors_f,
        band_color='fuchsia',
        band_opacity=0.2,
        legend_title="Top girl names",
        title=[f'MOST POPULAR NAMES FROM {start_year} TO {end_year}','(select period by clicking on graph)','',f'GIRLS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
        selections=selections,
    )
    boys_panel = _top_names_panel(
        'male', start_year, end_year, top, max_count,
        palette=colors_m,
        band_color='blue',
        band_opacity=0.1,
        legend_title="Top boy names",
        title=[f'BOYS NAMES - Evolution of most popular names from {start_year} to {end_year}'],
        selections=selections,
    )

    final_chart = alt.vconcat(
        girls_panel,
        boys_panel
    ).resolve_scale(
        color='independent'  # Each chart will use its own color scale
    ).resolve_legend(
        color='independent'  # Each chart will have its own color legend
    )

    final_chart.display()


def _top_names_panel(gender, start_year, end_year, top, max_count,
                     palette, band_color, band_opacity, legend_title, title, selections):
    """Build one panel: the lines of the top names of `gender` with the period band layered on top."""
    # shaded band marking the selected period
    period = pd.DataFrame({'start': [start_year], 'end': [end_year]}, index=[0])
    band = alt.Chart(period).mark_rect(color=band_color, opacity=band_opacity).encode(
        x='start:Q',
        x2='end:Q',
    )

    # one line per top name, over all the years available for that name
    data = get_data_top(names_france, start_year, end_year, top, gender)
    lines = alt.Chart(data).mark_line().encode(
        x=alt.X('year:Q', title='Year'),
        y=alt.Y('count:Q', title='Count', scale=alt.Scale(domain=(0, max_count))),
        color=alt.Color('label:N',
                        legend=alt.Legend(title=legend_title),
                        sort=alt.EncodingSortField(field='average_count', order='descending'),
                        # data is sorted by decreasing average_count, so the labels come out in rank order
                        scale=alt.Scale(domain=data['label'].unique(), range=palette),
                        ),
        tooltip=['year', 'preusuel', 'count'] # data to show on hover (when mouse is over the line)
    ).properties(
        title=title,
        width=600,
        height=250
    ).add_params(
        *selections
    )

    return lines + band

In [32]:
# create a video of name evolutions
# Slideshow over successive 10-year windows: 1900-1910, 1910-1920, ..., 2010-2020.
# The first frame stays on screen for 25 seconds, every following frame for 3 seconds.
# `time` is imported in the notebook's import cell.
for decade_start in range(1900, 2020, 10):
    clear_output(wait=True)  # replace the previous frame instead of stacking the charts
    display_chart(decade_start, decade_start + 10, 10)
    time.sleep(25 if decade_start == 1900 else 3)

# 3/ Interactive visualisation of the top names

In [9]:
# Start_year and end_year widgets
# Both sliders share the same range, step and update policy; only the initial
# value and the caption differ.
_year_slider_settings = dict(min=1900, max=2020, step=1, continuous_update=False)

start_year_widget = widgets.IntSlider(value=2011, description='Start Year:', **_year_slider_settings)
end_year_widget = widgets.IntSlider(value=2020, description='End Year:', **_year_slider_settings)

# Lay the two sliders out side by side
ui = widgets.HBox([start_year_widget, end_year_widget])


In [10]:
# FUNCTION TO UPDATE THE CHART





In [None]:
# FUNCTION THAT WILL BE CALLED WHEN THE WIDGETS CHANGE
# -----------------------------------------------------

# The chart is rendered inside a dedicated Output widget that is cleared on every
# update, so exactly one chart is visible at any time.
chart_output = widgets.Output()

def on_value_change(change):
    """Redraw the chart for the period currently selected by the two sliders.

    `change` is the notification passed by the widget observer; it is not used,
    the slider values are read directly.
    """
    start_year = start_year_widget.value
    end_year = end_year_widget.value
    with chart_output:
        clear_output(wait=True)
        if start_year > end_year:
            # The two sliders move independently, so the period can be inverted
            print(f'Start year ({start_year}) must not be after end year ({end_year}).')
            return
        display_chart(start_year, end_year)

start_year_widget.observe(on_value_change, names='value')
end_year_widget.observe(on_value_change, names='value')

display(ui, chart_output)
on_value_change(None)  # initial rendering with the sliders' default values
