# Library imports

In [1]:
import altair as alt
import pandas as pd
import numpy as np
from itertools import product
import ipywidgets as widgets
from IPython.display import display, clear_output
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
# Let Altair/Vega-Lite work with large data sets.
# The trailing semicolon keeps the call's return value out of the cell output.
alt.data_transformers.enable('json');

# Data loading and pre-processing

In [2]:
# Load the data. `dpt` and `annais` are read explicitly as strings so that
# department codes keep their exact text (e.g. leading zeros) and the
# 'XX' / 'XXXX' placeholders can be compared reliably, independently of
# pandas' per-column type inference.
names = pd.read_csv("../data/dpt2020.csv", sep=";", dtype={'dpt': str, 'annais': str})
# Rename columns ('annais' still holds years here; it is turned into a decade below)
names = names.rename(columns={'annais': 'decade', 'nombre': 'count', 'sexe': 'gender'})
# Rows to keep: no rare-names aggregate, no excluded/unknown department, no unknown year
rows_to_keep = (
    (names['preusuel'] != '_PRENOMS_RARES')
    & ~names['dpt'].isin(['XX', '971', '972', '973', '974'])
    & (names['decade'] != 'XXXX')
)
# Gather years as decades (integer division, e.g. 1987 -> 1980) and sum the
# counts of each (gender, name, decade, department) combination
names = (
    names[rows_to_keep]
    .assign(decade=lambda frame: frame['decade'].astype(int) // 10 * 10)
    .groupby(['gender', 'preusuel', 'decade', 'dpt'])['count'].sum()
    .reset_index()
)
# Replace the gender codes by labels; any other value is left as its string form
names['gender'] = names['gender'].astype(str).replace({'1': 'Male', '2': 'Female'})
# Separate by gender
names_m = names[names.gender == 'Male']
names_f = names[names.gender == 'Female']
# Note: (name, department, decade) combinations without records are not
# filled with zero counts here; they are simply absent from the frames.

In [3]:
# Stack the two per-gender frames, fix the column order, and sort by gender
# (descending), then name, decade and department (ascending)
names = (
    pd.concat([names_m, names_f])
    .loc[:, ['gender', 'preusuel', 'decade', 'dpt', 'count']]
    .sort_values(
        by=['gender', 'preusuel', 'decade', 'dpt'],
        ascending=[False, True, True, True],
    )
    .reset_index(drop=True)
)
# Rank names by count inside each (gender, decade, department) group:
# 1 is the largest count, and ties are ranked in row order (method='first')
names['rank'] = (
    names.groupby(['gender', 'decade', 'dpt'])['count']
    .rank(method='first', ascending=False)
    .astype(int)
)
names

Unnamed: 0,gender,preusuel,decade,dpt,count,rank
0,Male,AADIL,1980,84,3,324
1,Male,AADIL,1990,92,3,795
2,Male,AAHIL,2010,95,3,1042
3,Male,AARON,1960,75,3,821
4,Male,AARON,1970,75,3,857
...,...,...,...,...,...,...
711244,Female,ÉVELYNE,1950,88,3,241
711245,Female,ÉVY,2010,42,3,861
711246,Female,ÉVY,2010,57,3,928
711247,Female,ÉVY,2010,69,4,1383


In [4]:
# Read the department outlines and give the label column an English name
depts_france = gpd.read_file('../data/departements-version-simplifiee.geojson').rename(
    columns={'nom': 'department'}
)
# depts = gpd.read_file('../data/departements-avec-outre-mer.geojson')
# Gather Corse data: codes '2A' and '2B' both become '20', labelled 'Corse'
depts_france.loc[depts_france['code'].isin(['2A', '2B']), 'code'] = '20'
depts_france.loc[depts_france['code'] == '20', 'department'] = 'Corse'
# Merge the geometries of rows that share a code, then keep the three columns of interest
depts_france = depts_france.dissolve(by='code', as_index=False)[['code', 'department', 'geometry']]
depts_france

Unnamed: 0,code,department,geometry
0,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ..."
1,02,Aisne,"POLYGON ((4.04797 49.40564, 4.03991 49.39740, ..."
2,03,Allier,"POLYGON ((3.03207 46.79491, 3.04907 46.75808, ..."
3,04,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.69209 44.18648, ..."
4,05,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.29922 45.10855, ..."
...,...,...,...
90,91,Essonne,"POLYGON ((2.22656 48.77610, 2.23298 48.76620, ..."
91,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ..."
92,93,Seine-Saint-Denis,"POLYGON ((2.55306 49.00982, 2.58031 48.99159, ..."
93,94,Val-de-Marne,"POLYGON ((2.33190 48.81701, 2.36395 48.81632, ..."


In [5]:
# Attach the department label and outline to every name record.
# A right join keeps all rows of `names`; rows are matched on code == dpt.
names = depts_france.merge(
    names,
    how='right',
    left_on='code',
    right_on='dpt',
)
names

Unnamed: 0,code,department,geometry,gender,preusuel,decade,dpt,count,rank
0,84,Vaucluse,"MULTIPOLYGON (((4.65347 44.30210, 4.65062 44.3...",Male,AADIL,1980,84,3,324
1,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ...",Male,AADIL,1990,92,3,795
2,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",Male,AAHIL,2010,95,3,1042
3,75,Paris,"POLYGON ((2.41634 48.84924, 2.46226 48.84254, ...",Male,AARON,1960,75,3,821
4,75,Paris,"POLYGON ((2.41634 48.84924, 2.46226 48.84254, ...",Male,AARON,1970,75,3,857
...,...,...,...,...,...,...,...,...,...
711244,88,Vosges,"POLYGON ((5.47006 48.42093, 5.51099 48.41822, ...",Female,ÉVELYNE,1950,88,3,241
711245,42,Loire,"POLYGON ((3.89953 46.27591, 3.90940 46.25773, ...",Female,ÉVY,2010,42,3,861
711246,57,Moselle,"POLYGON ((5.89340 49.49691, 5.93994 49.50097, ...",Female,ÉVY,2010,57,3,928
711247,69,Rhône,"POLYGON ((4.38808 46.21979, 4.39205 46.26302, ...",Female,ÉVY,2010,69,4,1383


In [6]:
# Creation of a blank map dataframe in order to still display the departments even if there is no data.
# One row per department is kept. Duplicates are detected on the `dpt` key alone:
# `department` and `geometry` were attached by the merge on `dpt`, so they are
# identical for all rows sharing a `dpt`, and comparing the key avoids comparing
# geometry objects across every name record.
blank_map = (
    names.drop_duplicates(subset='dpt')[['department', 'dpt', 'geometry']]
    .reset_index(drop=True)
)
# Constant zero count, used only to drive the (all-white) colour scale of the blank map
blank_map['count'] = 0
blank_map

Unnamed: 0,department,dpt,geometry,count
0,Vaucluse,84,"MULTIPOLYGON (((4.65347 44.30210, 4.65062 44.3...",0
1,Hauts-de-Seine,92,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ...",0
2,Val-d'Oise,95,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",0
3,Paris,75,"POLYGON ((2.41634 48.84924, 2.46226 48.84254, ...",0
4,Finistère,29,"MULTIPOLYGON (((-3.65428 48.61697, -3.64110 48...",0
...,...,...,...,...
90,Gers,32,"POLYGON ((0.07605 43.98314, 0.14096 43.99468, ...",0
91,Haute-Loire,43,"POLYGON ((3.89741 45.35708, 3.91694 45.33940, ...",0
92,Lozère,48,"POLYGON ((3.36134 44.97141, 3.38637 44.95274, ...",0
93,Creuse,23,"POLYGON ((2.16779 46.42407, 2.19757 46.42830, ...",0


# Build the visualization

In [7]:
# Department outlines drawn in black with a white fill: the colour scale maps
# the whole `count` range of blank_map onto white, and no legend is shown
blank_map_chart = (
    alt.Chart(blank_map)
    .mark_geoshape(stroke='black')
    .encode(
        color=alt.Color(
            'count:Q',
            legend=None,
            scale=alt.Scale(
                domain=[blank_map['count'].min(), blank_map['count'].max()],
                range=['white', 'white'],
            ),
        )
    )
    .properties(width=700, height=550)
)

# Function to create bar charts for top names
def create_bar_chart(gender, department, decade, color='blue'):
    """Build a bar chart of the ten highest-count names for one selection.

    Parameters
    ----------
    gender : value compared with the `gender` column of `names`
        (also inserted in the chart title).
    department : value compared with the `department` column of `names`.
    decade : value compared with the `decade` column of `names`.
    color : bar colour passed to `mark_bar` (default 'blue').

    Returns
    -------
    An Altair bar chart with names on the y axis (sorted by descending
    count) and counts on the x axis. If no row matches, the chart is built
    from an empty frame.
    """
    selection = (
        (names['gender'] == gender)
        & (names['decade'] == decade)
        & (names['department'] == department)
    )
    # Only the name and its count are plotted. Keeping just these two columns
    # as a plain DataFrame avoids sending the department geometry, repeated on
    # every row, along with the bar-chart data.
    top_names = pd.DataFrame(names.loc[selection, ['preusuel', 'count']]).nlargest(10, 'count')
    bar_chart = alt.Chart(top_names).mark_bar(color=color).encode(
        y=alt.Y('preusuel:N', sort='-x', title='Names'),
        x=alt.X('count:Q', title='Count'),
        tooltip=['preusuel', 'count']
    ).properties(
        width=200,
        height=250,
        title=f'Top 10 {gender} Names in {department} during the {decade}s'
    )
    return bar_chart

# Function to update name dropdown and map based on gender selection
def update_names_dropdown(change):
    """Restrict the name dropdown to the newly selected gender, then redraw.

    Parameters
    ----------
    change : traitlets change notification from the gender dropdown;
        `change.new` is the newly selected gender.
    """
    filtered_names = names[names['gender'] == change.new]['preusuel'].unique().tolist()
    # Replacing the options can change the dropdown's selected value, which
    # would trigger its own `update_map` observer and render the view once
    # here and once more in the explicit call below. Detach that observer
    # while the options are swapped so the view is rendered a single time.
    name_dropdown.unobserve(update_map, names='value')
    try:
        name_dropdown.options = filtered_names
    finally:
        name_dropdown.observe(update_map, names='value')
    update_map(change)

# Function to update the map and bar charts when any widget value changes
def update_map(change):
    """Redraw the output area from the current widget values.

    Clears `output`, then displays a map for the selected name, gender and
    decade next to the male and female top-10 bar charts for the selected
    department and decade. When no row matches the name/gender/decade
    selection, a message is printed and the blank map is shown instead.
    Any exception raised along the way is caught and printed.

    Parameters
    ----------
    change : traitlets change notification; it is not read here.
    """
    with output:
        clear_output(wait=True)
        try:
            # Current selection (the slider label looks like '1900-1909')
            selected_name = name_dropdown.value
            selected_gender = gender_dropdown.value
            selected_department = dpt_dropdown.value
            decade_start = int(decade_slider.value.split('-')[0])

            male_chart = create_bar_chart('Male', selected_department, decade_start, color='lightblue')
            female_chart = create_bar_chart('Female', selected_department, decade_start, color='pink')

            matches_name = names['preusuel'] == selected_name
            matches_gender = names['gender'] == selected_gender
            matches_decade = names['decade'] == decade_start
            matching_rows = names[matches_name & matches_gender & matches_decade]

            if matching_rows.empty:
                print("No data available for this selection.")
                left_panel = blank_map_chart
            else:
                choropleth = alt.Chart(matching_rows).mark_geoshape(
                    stroke='black'
                ).encode(
                    tooltip=['department', 'code', 'count', 'rank'],
                    color=alt.Color(
                        'count:Q',
                        scale=alt.Scale(scheme='yelloworangered', reverse=False),
                        legend=alt.Legend(orient='top', titleFontSize=14, labelFontSize=12,
                                          symbolSize=15, title='Count by Department'),
                    )
                ).properties(
                    width=700,
                    height=550
                )
                # The blank map sits underneath so every department outline is drawn;
                # each layer keeps its own colour scale
                left_panel = alt.layer(blank_map_chart, choropleth).resolve_scale(color='independent')

            display(alt.hconcat(left_panel, alt.vconcat(male_chart, female_chart)))
        except Exception as e:
            print(f"An error occurred: {e}")

# Widgets setup: dropdown options come from the values present in `names`
gender_dropdown = widgets.Dropdown(
    options=names['gender'].unique().tolist(),
    value='Male',
    description='Gender:',
)
name_dropdown = widgets.Dropdown(
    options=names['preusuel'].unique().tolist(),
    value='EGOR',
    description='Name:',
)
dpt_dropdown = widgets.Dropdown(
    options=names['department'].unique().tolist(),
    value='Nord',
    description='Department:',
)
# One 'start-end' label per decade, from '1900-1909' to '2010-2019'
decade_slider = widgets.SelectionSlider(
    options=[f"{start}-{start + 9}" for start in range(1900, 2020, 10)],
    value='1900-1909',
    description='Decade:',
    continuous_update=False,
)
play = widgets.Play(
    value=1900,
    min=1900,
    max=2010,
    step=1,
    interval=100,
    description="Press play",
)
# Area into which the charts are rendered
output = widgets.Output()

# Link play widget to the decade slider
def on_play_change(change):
    """Move the decade slider to the label built from the Play widget's value.

    The label is '<value>-<value + 9>'; the slider is only updated when that
    label is one of its options, otherwise nothing happens.

    Parameters
    ----------
    change : traitlets change notification; `change.new` is the new Play value.
    """
    start_year = change.new
    label = f"{start_year}-{start_year+9}"
    if label not in decade_slider.options:
        return
    decade_slider.value = label
play.observe(on_play_change, names='value')

# Display widgets: the controls in one row, with the chart output area below
widgets_display = widgets.HBox([gender_dropdown, name_dropdown, dpt_dropdown, decade_slider, play])
display(widgets_display, output)

# Set observers: a gender change first refreshes the list of names,
# every other control redraws the charts directly
gender_dropdown.observe(update_names_dropdown, names='value')
name_dropdown.observe(update_map, names='value')
dpt_dropdown.observe(update_map, names='value')
decade_slider.observe(update_map, names='value')
# `on_play_change` is already registered on `play` right after its definition,
# so it is not registered a second time here.


HBox(children=(Dropdown(description='Gender:', options=('Male', 'Female'), value='Male'), Dropdown(description…

Output()