# Visualization n° 1

## Loading the data

In [44]:
import altair as alt
import os
import pandas as pd
import numpy as np
import random 
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
# Let Altair/Vega-Lite work with large data sets.
alt.data_transformers.enable("json")

# Select the default renderer. The trailing semicolon keeps the notebook from
# echoing the object returned by enable() as the cell output.
alt.renderers.enable("default");

In [45]:
# Load the semicolon-separated first-name counts and, in a single non-mutating
# chain, discard the rows whose name is the '_PRENOMS_RARES' marker or whose
# département code is 'XX'. Filtering with a boolean mask keeps exactly the rows
# the two former in-place drops kept, without mutating a frame after creation.
names = (
    pd.read_csv("dpt2020.csv", sep=";")
    .loc[lambda df: (df["preusuel"] != "_PRENOMS_RARES") & (df["dpt"] != "XX")]
)

# Preview five random rows of the cleaned data.
names.sample(5)

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
577796,1,FRÉDÉRIC,1999,3,4
2217524,2,COLETTE,1957,54,40
2448874,2,FIONA,1993,49,7
1561390,1,TAYLOR,2013,69,4
837291,1,JEAN-PIERRE,1967,27,22


In [46]:
# Read the départements GeoJSON file with geopandas.
# NOTE(review): `depts` is not referenced by any other cell visible in this section.
depts = gpd.read_file('departements-version-simplifiee.geojson')

## Data processing and Visualization

In [47]:
# Total per (year, first name), collapsing the per-département rows.
# numeric_only=True restricts the aggregation to the numeric columns: since
# pandas 2.0 the default is False, in which case the string-valued `dpt` codes
# would be "summed" (concatenated) for every group instead of being left out.
# NOTE: the resulting `sexe` column is merely the sum of the sex codes of the
# aggregated rows and carries no meaning; the cells below only use `nombre`.
grouped = names.groupby(['annais', 'preusuel'], as_index=False).sum(numeric_only=True)
grouped

Unnamed: 0,annais,preusuel,sexe,nombre
0,1900,ABEL,51,382
1,1900,ABRAHAM,2,9
2,1900,ACHILLE,18,152
3,1900,ACHILLES,1,4
4,1900,ADAM,1,9
...,...,...,...,...
249121,2020,ÉVA,60,156
249122,2020,ÉVAN,16,62
249123,2020,ÉZIO,4,12
249124,2020,ÉZÉCHIEL,3,11


In [48]:
# Total count per first name over all years, ranked from most to least popular.
# The ranking is an ascending sort read backwards; this exact ordering (ties
# included) determines which names the seeded sample below picks.
total_per_name = grouped.groupby('preusuel')["nombre"].sum()
popularity_names_ordered = total_per_name.sort_values().index[::-1].tolist()

# The ten most popular names.
top_10_names = popularity_names_ordered[:10]

# Five names drawn at random (fixed seed, hence reproducible) among all the
# other names, to also look at less popular ones.
remaining_names = popularity_names_ordered[10:]
random.seed(30)
random_names = random.sample(remaining_names, 5)

In [49]:
# Show both selections.
# NOTE(review): the aggregated table displayed earlier contains rows for 2020,
# so the "2019" in the first message may be off by one year -- confirm.
print(f"Top 10 names from 1900 to 2019 : \n{top_10_names}")
print(f"Random names sampled from the remaining dataset: \n{random_names}")

Top 10 names from 1900 to 2019 : 
['MARIE', 'JEAN', 'PIERRE', 'MICHEL', 'ANDRÉ', 'JEANNE', 'PHILIPPE', 'LOUIS', 'RENÉ', 'ALAIN']
Random names sampled from the remaining dataset: 
['ARIANNE', 'ROHIT', 'MARCELLO', 'CARMELIA', 'LOUANE']


In [50]:
# Extraction of the time series for the selected names.
# Series.isin performs the membership test in one vectorised call; it selects the
# same rows as comparing every value against every name in a per-row Python lambda.
subset1 = grouped.loc[grouped["preusuel"].isin(top_10_names)]
subset2 = grouped.loc[grouped["preusuel"].isin(random_names)]

In [51]:
# Stack the two selections row-wise into a single frame, ordered by year.
subset_concat = (
    pd.concat([subset1, subset2])
    .sort_values(by="annais")
)

In [52]:
# Peek at the first five rows of the selected subset (head() defaults to 5 rows).
subset_concat.head()

Unnamed: 0,annais,preusuel,sexe,nombre
22,1900,ALAIN,6,83
76,1900,ANDRÉ,95,5534
535,1900,JEAN,96,14100
543,1900,JEANNE,188,13981
613,1900,LOUIS,94,9052


**NB**: The visualization is interactive, so you need to download the notebook and run its cells to interact with it.

- *Details about the choice of this visualization*: I chose a line chart because it is well suited to time series. To avoid the spaghetti effect and to make the evolution of each name's popularity clearly readable, I made the visualization interactive. Finally, Altair seemed a natural choice because it is easy to use and integrates well with Jupyter notebooks.


- *Answers to questions about Visualization 1*: We notice that the popularity of some names in the top 10 follows a bell shape (e.g. Alain, Philippe, Jean): it increases gradually until it reaches a maximum, then decreases gradually in the same way. Consequently, most names do not have a constant popularity over time. Some first names were also never successful, as we can see for Rohit or Carmelia.
    
    We can also see that some first names enjoyed great popularity over a short period, such as Philippe between 1955 and 1970. We can also notice that this same first name has not been popular at all for about twenty years. Finally, we can observe trends over time that seem linear, and sometimes polynomial (degree 2), for instance for Alain and Philippe.

*Source* : 

Code inspired from : https://developers.google.com/earth-engine/tutorials/community/time-series-visualization-with-altair

In [74]:
# Hover selection keyed on the first name: it follows the data point nearest to
# the mouse pointer.
highlight = alt.selection(
    type='single',
    on='mouseover',
    fields=['preusuel'],
    nearest=True,
)

# Encodings shared by both layers: year on x, count on y, one colour per name.
chart_title = "LineChart of the evolution of the popularity of the top 10 names + 5 random names"
base = alt.Chart(subset_concat, title=chart_title).encode(
    x=alt.X('annais:T', scale=alt.Scale(clamp=True)),
    y=alt.Y('nombre:Q', scale=alt.Scale()),
    color=alt.Color('preusuel:N', scale=alt.Scale(scheme='magma')),
)

# Fully transparent circles: they are invisible but carry the tooltip and host
# the hover selection.
hover_tooltip = [
    alt.Tooltip('preusuel:N', title='Prenoms'),
    alt.Tooltip('annais:T', title='Année'),
    alt.Tooltip('nombre:Q', title='Nombre'),
]
points = (
    base.mark_circle()
    .encode(opacity=alt.value(0), tooltip=hover_tooltip)
    .add_selection(highlight)
)

# Lines: size 1 for series outside the hover selection, size 3 for the selected one.
lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3)),
)

# Layer the two marks, set the view size and make the chart interactive.
alt.layer(points, lines).properties(width=600, height=350).interactive()


![](LineChart_DataVis_1.png)
