# Visualization n° 1

## Loading the data

In [1]:
import altair as alt
import os
import pandas as pd
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
alt.data_transformers.enable('json') # Let Altair/Vega-Lite work with large data sets

pass

In [2]:
names = pd.read_csv("dpt2020.csv", sep=";")
names.drop(names[names.preusuel == '_PRENOMS_RARES'].index, inplace=True)
names.drop(names[names.dpt == 'XX'].index, inplace=True)

names.sample(5)

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
3242575,2,MIREILLE,1959,12,12
1589263,1,THOMAS,1979,84,35
1729602,1,ZLATAN,2020,67,3
3338525,2,NOLWEN,1982,29,3
318233,1,CLAUDE,1978,38,14


In [3]:
depts = gpd.read_file('departements-version-simplifiee.geojson')

## Data processing and Visualization

In [4]:
grouped = names.groupby(['annais', 'preusuel'], as_index=False).sum()
grouped

Unnamed: 0,annais,preusuel,sexe,nombre
0,1900,ABEL,51,382
1,1900,ABRAHAM,2,9
2,1900,ACHILLE,18,152
3,1900,ACHILLES,1,4
4,1900,ADAM,1,9
...,...,...,...,...
249121,2020,ÉVA,60,156
249122,2020,ÉVAN,16,62
249123,2020,ÉZIO,4,12
249124,2020,ÉZÉCHIEL,3,11


In [5]:
top_5_names = grouped.groupby('preusuel')["nombre"].sum().sort_values()[::-1][:5].index.tolist()
top_6_10_names = grouped.groupby('preusuel')["nombre"].sum().sort_values()[::-1][5:10].index.tolist()
worst_5_names = grouped.groupby('preusuel')["nombre"].sum().sort_values()[:5].index.tolist()
print("Top 5 names from 1900 to 2019 : \n{}".format(top_5_names))
print("Top 6-10 names from 1900 to 2019 : \n{}".format(top_6_10_names))
print("Worst 5 names : ", worst_5_names)

Top 5 names from 1900 to 2019 : 
['MARIE', 'JEAN', 'PIERRE', 'MICHEL', 'ANDRÉ']
Top 6-10 names from 1900 to 2019 : 
['JEANNE', 'PHILIPPE', 'LOUIS', 'RENÉ', 'ALAIN']
Worst 5 names :  ['MEDEA', 'CHRISTOPH', 'LÉNORA', 'LÉNO', 'CHRISTOS']


In [6]:
subset1 = grouped.loc[grouped["preusuel"].apply(lambda x : any(x == name for name in top_5_names))]
subset2 = grouped.loc[grouped["preusuel"].apply(lambda x : any(x == name for name in top_6_10_names))]
subset3 = grouped.loc[grouped["preusuel"].apply(lambda x : any(x == name for name in worst_5_names))]

In [7]:
base1 = alt.Chart(subset1).mark_area(opacity=0.3).encode(
    x=alt.X("annais:T", title='Year'),
    y=alt.Y("nombre:Q", stack=None, title='Number of times the name was given'), 
    color=alt.Color("preusuel:N", title = "Top 10 names")
    ).properties(width=800, height=400
    )

base2 = alt.Chart(subset2).mark_area(opacity=0.3).encode(
    x=alt.X("annais:T", title='Year'),
    y=alt.Y("nombre:Q", stack=None, title='Number of times the name was given'), 
    color=alt.Color("preusuel:N")
    ).properties(width=800, height=400
    )

base3 = alt.Chart(subset3).mark_bar(opacity=0.3).encode(
    x=alt.X("annais:T", title='Year'),
    y=alt.Y("nombre:Q", stack=None, title='Number of times the name was given'), 
    color=alt.Color("preusuel:N", title = "Worst 5 names")
    ).properties(width=800, height=400
    )

In [8]:
base1 & base2

In [9]:
base3

*NB*: To see the charts, it is necessary to download the notebook and run the code (charts are note available directly from github.)

*Answers and Interpretation* : We notice that the first names tend to evolve in a bell shape. They gradually increase until they reach a maximum value before gradually decreasing in the same way. Most of the names do not have a constant popularity as explained before. Some first names also never had success as we can see on the graph 3 like Léno or Médea. 

We can also see that some first names have known a great popularity in a brief way as Philippe between 1955 and 1970. We can also notice that this same first name is absolutely not more popular since about twenty years. Finally, we can observe a trend in the time which seems to be polynomial and approximately of degree 2 for all the first names. 