In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# dataset: original source -> https://data.worldbank.org/indicator/SP.POP.TOTL
# dataset: already-processed copy -> https://drive.google.com/file/d/1Iy245CZoPnaVuslJLvHTgE45nIWvKHG9/view?usp=sharing
# The CSV location can be overridden with the POBLACION_CSV_PATH environment
# variable so the notebook is not tied to one machine; when the variable is not
# set, the original local path is used unchanged.
dataset_path = os.environ.get(
    "POBLACION_CSV_PATH",
    "/home/david/ML_DL/datasets/population total - The World Bank/poblacion.csv",
)

In [3]:
# Load the population CSV into a DataFrame
df_pop = pd.read_csv(filepath_or_buffer=dataset_path)

In [4]:
# Show the frame as loaded (the rendered output is truncated to the first and last rows)
df_pop

Unnamed: 0,Country,year,pop
0,Afghanistan,2015,3.441360e+07
1,Albania,2015,2.880703e+06
2,Algeria,2015,3.972802e+07
3,American Samoa,2015,5.581200e+04
4,Andorra,2015,7.801100e+04
...,...,...,...
1035,Pre-demographic dividend,2018,9.194854e+08
1036,Small states,2018,4.057532e+07
1037,South Asia,2018,1.814389e+09
1038,South Asia (IDA & IBRD),2018,1.814389e+09


In [11]:
# Render floats with thousands separators and one decimal so population
# figures are easier to read than scientific notation
pd.set_option("display.float_format", "{:,.1f}".format)

In [12]:
# Show the frame again, now that the float display format has been changed
df_pop

Unnamed: 0,Country,year,pop
0,Afghanistan,2015,34413603.0
1,Albania,2015,2880703.0
2,Algeria,2015,39728025.0
3,American Samoa,2015,55812.0
4,Andorra,2015,78011.0
...,...,...,...
1035,Pre-demographic dividend,2018,919485393.0
1036,Small states,2018,40575321.0
1037,South Asia,2018,1814388744.0
1038,South Asia (IDA & IBRD),2018,1814388744.0


In [13]:
# Turn the year into a categorical variable whose labels are strings
df_pop["year"] = df_pop["year"].astype(str).astype("category")
df_pop["year"]

0       2015
1       2015
2       2015
3       2015
4       2015
        ... 
1035    2018
1036    2018
1037    2018
1038    2018
1039    2018
Name: year, Length: 1040, dtype: category
Categories (4, object): ['2015', '2016', '2017', '2018']

In [14]:
# Check the column dtypes after the conversion of `year`
df_pop.dtypes

Country      object
year       category
pop         float64
dtype: object

In [15]:
# Boolean mask (one flag per row) that is True only for the two selected countries
countries_of_interest = ["Aruba", "Colombia"]
idx_country_filter = df_pop["Country"].isin(countries_of_interest)
idx_country_filter

0       False
1       False
2       False
3       False
4       False
        ...  
1035    False
1036    False
1037    False
1038    False
1039    False
Name: Country, Length: 1040, dtype: bool

In [16]:
# Keep only the rows flagged by the mask to obtain the sample
df_sample = df_pop.loc[idx_country_filter]
df_sample

Unnamed: 0,Country,year,pop
9,Aruba,2015,104341.0
42,Colombia,2015,47520667.0
269,Aruba,2016,104872.0
302,Colombia,2016,48171392.0
529,Aruba,2017,105366.0
562,Colombia,2017,48901066.0
789,Aruba,2018,105845.0
822,Colombia,2018,49648685.0


In [17]:
# Index the sample by (Country, year); a hierarchical index makes some of the
# analyses below easier.
# The frame is rebuilt from df_pop and the mask rather than from df_sample
# itself, so re-running this cell gives the same result instead of raising a
# KeyError because "Country" and "year" are no longer columns.
df_sample = (
    df_pop[idx_country_filter]
    .set_index(["Country", "year"])
    .sort_index()
)
df_sample

Unnamed: 0_level_0,Unnamed: 1_level_0,pop
Country,year,Unnamed: 2_level_1
Aruba,2015,104341.0
Aruba,2016,104872.0
Aruba,2017,105366.0
Aruba,2018,105845.0
Colombia,2015,47520667.0
Colombia,2016,48171392.0
Colombia,2017,48901066.0
Colombia,2018,49648685.0


In [24]:
# Look only at Colombia: selecting on the first index level alone returns
# the rows for every year of that country
df_sample.loc["Colombia"]

Unnamed: 0_level_0,pop
year,Unnamed: 1_level_1
2015,47520667.0
2016,48171392.0
2017,48901066.0
2018,49648685.0


In [27]:
# Fetch a single record by giving a value for every index level
df_sample.loc[("Aruba", "2018"), :]

pop   105,845.0
Name: (Aruba, 2018), dtype: float64

In [28]:
# Filter on one named index level: every country for the year 2018
df_sample.xs("2018", axis=0, level="year", drop_level=True)

Unnamed: 0_level_0,pop
Country,Unnamed: 1_level_1
Aruba,105845.0
Colombia,49648685.0


In [32]:
# Shape of the array of distinct values in the Country column
pd.unique(df_pop["Country"]).shape

(260,)

In [34]:
# Apply the same (Country, year) hierarchical index to the whole dataframe,
# then sort both levels in ascending order
df_countries = df_pop.set_index(keys=["Country", "year"])
df_countries = df_countries.sort_index(ascending=[True, True])
df_countries

Unnamed: 0_level_0,Unnamed: 1_level_0,pop
Country,year,Unnamed: 2_level_1
Afghanistan,2015,34413603.0
Afghanistan,2016,35383128.0
Afghanistan,2017,36296400.0
Afghanistan,2018,37172386.0
Albania,2015,2880703.0
...,...,...
Zambia,2018,17351822.0
Zimbabwe,2015,13814629.0
Zimbabwe,2016,14030390.0
Zimbabwe,2017,14236745.0


In [35]:
# Select part of the dataset: a label range of countries and a label range of years
ids_f = pd.IndexSlice
country_year_window = ids_f["Aruba":"Austria", "2015":"2017"]
df_countries.loc[country_year_window, :].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,pop
Country,year,Unnamed: 2_level_1
Aruba,2015,104341.0
Aruba,2016,104872.0
Aruba,2017,105366.0
Australia,2015,23815995.0
Australia,2016,24190907.0
Australia,2017,24601860.0
Austria,2015,8642699.0
Austria,2016,8736668.0
Austria,2017,8797566.0


In [36]:
# Get the labels of the first index level (Country), one entry per row
df_countries.index.get_level_values("Country")

Index(['Afghanistan', 'Afghanistan', 'Afghanistan', 'Afghanistan', 'Albania',
       'Albania', 'Albania', 'Albania', 'Algeria', 'Algeria',
       ...
       'Yemen, Rep.', 'Yemen, Rep.', 'Zambia', 'Zambia', 'Zambia', 'Zambia',
       'Zimbabwe', 'Zimbabwe', 'Zimbabwe', 'Zimbabwe'],
      dtype='object', name='Country', length=1040)

In [38]:
# Start from the "pop" column, then pick one country's rows by label
country = "Colombia"
df_countries["pop"].loc[country]

year
2015   47,520,667.0
2016   48,171,392.0
2017   48,901,066.0
2018   49,648,685.0
Name: pop, dtype: float64

In [40]:
# Apply an aggregation over one specific index level: total population per year.
# NOTE(review): the dataset also contains aggregate rows (e.g. "South Asia",
# "Small states"), so these totals count the same people more than once.
df_countries.groupby(level="year").sum()

Unnamed: 0_level_0,pop
year,Unnamed: 1_level_1
2015,65679147019.0
2016,66487930677.0
2017,67294176701.0
2018,68087886692.0


In [44]:
# Mean population per Country label over the available years, largest first
(
    df_countries
    .groupby(level="Country")
    .mean()
    .sort_values(by='pop', ascending=False)
)

Unnamed: 0_level_0,pop
Country,Unnamed: 1_level_1
IDA & IBRD total,6295313187.2
Low & middle income,6266873327.0
Middle income,5587775965.2
IBRD only,4708714616.8
Early-demographic dividend,3186078338.8
...,...
Palau,17776.2
Nauru,12776.0
Tuvalu,11300.5
Eritrea,


In [48]:
# Pivot the "year" index level back into columns (one column per year).
# The guard makes the cell safe to re-run: once "year" has been unstacked it is
# no longer an index level, and calling unstack("year") again would fail.
if "year" in df_sample.index.names:
    df_sample = df_sample.unstack("year")
df_sample

Unnamed: 0_level_0,pop,pop,pop,pop
year,2015,2016,2017,2018
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Aruba,104341.0,104872.0,105366.0,105845.0
Colombia,47520667.0,48171392.0,48901066.0,49648685.0
