# Funciones de agregación y análisis de datos

In [1]:
import numpy as np
import pandas as pd

# Load the countries dataset from CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data/countries.csv")

In [2]:
# Trim surrounding whitespace from the Region labels so that equality filters
# and group keys match the bare region names.
df["Region"] = df["Region"].str.strip(to_strip=None)

# Sum of the Population column over every row of the dataset.
total_population = df.loc[:, "Population"].sum()

Cálculo manual de la proporción de la población total que corresponde a cada región.

In [None]:
# Manual computation: population of three selected regions expressed as a
# fraction of total_population (the sum of the Population column, computed
# in an earlier cell).

# Total Population for Region EASTERN EUROPE.
# A single .loc[rows, column] selection replaces the chained df[mask][col].
# NOTE(review): "estearn" looks like a typo of "eastern"; the name is kept
# unchanged in case other cells reference it.
estearn_europe_pop = df.loc[df["Region"] == "EASTERN EUROPE", "Population"].sum()

# Total Population for Region NORTHERN AFRICA.
northern_africa_pop = df.loc[df["Region"] == "NORTHERN AFRICA", "Population"].sum()

# Total Population for Region OCEANIA.
oceania_pop = df.loc[df["Region"] == "OCEANIA", "Population"].sum()

# Proportion for each Region, in the order listed above.
# The totals are wrapped in an explicit array so the element-wise division
# does not rely on a plain Python list being divided by a NumPy scalar.
pop_by_region = np.array([estearn_europe_pop, northern_africa_pop, oceania_pop]) / total_population
pop_by_region

Cálculo de proporciones usando agrupación.

In [None]:
# Total Population per Region: one entry for each distinct Region label.
grouped_pop_by_region = df["Population"].groupby(df["Region"]).sum()

# Share of each Region: every regional total divided by the sum of all
# regional totals.
pop_by_region = grouped_pop_by_region.div(grouped_pop_by_region.sum())
pop_by_region

In [None]:
# Total Population for every (Region, Climate) combination present in the
# data; the result is indexed by both grouping keys.
pop_by_region_and_climate = df.groupby(by=["Region", "Climate"])["Population"].agg("sum")
pop_by_region_and_climate

In [None]:
# Per-Region summary of GDP ($ per capita): min, max, mean and median.
# The aggregations are passed by name instead of as NumPy callables
# (np.min, np.max, np.mean, np.median): recent pandas releases emit a
# FutureWarning when NumPy functions are handed to GroupBy.agg, and the
# string names select the same built-in group-wise reductions.
gdp_stats = df.groupby("Region")["GDP ($ per capita)"].agg(["min", "max", "mean", "median"])

# Display gdp_stats (one row per Region, one column per statistic)
gdp_stats

In [None]:
# Per-Region summary of GDP ($ per capita) and Area (sq. mi.): min, max,
# mean and median of each column.
# The aggregations are passed by name instead of as NumPy callables
# (np.min, np.max, np.mean, np.median): recent pandas releases emit a
# FutureWarning when NumPy functions are handed to GroupBy.agg, and the
# string names select the same built-in group-wise reductions.
gdp_area_stats = df.groupby("Region")[["GDP ($ per capita)", "Area (sq. mi.)"]].agg(
    ["min", "max", "mean", "median"]
)

# Display gdp_area_stats (columns are a two-level index: variable, statistic)
gdp_area_stats

## Pivot Table

Con .pivot_table() se puede crear una tabla de valores medios por categoría.

In [None]:
# Mean GDP ($ per capita) per Region; no aggfunc is given, so pivot_table
# applies its default aggregation (the mean).
mean_GDP_by_region = pd.pivot_table(df, index="Region", values="GDP ($ per capita)")

# Display mean_GDP_by_region
mean_GDP_by_region

Además del valor medio que se calcula por defecto, también se pueden aplicar otras funciones de agregación mediante el parámetro aggfunc.

In [None]:
# Pivot for mean and median GDP ($ per capita) for each Region.
# The aggregations are passed by name instead of as np.mean / np.median:
# recent pandas releases emit a FutureWarning when NumPy callables are
# given as aggfunc, and the string names select the same reductions.
mean_med_GDP_by_region = df.pivot_table(
    values="GDP ($ per capita)",
    index="Region",
    aggfunc=["mean", "median"],
)

# Display mean_med_GDP_by_region
mean_med_GDP_by_region

Usando el parámetro columns se puede agregar una agrupación adicional.

In [None]:
# Mean GDP ($ per capita) cross-tabulated by Region (rows) and Climate
# (columns); combinations with no data are left as missing values.
mean_GDP_by_region_climate = pd.pivot_table(
    df,
    index="Region",
    columns="Climate",
    values="GDP ($ per capita)",
)

# Display mean_GDP_by_region_climate
mean_GDP_by_region_climate

Los valores nulos de la tabla resultante se pueden tratar usando el parámetro fill_value: de esta manera se reemplazan por el valor indicado.

In [None]:
# Mean GDP ($ per capita) cross-tabulated by Region (rows) and Climate
# (columns); cells that would otherwise be missing are filled with 0.
mean_GDP_by_region_climate = pd.pivot_table(
    df,
    index="Region",
    columns="Climate",
    values="GDP ($ per capita)",
    fill_value=0,
)

# Display mean_GDP_by_region_climate
mean_GDP_by_region_climate

Para agregar totales por filas o columnas se define el valor del parámetro margins en True.

In [None]:
# Pivot of summed GDP ($ per capita) by Region (rows) and Climate (columns):
# missing cells are filled with 0 and margins=True adds the row/column
# totals computed with the same aggregation.
# NOTE(review): the variable name says "mean" but the aggregation is a sum;
# the name is kept unchanged in case other cells reference it.
# "sum" is passed by name instead of np.sum: recent pandas releases emit a
# FutureWarning when NumPy callables are given as aggfunc. It stays wrapped
# in a list so the resulting column index keeps its aggregation level.
mean_GDP_by_region_climate = df.pivot_table(
    values="GDP ($ per capita)",
    index="Region",
    columns="Climate",
    fill_value=0,
    margins=True,
    aggfunc=["sum"],
)

# Display mean_GDP_by_region_climate
mean_GDP_by_region_climate