In [44]:
import pandas as pd
import numpy as np

In [57]:
df = pd.read_csv("./data/large_countries_2015.csv", sep=",", index_col=0)

In [48]:
df.index.name = 'country'

In [49]:
# Preview the frame with the country index turned into a regular column,
# WITHOUT mutating `df`. The groupby examples further down (dictionary and
# function keys) operate on the country-name index, so an in-place reset
# would break them when the notebook is run top to bottom on a fresh kernel.
df.reset_index().head()

In [58]:
df

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia


In [51]:
df["population"].mean()

375346161.6666667

In [52]:
df["fertility"].mean()

2.4374999999999996

In [39]:
df["continent"].value_counts()

Asia             7
North America    2
South America    1
Africa           1
Europe           1
Name: continent, dtype: int64

### 1.11.1 Examples

In [59]:
# Rescale the population column by a factor of one million and keep one decimal.
# Note: the column is overwritten, so re-running this cell rescales it again.
df['population'] = (df['population'] / 1000000).round(1)

In [60]:
df

Unnamed: 0,population,fertility,continent
Bangladesh,161.0,2.12,Asia
Brazil,207.8,1.78,South America
China,1376.0,1.57,Asia
India,1311.1,2.43,Asia
Indonesia,257.6,2.28,Asia
Japan,126.6,1.45,Asia
Mexico,127.0,2.13,North America
Nigeria,182.2,5.89,Africa
Pakistan,188.9,3.04,Asia
Philippines,100.7,2.98,Asia


In [61]:
# Calculate the average population size of the large countries
average_population = df['population'].mean()

In [62]:
average_population

375.34999999999997

In [54]:
# Mean population per continent, rounded to two decimal places
avg_pop_per_continent = (
    df.groupby('continent')['population']
    .mean()
    .round(2)
)

In [63]:
avg_pop_per_continent

continent
Africa           182.20
Asia             503.13
Europe           143.50
North America    224.40
South America    207.80
Name: population, dtype: float64

### 1.11.2 Split

In [64]:
# 1. by column
g1 = df.groupby('continent')
g1.groups

{'Africa': ['Nigeria'], 'Asia': ['Bangladesh', 'China', 'India', 'Indonesia', 'Japan', 'Pakistan', 'Philippines'], 'Europe': ['Russia'], 'North America': ['Mexico', 'United States'], 'South America': ['Brazil']}

In [65]:
# 2. by an array of equal length
industrialized = np.array([False, True, True, True, False, True, True, False, False, False, True, True])
g2 = df.groupby(industrialized)
g2.groups

{False: ['Bangladesh', 'Indonesia', 'Nigeria', 'Pakistan', 'Philippines'], True: ['Brazil', 'China', 'India', 'Japan', 'Mexico', 'Russia', 'United States']}

In [66]:
# 3. by a Dictionary with keys on the Index
language = {'Bangladesh':'BN', 'Brazil':'PT', 'China':'CN',
            'India':'BN', 'Indonesia':'MS', 'Japan':'JP',
            'Mexico':'ES', 'Nigeria':'NG', 'Pakistan':'UR',
            'Philippines':'PP', 'Russia':'RU', 'United States':'EN'}
g3 = df.groupby(language)
g3.groups

{'BN': ['Bangladesh', 'India'], 'CN': ['China'], 'EN': ['United States'], 'ES': ['Mexico'], 'JP': ['Japan'], 'MS': ['Indonesia'], 'NG': ['Nigeria'], 'PP': ['Philippines'], 'PT': ['Brazil'], 'RU': ['Russia'], 'UR': ['Pakistan']}

In [67]:
# 4. by a function
g4 = df.groupby(len)
g4.groups

{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [68]:
# 5. a list of the above
g5 = df.groupby(['continent', language, len])
g5.groups

{('Africa', 'NG', 7): ['Nigeria'], ('Asia', 'BN', 5): ['India'], ('Asia', 'BN', 10): ['Bangladesh'], ('Asia', 'CN', 5): ['China'], ('Asia', 'JP', 5): ['Japan'], ('Asia', 'MS', 9): ['Indonesia'], ('Asia', 'PP', 11): ['Philippines'], ('Asia', 'UR', 8): ['Pakistan'], ('Europe', 'RU', 6): ['Russia'], ('North America', 'EN', 13): ['United States'], ('North America', 'ES', 6): ['Mexico'], ('South America', 'PT', 6): ['Brazil']}

In [69]:
# 6. group along the x-axis
# The two numeric columns are transposed first, so the country names become
# the column labels; with `axis=1` the key function `len` is then applied to
# those column labels instead of the row index.
# NOTE(review): `axis=1` in `groupby` is deprecated in recent pandas releases
# (2.1+) — confirm against the installed version before relying on it.
g6 = df[['population', 'fertility']].transpose().groupby(len, axis=1)
g6.groups

{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [73]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs
for continent_name, continent_frame in df.groupby('continent'):
    print(continent_name, continent_frame, '\n')

Africa          population  fertility continent
Nigeria       182.2       5.89    Africa 

Asia              population  fertility continent
Bangladesh        161.0       2.12      Asia
China            1376.0       1.57      Asia
India            1311.1       2.43      Asia
Indonesia         257.6       2.28      Asia
Japan             126.6       1.45      Asia
Pakistan          188.9       3.04      Asia
Philippines       100.7       2.98      Asia 

Europe         population  fertility continent
Russia       143.5       1.61    Europe 

North America                population  fertility      continent
Mexico              127.0       2.13  North America
United States       321.8       1.97  North America 

South America         population  fertility      continent
Brazil       207.8       1.78  South America 



### 1.11.3 Apply

In [74]:
g = df.groupby('continent')

In [75]:
# standard aggregation functions
# Only the last expression of a cell is displayed, so the results of the calls
# below are computed and then discarded; run any of them on its own line/cell
# to inspect its output.
g.mean()
g.max()
g.min()
g.sum()
g.count()
g.std()
g.median()
g.quantile(0.9)
g.describe()

# Aggregation with selecting columns
# (this is the expression whose result is shown as the cell output)
g['population'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,1.0,182.2,,182.2,182.2,182.2,182.2,182.2
Asia,7.0,503.128571,576.55886,100.7,143.8,188.9,784.35,1376.0
Europe,1.0,143.5,,143.5,143.5,143.5,143.5,143.5
North America,2.0,224.4,137.744401,127.0,175.7,224.4,273.1,321.8
South America,1.0,207.8,,207.8,207.8,207.8,207.8,207.8


In [76]:
# Aggregation with a list of function names
g.agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,population,population,population,fertility,fertility,fertility
Unnamed: 0_level_1,count,mean,std,count,mean,std
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Africa,1,182.2,,1,5.89,
Asia,7,503.128571,576.55886,7,2.267143,0.620154
Europe,1,143.5,,1,1.61,
North America,2,224.4,137.744401,2,2.05,0.113137
South America,1,207.8,,1,1.78,


In [83]:
g.agg([('Total', 'sum')])        # includes label

Unnamed: 0_level_0,population,fertility
Unnamed: 0_level_1,Total,Total
continent,Unnamed: 1_level_2,Unnamed: 2_level_2
Africa,182.2,5.89
Asia,3521.9,15.87
Europe,143.5,1.61
North America,448.8,4.1
South America,207.8,1.78


In [84]:
# custom aggregation function with parameter
def sum_greater(dataframe, threshold):
    """Sum the rows whose first column exceeds ``threshold``.

    Rows are selected by comparing the *first* column of ``dataframe``
    against ``threshold``; every column of the selected rows is then summed.
    (The previous implementation looped over all columns but returned during
    the first iteration, so only the first column was ever used as filter.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to aggregate, e.g. the sub-frame of one group when the function
        is passed to ``GroupBy.agg`` together with a keyword argument.
    threshold : scalar
        Exclusive lower bound applied to the first column.

    Returns
    -------
    pandas.Series
        Column-wise sums of the selected rows. A frame without any columns
        yields an empty result instead of an implicit ``None``.
    """
    if len(dataframe.columns) == 0:
        # Nothing to filter on: return an empty sum rather than falling
        # through the function and implicitly returning None.
        return dataframe.sum()

    filter_column = dataframe.columns[0]
    return dataframe[dataframe[filter_column] > threshold].sum()
    
g.agg(sum_greater, threshold=200)

Unnamed: 0_level_0,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.0,0.0
Asia,2944.7,6.28
Europe,0.0,0.0
North America,321.8,1.97
South America,207.8,1.78


In [85]:
# Transformation by function name
g.transform('mean')

Unnamed: 0,population,fertility
Bangladesh,503.128571,2.267143
Brazil,207.8,1.78
China,503.128571,2.267143
India,503.128571,2.267143
Indonesia,503.128571,2.267143
Japan,503.128571,2.267143
Mexico,224.4,2.05
Nigeria,182.2,5.89
Pakistan,503.128571,2.267143
Philippines,503.128571,2.267143


In [86]:
# Transformation by function reference
g.transform(len)

Unnamed: 0,population,fertility
Bangladesh,7,7
Brazil,1,1
China,7,7
India,7,7
Indonesia,7,7
Japan,7,7
Mexico,2,2
Nigeria,1,1
Pakistan,7,7
Philippines,7,7


In [87]:
# Transformation with your own function
def normalize(array):
    """Centre ``array`` on zero by subtracting its own mean.

    No scaling is applied; the result has the same shape as the input and
    each element is shifted by ``array.mean()``.
    """
    mean_value = array.mean()
    centred = array - mean_value
    return centred

g.transform(normalize)

Unnamed: 0,population,fertility
Bangladesh,-342.128571,-0.147143
Brazil,0.0,0.0
China,872.871429,-0.697143
India,807.971429,0.162857
Indonesia,-245.528571,0.012857
Japan,-376.528571,-0.817143
Mexico,-97.4,0.08
Nigeria,0.0,0.0
Pakistan,-314.228571,0.772857
Philippines,-402.428571,0.712857


In [88]:
# apply any function
def first_two(df):
    """Return the first two rows of ``df`` (fewer if it has fewer rows)."""
    leading_rows = df.head(n=2)
    return leading_rows

g.apply(first_two)

Unnamed: 0_level_0,Unnamed: 1_level_0,population,fertility,continent
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Nigeria,182.2,5.89,Africa
Asia,Bangladesh,161.0,2.12,Asia
Asia,China,1376.0,1.57,Asia
Europe,Russia,143.5,1.61,Europe
North America,Mexico,127.0,2.13,North America
North America,United States,321.8,1.97,North America
South America,Brazil,207.8,1.78,South America


## Recap: Solve with One-Liners

In [131]:
# Read the file ./data/penguins_simple.csv (semicolon-separated)
# Note: this rebinds `df`, replacing the countries frame used in the sections above.
df = pd.read_csv("./data/penguins_simple.csv", sep=";")
df

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Adelie,39.1,18.7,181.0,3750.0,MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,36.7,19.3,193.0,3450.0,FEMALE
4,Adelie,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...
328,Gentoo,47.2,13.7,214.0,4925.0,FEMALE
329,Gentoo,46.8,14.3,215.0,4850.0,FEMALE
330,Gentoo,50.4,15.7,222.0,5750.0,MALE
331,Gentoo,45.2,14.8,212.0,5200.0,FEMALE


In [132]:
def clean_column_names(cols):
    """Normalise column labels: drop parentheses, use underscores, lowercase.

    Parameters
    ----------
    cols : pandas.Index (or Series) of str
        Labels to clean, e.g. ``df.columns``.

    Returns
    -------
    Same type as ``cols`` with ``(`` and ``)`` removed, spaces replaced by
    ``_`` and all characters lowercased.
    """
    # `regex=False` makes the replacements literal on every pandas version:
    # '(' and ')' are regex metacharacters, and relying on the implicit
    # default of `regex` triggers a FutureWarning on older releases.
    return (
        cols.str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
        .str.replace(' ', '_', regex=False)
        .str.lower()
    )

df.columns = clean_column_names(df.columns)

  return cols.str.replace('(', '').str.replace(')', '').str.replace(' ', '_').str.lower()


In [133]:
# Who is the 7th penguin?
df[6:7]

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
6,Adelie,39.2,19.6,195.0,4675.0,MALE


In [134]:
# How many Gentoos are in the dataset?
# .shape is (rows, columns); its first element is the number of Gentoo rows.
df[df["species"] == "Gentoo"].shape

(119, 6)

In [135]:
# Who is the heaviest penguin?
# Sort by body mass with the largest value first and keep only the top row.
df.sort_values("body_mass_g", ascending=False)[0:1]

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
231,Gentoo,49.2,15.2,221.0,6300.0,MALE


In [137]:
# What is the mean size of female penguins?
# "Size" is measured here as body mass: the mean of body_mass_g over the rows with sex == "FEMALE".
df[df["sex"] == "FEMALE"]["body_mass_g"].mean()

3862.2727272727275

In [155]:
# How many penguins are heavier than 3 kg?
# body_mass_g is in grams, hence the threshold of 3000; the first element of
# .shape is the number of matching rows.
df[df["body_mass_g"] > 3000].shape

(323, 6)

In [158]:
# Select 3 random penguins.
# No random_state is passed, so the rows shown will generally differ between runs.
df.sample(3)

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
63,Adelie,41.8,19.4,198.0,4450.0,MALE
186,Chinstrap,49.0,19.5,210.0,3950.0,MALE
214,Gentoo,46.1,13.2,211.0,4500.0,FEMALE
