In [4]:
!pip install --no-input pandas



In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv("./data/large_countries_2015.csv", sep=",", index_col=0)

In [7]:
df.index.name = 'country'

In [8]:
df.reset_index(inplace=True)

In [9]:
df

Unnamed: 0,country,population,fertility,continent
0,Bangladesh,160995600.0,2.12,Asia
1,Brazil,207847500.0,1.78,South America
2,China,1376049000.0,1.57,Asia
3,India,1311051000.0,2.43,Asia
4,Indonesia,257563800.0,2.28,Asia
5,Japan,126573500.0,1.45,Asia
6,Mexico,127017200.0,2.13,North America
7,Nigeria,182202000.0,5.89,Africa
8,Pakistan,188924900.0,3.04,Asia
9,Philippines,100699400.0,2.98,Asia


In [10]:
df["population"].mean()

375346161.6666667

In [11]:
df["fertility"].mean()

2.4374999999999996

In [12]:
df["continent"].value_counts()

Asia             7
North America    2
South America    1
Africa           1
Europe           1
Name: continent, dtype: int64

### 1.11.1 Examples

In [13]:
df['population'] = df['population'] / 1000000
df['population'] = round(df['population'], 1)

In [14]:
df

Unnamed: 0,country,population,fertility,continent
0,Bangladesh,161.0,2.12,Asia
1,Brazil,207.8,1.78,South America
2,China,1376.0,1.57,Asia
3,India,1311.1,2.43,Asia
4,Indonesia,257.6,2.28,Asia
5,Japan,126.6,1.45,Asia
6,Mexico,127.0,2.13,North America
7,Nigeria,182.2,5.89,Africa
8,Pakistan,188.9,3.04,Asia
9,Philippines,100.7,2.98,Asia


In [15]:
# Calculate the average population size of the large countries
average_population = df['population'].mean()

In [16]:
average_population

375.34999999999997

In [17]:
# Calculate the average population size by continent
avg_pop_per_continent = round(df.groupby('continent')['population'].mean(), 2)

In [18]:
avg_pop_per_continent

continent
Africa           182.20
Asia             503.13
Europe           143.50
North America    224.40
South America    207.80
Name: population, dtype: float64

### 1.11.2 Split

In [19]:
# 1. by column
g1 = df.groupby('continent')
g1.groups

{'Africa': [7], 'Asia': [0, 2, 3, 4, 5, 8, 9], 'Europe': [10], 'North America': [6, 11], 'South America': [1]}

In [20]:
# 2. by an array of equal length
industrialized = np.array([False, True, True, True, False, True, True, False, False, False, True, True])
g2 = df.groupby(industrialized)
g2.groups

{False: [0, 4, 7, 8, 9], True: [1, 2, 3, 5, 6, 10, 11]}

In [21]:
# 3. by a Dictionary with keys on the Index
language = {'Bangladesh':'BN', 'Brazil':'PT', 'China':'CN',
            'India':'BN', 'Indonesia':'MS', 'Japan':'JP',
            'Mexico':'ES', 'Nigeria':'NG', 'Pakistan':'UR',
            'Philippines':'PP', 'Russia':'RU', 'United States':'EN'}
g3 = df.groupby(language)
g3.groups

{}

In [22]:
# 4. by a function
g4 = df.groupby(len)
g4.groups

TypeError: object of type 'int' has no len()

In [None]:
# 5. a list of the above
g5 = df.groupby(['continent', language, len])
g5.groups

In [None]:
# 6. group along the x-axis
g6 = df[['population', 'fertility']].transpose().groupby(len, axis=1)
g6.groups

In [None]:
for i, df_group in df.groupby('continent'):
    print(i, df_group, '\n')

### 1.11.3 Apply

In [23]:
g = df.groupby('continent')

In [24]:
# standard aggregation functions
g.mean()
g.max()
g.min()
g.sum()
g.count()
g.std()
g.median()
g.quantile(0.9)
g.describe()

# Aggregation with selecting columns
g['population'].describe()

  g.quantile(0.9)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,1.0,182.2,,182.2,182.2,182.2,182.2,182.2
Asia,7.0,503.128571,576.55886,100.7,143.8,188.9,784.35,1376.0
Europe,1.0,143.5,,143.5,143.5,143.5,143.5,143.5
North America,2.0,224.4,137.744401,127.0,175.7,224.4,273.1,321.8
South America,1.0,207.8,,207.8,207.8,207.8,207.8,207.8


In [25]:
# Aggregation with a list of function names
g.agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,population,population,population,fertility,fertility,fertility
Unnamed: 0_level_1,count,mean,std,count,mean,std
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Africa,1,182.2,,1,5.89,
Asia,7,503.128571,576.55886,7,2.267143,0.620154
Europe,1,143.5,,1,1.61,
North America,2,224.4,137.744401,2,2.05,0.113137
South America,1,207.8,,1,1.78,


In [26]:
g.agg([('Total', 'sum')])        # includes label

Unnamed: 0_level_0,country,population,fertility
Unnamed: 0_level_1,Total,Total,Total
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Africa,Nigeria,182.2,5.89
Asia,BangladeshChinaIndiaIndonesiaJapanPakistanPhil...,3521.9,15.87
Europe,Russia,143.5,1.61
North America,MexicoUnited States,448.8,4.1
South America,Brazil,207.8,1.78


In [27]:
# custom aggregation function with parameter
def sum_greater(dataframe, threshold):
    for column in dataframe.columns:
        return dataframe[dataframe[column]>threshold].sum()
    
g.agg(sum_greater, threshold=200)

TypeError: '>' not supported between instances of 'str' and 'int'

In [28]:
# Transformation by function name
g.transform('mean')

  g.transform('mean')


Unnamed: 0,population,fertility
0,503.128571,2.267143
1,207.8,1.78
2,503.128571,2.267143
3,503.128571,2.267143
4,503.128571,2.267143
5,503.128571,2.267143
6,224.4,2.05
7,182.2,5.89
8,503.128571,2.267143
9,503.128571,2.267143


In [29]:
# Transformation by function reference
g.transform(len)

Unnamed: 0,country,population,fertility
0,7,7,7
1,1,1,1
2,7,7,7
3,7,7,7
4,7,7,7
5,7,7,7
6,2,2,2
7,1,1,1
8,7,7,7
9,7,7,7


In [30]:
# Transformation with your own function
def normalize(array):
    return array - array.mean()

g.transform(normalize)

  g.transform(normalize)


Unnamed: 0,population,fertility
0,-342.128571,-0.147143
1,0.0,0.0
2,872.871429,-0.697143
3,807.971429,0.162857
4,-245.528571,0.012857
5,-376.528571,-0.817143
6,-97.4,0.08
7,0.0,0.0
8,-314.228571,0.772857
9,-402.428571,0.712857


In [31]:
# apply any function
def first_two(df):
    return df.head(2)

g.apply(first_two)

Unnamed: 0_level_0,Unnamed: 1_level_0,country,population,fertility,continent
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,7,Nigeria,182.2,5.89,Africa
Asia,0,Bangladesh,161.0,2.12,Asia
Asia,2,China,1376.0,1.57,Asia
Europe,10,Russia,143.5,1.61,Europe
North America,6,Mexico,127.0,2.13,North America
North America,11,United States,321.8,1.97,North America
South America,1,Brazil,207.8,1.78,South America


## Recap: Solve with One-Liners

In [32]:
# Read the file ../data/penguins_simple.csv
df = pd.read_csv("./data/penguins_simple.csv", sep=";")
df

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Adelie,39.1,18.7,181.0,3750.0,MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,36.7,19.3,193.0,3450.0,FEMALE
4,Adelie,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...
328,Gentoo,47.2,13.7,214.0,4925.0,FEMALE
329,Gentoo,46.8,14.3,215.0,4850.0,FEMALE
330,Gentoo,50.4,15.7,222.0,5750.0,MALE
331,Gentoo,45.2,14.8,212.0,5200.0,FEMALE


In [33]:
def clean_column_names(cols):
    return cols.str.replace('(', '').str.replace(')', '').str.replace(' ', '_').str.lower()

df.columns = clean_column_names(df.columns)

  return cols.str.replace('(', '').str.replace(')', '').str.replace(' ', '_').str.lower()


In [34]:
# Who is the 7th penguin?
df[6:7]

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
6,Adelie,39.2,19.6,195.0,4675.0,MALE


In [35]:
# How many Gentoos are in the dataset?
df[df["species"] == "Gentoo"].shape

(119, 6)

In [36]:
# Who is the heaviest penguin
df.sort_values("body_mass_g", ascending=False)[0:1]

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
231,Gentoo,49.2,15.2,221.0,6300.0,MALE


In [37]:
# What is the mean size of female penguins?
df[df["sex"] == "FEMALE"]["body_mass_g"].mean()

3862.2727272727275

In [38]:
# How many penguins are heavier than 3kg?
# df[df["body_mass_g" > 3000]].len(df.index)
df[df["body_mass_g"] > 3000].shape

(323, 6)

In [39]:
# Select 3 random penguins.
df.sample(3)

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
46,Adelie,35.0,17.9,190.0,3450.0,FEMALE
81,Adelie,36.9,18.6,189.0,3500.0,FEMALE
110,Adelie,38.6,17.0,188.0,2900.0,FEMALE
