# Grouper un dataframe et manipuler les groupes

In [1]:
# Package to install if it is missing (uncomment the line below;
# %pip installs into the environment of the running kernel)
# %pip install pydataset

In [6]:
# Import package
import pandas as pd
import numpy as np

from pydataset import data

In [7]:
# Load the 'iris' dataset; reset_index(drop=True) discards the dataset's own
# index and replaces it with a default 0-based integer index.
iris_data = data('iris').reset_index(drop=True)

In [8]:
# Preview the first rows of the frame (head() shows 5 rows by default)
iris_data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [9]:
# Distinct species labels present in the data
iris_data["Species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

# Indices des groupes

In [10]:
# Build the GroupBy object: one group per distinct value of "Species"
iris_grouped = iris_data.groupby(by="Species")

In [11]:
# Mapping from each group key to the row positions belonging to that group
iris_grouped.indices

{'setosa': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       dtype=int64),
 'versicolor': array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
       dtype=int64),
 'virginica': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
        139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], dtype=int64)}

In [13]:
# Extract a single group as a DataFrame and inspect its dimensions
iris_grouped.get_group(name="versicolor").shape

(50, 5)

In [14]:
# Number of rows per species, computed directly on the ungrouped column
iris_data["Species"].value_counts()

setosa        50
versicolor    50
virginica     50
Name: Species, dtype: int64

# Appliquer des fonctions

## `apply()` function to groups

In [15]:
# Count the observations in each group: apply() hands every group's
# sub-frame to the callable, whose length is its number of rows.
iris_grouped.apply(lambda group: len(group))

Species
setosa        50
versicolor    50
virginica     50
dtype: int64

## `aggregate()` function

In [16]:
# Apply the same set of aggregation functions to every column of each group
iris_grouped.aggregate(["min", "max", "mean"])

Unnamed: 0_level_0,Sepal.Length,Sepal.Length,Sepal.Length,Sepal.Width,Sepal.Width,Sepal.Width,Petal.Length,Petal.Length,Petal.Length,Petal.Width,Petal.Width,Petal.Width
Unnamed: 0_level_1,min,max,mean,min,max,mean,min,max,mean,min,max,mean
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
setosa,4.3,5.8,5.006,2.3,4.4,3.428,1.0,1.9,1.462,0.1,0.6,0.246
versicolor,4.9,7.0,5.936,2.0,3.4,2.77,3.0,5.1,4.26,1.0,1.8,1.326
virginica,4.9,7.9,6.588,2.2,3.8,2.974,4.5,6.9,5.552,1.4,2.5,2.026


In [17]:
# Apply a different set of aggregation functions to each selected column
iris_grouped.aggregate(
    {
        "Sepal.Length": ["min", "max", "mean"],
        "Sepal.Width": ["median", "std"],
    }
)

Unnamed: 0_level_0,Sepal.Length,Sepal.Length,Sepal.Length,Sepal.Width,Sepal.Width
Unnamed: 0_level_1,min,max,mean,median,std
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
setosa,4.3,5.8,5.006,3.4,0.379064
versicolor,4.9,7.0,5.936,2.8,0.313798
virginica,4.9,7.9,6.588,3.0,0.322497


## `transform()` function

In [18]:
# Group-wise transform: subtract each group's minimum from its values,
# so the result keeps the shape and row index of the original frame.
iris_grouped.transform(lambda values: values.sub(values.min()))

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,0.8,1.2,0.4,0.1
1,0.6,0.7,0.4,0.1
2,0.4,0.9,0.3,0.1
3,0.3,0.8,0.5,0.1
4,0.7,1.3,0.4,0.1
5,1.1,1.6,0.7,0.3
6,0.3,1.1,0.4,0.2
7,0.7,1.1,0.5,0.1
8,0.1,0.6,0.4,0.1
9,0.6,0.8,0.5,0.0


# Obtenir les statistiques sur les groupes

In [19]:
# Number of rows (individuals) in each group
iris_grouped.size() 

Species
setosa        50
versicolor    50
virginica     50
dtype: int64

In [20]:
# Count the non-null values per group and per column
iris_grouped.count()

Unnamed: 0_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,50,50,50,50
versicolor,50,50,50,50
virginica,50,50,50,50


In [21]:
# First 2 rows of each group, returned with their original row labels
iris_grouped.head(n=2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
100,6.3,3.3,6.0,2.5,virginica
101,5.8,2.7,5.1,1.9,virginica


In [22]:
# Last 2 rows of each group: tail(n) is the counterpart of head(n)
iris_grouped.tail(n=2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
98,5.1,2.5,3.0,1.1,versicolor
99,5.7,2.8,4.1,1.3,versicolor
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [23]:
# Maximum of each column within each group
iris_grouped.max()

Unnamed: 0_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [24]:
# Minimum of each column within each group
iris_grouped.min()
# The same call pattern works for mean(), median(), std() and sum()

Unnamed: 0_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,4.3,2.3,1.0,0.1
versicolor,4.9,2.0,3.0,1.0
virginica,4.9,2.2,4.5,1.4


In [25]:
# Integer group number assigned to each row (one value per original row)
iris_grouped.ngroup()

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     0
43     0
44     0
45     0
46     0
47     0
48     0
49     0
50     1
51     1
52     1
53     1
54     1
55     1
56     1
57     1
58     1
59     1
60     1
61     1
62     1
63     1
64     1
65     1
66     1
67     1
68     1
69     1
70     1
71     1
72     1
73     1
74     1
75     1
76     1
77     1
78     1
79     1
80     1
81     1
82     1
83     1
84     1
85     1
86     1
87     1
88     1
89     1
90     1
91     1
92     1
93     1
94     1
95     1
96     1
97     1
98     1
99     1
100    2
101    2
102    2
103    2
104    2
105    2
106    2
107    2
108    2
109    2
110    2
1

In [28]:
# Get the LAST entry of each group (last() is the counterpart of first();
# use iris_grouped.first() to get the first entry instead)
iris_grouped.last()

Unnamed: 0_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.0,3.3,1.4,0.2
versicolor,5.7,2.8,4.1,1.3
virginica,5.9,3.0,5.1,1.8


In [27]:
# Get the n-th row of each group; n is 0-based
iris_grouped.nth(0)  # n=0 -> first row of each group (use nth(1) for the second)

Unnamed: 0_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.1,3.5,1.4,0.2
versicolor,7.0,3.2,4.7,1.4
virginica,6.3,3.3,6.0,2.5


In [29]:
# Correlation matrix of the numeric columns, computed separately for each group
iris_grouped.corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
setosa,Sepal.Length,1.0,0.742547,0.267176,0.278098
setosa,Sepal.Width,0.742547,1.0,0.1777,0.232752
setosa,Petal.Length,0.267176,0.1777,1.0,0.33163
setosa,Petal.Width,0.278098,0.232752,0.33163,1.0
versicolor,Sepal.Length,1.0,0.525911,0.754049,0.546461
versicolor,Sepal.Width,0.525911,1.0,0.560522,0.663999
versicolor,Petal.Length,0.754049,0.560522,1.0,0.786668
versicolor,Petal.Width,0.546461,0.663999,0.786668,1.0
virginica,Sepal.Length,1.0,0.457228,0.864225,0.281108
virginica,Sepal.Width,0.457228,1.0,0.401045,0.537728


In [30]:
# Draw a random sample of 3 rows from each group, without replacement.
# random_state pins the random draw so that Restart Kernel -> Run All
# returns the same rows every time; change or remove it for a fresh draw.
iris_grouped.sample(n=3, replace=False, random_state=42)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5,5.4,3.9,1.7,0.4,setosa
14,5.8,4.0,1.2,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
64,5.6,2.9,3.6,1.3,versicolor
71,6.1,2.8,4.0,1.3,versicolor
82,5.8,2.7,3.9,1.2,versicolor
122,7.7,2.8,6.7,2.0,virginica
123,6.3,2.7,4.9,1.8,virginica
131,7.9,3.8,6.4,2.0,virginica


# Ressources 

https://pandas.pydata.org/docs/reference/groupby.html