In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
""" 8.1 Hierarchical Indexing """

' 8.1 Hierarchical Indexing '

In [4]:
# avoir plusieurs niveaux d'index sur un axe
data = pd.Series(np.random.randn(9),index=[['a','a','a','b','b','c','c','d','d'],[1,2,3,1,3,1,2,2,3]])

In [5]:
data

a  1    0.966055
   2   -0.111385
   3    1.160299
b  1   -1.042903
   3    0.337478
c  1    0.088853
   2    0.951830
d  2    0.195947
   3    1.712529
dtype: float64

In [6]:
#Details des niveaux
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [7]:
#selectionner un subset des données
data['b']

1   -1.042903
3    0.337478
dtype: float64

In [8]:
data['b':'c']

b  1   -1.042903
   3    0.337478
c  1    0.088853
   2    0.951830
dtype: float64

In [9]:
data.loc[['b','d']]

b  1   -1.042903
   3    0.337478
d  2    0.195947
   3    1.712529
dtype: float64

In [10]:
# selection à un niveau interne
data.loc[:,2]

a   -0.111385
c    0.951830
d    0.195947
dtype: float64

In [11]:
# Transformer les données en DataFrame
data.unstack()

Unnamed: 0,1,2,3
a,0.966055,-0.111385,1.160299
b,-1.042903,,0.337478
c,0.088853,0.95183,
d,,0.195947,1.712529


In [12]:
# ou l'inverse 'stack()'
data.unstack().stack()

a  1    0.966055
   2   -0.111385
   3    1.160299
b  1   -1.042903
   3    0.337478
c  1    0.088853
   2    0.951830
d  2    0.195947
   3    1.712529
dtype: float64

In [13]:
# Avec un DataFrame les 2 axes peuvent avoir un hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4,3)), index=[['a','a','b','b'],[1,2,1,2]], columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])

In [14]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [15]:
# il est possible de nommer les index hierarchiques
frame.index.names = ['key1','key2']

In [16]:
frame.columns.names = ['state','color']

In [17]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [18]:
#selectionner un subset
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [19]:
""" Reordering and Sorting Levels """

' Reordering and Sorting Levels '

In [20]:
# permuter 2 niveaux d'index
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [21]:
#ordonner les données selon l'ordre d'un niveau
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [22]:
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [23]:
""" Summary Statistics by level """

' Summary Statistics by level '

In [24]:
#somme par niveau d'index
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [25]:
frame.sum(level='color',axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [26]:
""" Indexing with a Dataframe Columns"""

' Indexing with a Dataframe Columns'

In [27]:
frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})

In [28]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [29]:
#creer un nouveau dataframe en utilisant les colonnes comme index
frame2 = frame.set_index(['c','d'])

In [30]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [31]:
#Par défaut les colonnes sont retirées, mais on peut les garder
frame.set_index(['c','d'],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [32]:
#placer les index hierarchiques dans les colonnes
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [33]:
""" 8.2 Combining and Merging Datasets"""

' 8.2 Combining and Merging Datasets'

In [34]:
"""      Database-Style Dataframe joins """

'      Database-Style Dataframe joins '

In [35]:
df1 =  pd.DataFrame({'key':['b','b','a','c','a','a','b'], 'data1' : range(7)})

In [36]:
df2 = pd.DataFrame({'key':['a','b','d'],'data2' : range(3)})

In [37]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [38]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [39]:
# join de type : many-to-one
pd.merge(df1,df2)#applique la jointure sur la colonne en commun par défaut

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [40]:
# il est de bonne pratique d'expliciter le point de jointure
pd.merge(df1,df2, on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [41]:
# si les noms de colonnes sont différents dans chaque objet il faut les spécifier séparemment

In [42]:
df3 =  pd.DataFrame({'lkey':['b','b','a','c','a','a','b'], 'data1' : range(7)})

In [43]:
df4 = pd.DataFrame({'rkey':['a','b','d'],'data2' : range(3)})

In [44]:
pd.merge(df3, df4, left_on='lkey',right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [45]:
# Par défaut merge() effectue un 'inner join'; 'c' et 'd' sont ignorés

In [46]:
#effectuer un merge par union (outer join)
pd.merge(df1,df2,how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [47]:
#merge many-to-many
df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})

In [48]:
df2 = pd.DataFrame({'key':['a','b','a','b','d'], 'data2':range(5)})

In [49]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [50]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,a
3,3,b
4,4,d


In [51]:
# renvoie un produit cartesien des lignes
pd.merge(df1,df2,on='key',how='left')

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,0,b,3.0
2,1,b,1.0
3,1,b,3.0
4,2,a,0.0
5,2,a,2.0
6,3,c,
7,4,a,0.0
8,4,a,2.0
9,5,b,1.0


In [52]:
# many-to-many par intersection
pd.merge(df1,df2,on='key',how='inner')

Unnamed: 0,data1,key,data2
0,0,b,1
1,0,b,3
2,1,b,1
3,1,b,3
4,5,b,1
5,5,b,3
6,2,a,0
7,2,a,2
8,4,a,0
9,4,a,2


In [53]:
# merge avec plusieurs clés
left = pd.DataFrame({'key1':['foo','foo','bar'],
                     'key2':['one','two','one'],
                     'lval':[1,2,3]})

In [54]:
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],
                     'key2':['one','one','one','two'],
                     'rval':[4,5,6,7]})

In [55]:
pd.merge(left,right,on=['key1','key2'],how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [56]:
#colonnes overlapping
pd.merge(left,right,on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [57]:
#utiliser des suffixes
pd.merge(left,right,on='key1',suffixes=('_left','_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [58]:
""" Merging on index"""

' Merging on index'

In [59]:
left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})

In [60]:
right1 = pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])

In [61]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [62]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [63]:
#utiliser l'index comme clé de jointure
pd.merge(left1, right1, left_on='key',right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [64]:
#version Union 
pd.merge(left1, right1, left_on='key',right_index=True,how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [65]:
""" Concatener le long d'un axis"""

" Concatener le long d'un axis"

In [66]:
# concatenation Numpy

In [67]:
arr = np.arange(12).reshape((3,4))

In [68]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [69]:
np.concatenate([arr,arr],axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [70]:
np.concatenate([arr,arr],axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [71]:
#Concatenation Pandas

In [72]:
s1 = pd.Series([0,1],index=['a','b'])

In [73]:
s2 = pd.Series([2,3,4],index=['c','d','e'])

In [74]:
s3 = pd.Series([5,6],index=['f','g'])

In [75]:
#concatener les valeurs et les index
pd.concat([s1,s2,s3])# par défaut .concat() fonctionne sur axis=0

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [76]:
#sur axis=1 génère un Dataframe 
pd.concat([s1,s2,s3],axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [77]:
#---

In [78]:
s4 = pd.concat([s1,s3])

In [79]:
s4

a    0
b    1
f    5
g    6
dtype: int64

In [80]:
pd.concat([s1,s4],axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [81]:
pd.concat([s1,s4],axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [82]:
#On peut tout de même spécifier les axes à utiliser
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,1.0
e,,


In [83]:
#identifier les elts concatenés
result = pd.concat([s1,s2,s3],keys=['one','two','three'])

In [84]:
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [85]:
result.unstack()

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


In [86]:
# sur l'autre axe
pd.concat([s1,s2,s3],keys=['one','two','three'],axis=1)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [87]:
# On utilise la même logique pour les DataFrames
df1 = pd.DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])

In [88]:
df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])

In [89]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [90]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [91]:
pd.concat([df1,df2],axis=1,keys=['level1','level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [92]:
""" Combining Data with overlap """

' Combining Data with overlap '

In [93]:
a= pd.Series([np.nan,2.5,0.0,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])

In [94]:
b = pd.Series([0.,np.nan,2.,np.nan,np.nan,5.],index=['a','b','c','d','e','f'])

In [95]:
#remplir les données sans laisser de nan
b.combine_first(a)

a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64

In [96]:
df1 = pd.DataFrame({'a':[1.,np.nan,5.,np.nan],
                    'b':[np.nan, 2., np.nan, 6.],
                    'c' : range(2,18,4)})

In [97]:
df2 = pd.DataFrame({'a':[5.,4.,np.nan,3.,7.],
                    'b':[np.nan, 3.,4.,6.,8.]})

In [98]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,
