In [2]:
import numpy as np
import pandas as pd

## Hierarchical indexing

Hierarchical indexing (a `MultiIndex`) lets us represent higher-dimensional data in a lower-dimensional structure such as a Series or DataFrame.

### Series

In [3]:
# Twelve standard-normal draws labelled by a two-level index: the outer level
# holds the letters 'a'/'b'/'c', the inner level holds integers.
# The generator is seeded so that re-running the notebook reproduces the same
# values (the seed value itself is arbitrary).
ser = pd.Series(np.random.default_rng(0).standard_normal(12),
                index=[['a'] * 4 + ['b'] * 2 + ['c'] * 6,
                       [1, 2, 3, 4, 1, 2, 1, 3, 5, 7, 9, 11]])
ser

a  1    -1.078094
   2    -0.748275
   3    -0.580208
   4     1.927731
b  1     1.151096
   2    -0.439126
c  1    -1.271905
   3    -0.338001
   5     0.603146
   7    -0.733407
   9    -1.483821
   11   -1.424818
dtype: float64

Internal representation of index:

In [4]:
# Inspect the MultiIndex that backs the two-level row labels of `ser`.
ser.index

MultiIndex([('a',  1),
            ('a',  2),
            ('a',  3),
            ('a',  4),
            ('b',  1),
            ('b',  2),
            ('c',  1),
            ('c',  3),
            ('c',  5),
            ('c',  7),
            ('c',  9),
            ('c', 11)],
           )

In [5]:
# Partial indexing: an outer-level label selects all of its inner-level entries.
ser.loc['b']

1    1.151096
2   -0.439126
dtype: float64

In [6]:
# Label slice over the outer level; both endpoints ('a' and 'b') are included.
ser.loc['a':'b']

a  1   -1.078094
   2   -0.748275
   3   -0.580208
   4    1.927731
b  1    1.151096
   2   -0.439126
dtype: float64

In [7]:
# Select inner-level label 3 across every outer-level label that has it.
ser.loc[:, 3]

a   -0.580208
c   -0.338001
dtype: float64

A Series with a hierarchical index can be turned into a DataFrame with `unstack`; `stack` performs the inverse operation.

In [8]:
# Pivot the inner index level into columns; (outer, inner) combinations that
# do not exist in `ser` show up as missing values.
ser.unstack()

Unnamed: 0,1,2,3,4,5,7,9,11
a,-1.078094,-0.748275,-0.580208,1.927731,,,,
b,1.151096,-0.439126,,,,,,
c,-1.271905,,-0.338001,,0.603146,-0.733407,-1.483821,-1.424818


In [9]:
# Round trip: stacking the unstacked frame moves the columns back into an
# inner index level.
ser.unstack().stack()

a  1    -1.078094
   2    -0.748275
   3    -0.580208
   4     1.927731
b  1     1.151096
   2    -0.439126
c  1    -1.271905
   3    -0.338001
   5     0.603146
   7    -0.733407
   9    -1.483821
   11   -1.424818
dtype: float64

### DataFrame

In [10]:
# A 4x4 frame with two-level labels on both axes. The level names are attached
# in the same chained expression rather than assigned afterwards.
df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=[['a', 'a', 'b', 'b'], [0, 1, 1, 3]],
    columns=[['Bacon', 'Bacon', 'Eggs', 'Eggs'],
             ['Piggy', 'Puppy', 'Piggy', 'Kitty']],
).rename_axis(index=['1st', '2nd'], columns=['Food', 'Customer'])
df

Unnamed: 0_level_0,Food,Bacon,Bacon,Eggs,Eggs
Unnamed: 0_level_1,Customer,Piggy,Puppy,Piggy,Kitty
1st,2nd,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,0,0,1,2,3
a,1,4,5,6,7
b,1,8,9,10,11
b,3,12,13,14,15


In [11]:
# Number of levels on the columns and on the row index, in that order.
df.columns.nlevels, df.index.nlevels

(2, 2)

In [12]:
# Partial column indexing: the outer label 'Bacon' keeps only its inner
# 'Customer' columns.
df['Bacon']

Unnamed: 0_level_0,Customer,Piggy,Puppy
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,0,1
a,1,4,5
b,1,8,9
b,3,12,13


In [13]:
# Select all rows under the outer index label 'a'; the result is indexed by
# the remaining '2nd' level.
df.loc['a']

Food,Bacon,Bacon,Eggs,Eggs
Customer,Piggy,Puppy,Piggy,Kitty
2nd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,1,2,3
1,4,5,6,7


`pd.MultiIndex.from_arrays` is another way of creating a `MultiIndex`.

In [14]:
# Build a MultiIndex explicitly from one array per level, naming the levels.
pd.MultiIndex.from_arrays([['Bacon', 'Bacon', 'Eggs', 'Eggs'],
                          ['Piggy', 'Puppy', 'Piggy', 'Kitty']],
                          names=['Food', 'Customer'])

MultiIndex([('Bacon', 'Piggy'),
            ('Bacon', 'Puppy'),
            ( 'Eggs', 'Piggy'),
            ( 'Eggs', 'Kitty')],
           names=['Food', 'Customer'])

In [15]:
# For comparison: the column MultiIndex that the DataFrame constructor built
# from the nested lists.
df.columns

MultiIndex([('Bacon', 'Piggy'),
            ('Bacon', 'Puppy'),
            ( 'Eggs', 'Piggy'),
            ( 'Eggs', 'Kitty')],
           names=['Food', 'Customer'])

### Reordering and Sorting levels

In [16]:
# Exchange the two row-index levels ('1st' and '2nd'); the data are unchanged.
df.swaplevel()

Unnamed: 0_level_0,Food,Bacon,Bacon,Eggs,Eggs
Unnamed: 0_level_1,Customer,Piggy,Puppy,Piggy,Kitty
2nd,1st,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,a,0,1,2,3
1,a,4,5,6,7
1,b,8,9,10,11
3,b,12,13,14,15


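In `sort_index`, `level` selects which level of the hierarchical index to sort by, and `axis` selects whether the rows (`axis=0`) or the columns (`axis=1`) are sorted.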

In [17]:
# Sort the columns in descending order of their inner level ('Customer').
df.sort_index(axis="columns", level="Customer", ascending=False)

Unnamed: 0_level_0,Food,Bacon,Eggs,Bacon,Eggs
Unnamed: 0_level_1,Customer,Puppy,Piggy,Piggy,Kitty
1st,2nd,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,0,1,2,0,3
a,1,5,6,4,7
b,1,9,10,8,11
b,3,13,14,12,15


Selection performance is better on DataFrames whose index has been sorted with `sort_index()`.

### Index to Column to Index 

A column can become the index with `set_index`.

In [18]:
# Work with the 'Bacon' sub-frame only.
# NOTE(review): the name `df2` is rebound to an unrelated frame in the merge
# section further down, so re-run this cell before re-running the
# `set_index` / `reset_index` cells that follow.
df2 = df['Bacon']
df2

Unnamed: 0_level_0,Customer,Piggy,Puppy
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,0,1
a,1,4,5
b,1,8,9
b,3,12,13


In [19]:
# Move the 'Piggy' column into the index; the column is removed from the data.
df2.set_index(['Piggy'])

Customer,Puppy
Piggy,Unnamed: 1_level_1
0,1
4,5
8,9
12,13


In [20]:
# drop=False keeps 'Piggy' as a regular column in addition to using it as the index.
df2.set_index(['Piggy'], drop=False)

Customer,Piggy,Puppy
Piggy,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1
4,4,5
8,8,9
12,12,13


Or move all index levels back into columns with `reset_index`.

In [21]:
# Turn both index levels ('1st', '2nd') back into ordinary columns, leaving a
# default integer index.
df2.reset_index()

Customer,1st,2nd,Piggy,Puppy
0,a,0,0,1
1,a,1,4,5
2,b,1,8,9
3,b,3,12,13


## Combining and Merging datasets

### Database-Style DataFrame Joins

`pandas.merge` is a database-style join. It can perform one-to-one, one-to-many, and many-to-many operations.

There are a couple of different ways to join them, selected with `how`:
  - `left`: SQL left outer join.
  - `right`: SQL right outer join.
  - `outer`: SQL full outer join. Keys sorted lexicographically.
  - `inner`: SQL inner join.
  - `cross`: Cartesian product of both frames. The order of the left keys is preserved.

In [22]:
# Left frame: seven rows with repeated keys; "c" has no counterpart in `df2`.
df1 = pd.DataFrame(
    {"key": ["b", "b", "a", "c", "a", "a", "b"], "data1": range(7)}
).astype({"data1": "Int64"})

# Right frame: one row per key; "d" has no counterpart in `df1`.
df2 = pd.DataFrame(
    {"key": ["a", "b", "d"], "data2": range(3)}
).astype({"data2": "Int64"})

df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [23]:
# Display the second frame; it shares only the keys 'a' and 'b' with `df1`.
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


By default, it performs an inner join, but other options are also available.

In [24]:
# Inner join: only keys present in both frames ('a' and 'b') survive.
pd.merge(df1, df2, on='key', how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [25]:
# Left join: every row of `df1` is kept; key 'c' has no match, so its
# `data2` value is missing.
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [26]:
# Right join: every key of `df2` is kept; key 'd' has no match, so its
# `data1` value is missing.
pd.merge(df1, df2, on='key', how='right')

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,4.0,0
2,a,5.0,0
3,b,0.0,1
4,b,1.0,1
5,b,6.0,1
6,d,,2


In [27]:
# Outer join: union of the keys from both frames; the unmatched side of a row
# is filled with missing values.
pd.merge(df1, df2, on='key', how='outer')

Unnamed: 0,key,data1,data2
0,a,2.0,0.0
1,a,4.0,0.0
2,a,5.0,0.0
3,b,0.0,1.0
4,b,1.0,1.0
5,b,6.0,1.0
6,c,3.0,
7,d,,2.0


The key columns might have different names in each DataFrame. Instead of renaming them to match, we can specify the key column for each side with `left_on` and `right_on`.

In [28]:
# Same data as before, but the key column is named differently on each side
# ("lkey" on the left, "rkey" on the right).
df3 = pd.DataFrame(
    {"lkey": ["b", "b", "a", "c", "a", "a", "b"], "data1": range(7)}
).astype({"data1": "Int64"})

df4 = pd.DataFrame(
    {"rkey": ["a", "b", "d"], "data2": range(3)}
).astype({"data2": "Int64"})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [29]:
# Display the right-hand frame, whose key column is named 'rkey'.
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [30]:
# Join on differently named key columns; both key columns are kept in the
# result. `how` is left at its default (inner join).
pd.merge(df3, df4, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,a,2,a,0
3,a,4,a,0
4,a,5,a,0
5,b,6,b,1


We can join on multiple keys from each dataframe.

In [31]:
# Two frames that share a composite key (key1, key2) and each carry one
# nullable-integer value column.
left = pd.DataFrame(
    {
        "key1": ["foo", "foo", "bar"],
        "key2": ["one", "two", "one"],
        "lval": [1, 2, 3],
    }
).astype({"lval": "Int64"})

right = pd.DataFrame(
    {
        "key1": ["foo", "foo", "bar", "bar"],
        "key2": ["one", "one", "one", "two"],
        "rval": [4, 5, 6, 7],
    }
).astype({"rval": "Int64"})

left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [32]:
# Display the right-hand frame; note that ('foo', 'one') appears twice.
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [33]:
# Outer join on the (key1, key2) pair: a row matches only when both keys agree.
pd.merge(left, right, on=["key1", "key2"], how="outer")

Unnamed: 0,key1,key2,lval,rval
0,bar,one,3.0,6.0
1,bar,two,,7.0
2,foo,one,1.0,4.0
3,foo,one,1.0,5.0
4,foo,two,2.0,


Overlapping non-key column names can be given suffixes with `suffixes`, which makes it clear which frame each column came from.

In [34]:
# Joining on 'key1' alone leaves a 'key2' column in both frames, so the
# suffixes disambiguate the two copies.
pd.merge(left, right, on="key1", suffixes=("_left", "_right"))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### Merging on Index

Some data may have an index that can be used as the merge key. In that case, we can set `left_index` or `right_index` to `True`.

In [35]:
# Left frame: the merge key lives in an ordinary column.
left1 = pd.DataFrame(
    {"key": ["a", "b", "a", "a", "b", "c"], "value": range(6)}
).astype({"value": "Int64"})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [36]:
# Right frame: the merge key lives in the index ("a", "b").
right1 = pd.DataFrame([[3.5], [7.0]], index=["a", "b"], columns=["group_val"])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [37]:
# Match the 'key' column of `left1` against the index of `right1`; key 'c'
# has no match and is dropped by the default inner join.
pd.merge(left1, right1, left_on="key", right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0


Hierarchically indexed data can also be merged easily.

In [38]:
# Left frame: the composite key (state, year) is stored in two ordinary columns.
lefth = pd.DataFrame(
    {
        "key1": ["Ohio"] * 3 + ["Nevada"] * 2,
        "key2": [2000, 2001, 2002, 2001, 2002],
        "data": range(5),
    }
).astype({"data": "Int64"})
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0
1,Ohio,2001,1
2,Ohio,2002,2
3,Nevada,2001,3
4,Nevada,2002,4


In [39]:
# Two-level (state, year) index for the right-hand frame; ("Ohio", 2000)
# deliberately appears twice.
righth_index = pd.MultiIndex.from_arrays([
    ["Nevada"] * 2 + ["Ohio"] * 4,
    [2001, 2000, 2000, 2000, 2001, 2002],
])
righth_index

MultiIndex([('Nevada', 2001),
            ('Nevada', 2000),
            (  'Ohio', 2000),
            (  'Ohio', 2000),
            (  'Ohio', 2001),
            (  'Ohio', 2002)],
           )

In [40]:
# Right frame: two nullable-integer event columns labelled by `righth_index`.
righth = pd.DataFrame(
    {"event1": [0, 2, 4, 6, 8, 10], "event2": [1, 3, 5, 7, 9, 11]},
    index=righth_index,
).astype("Int64")
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [41]:
# The two key columns on the left are matched against the two-level index on
# the right. The result keeps the left frame's row labels, so label 0 repeats
# because ('Ohio', 2000) occurs twice in `righth`.
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0,4,5
0,Ohio,2000,0,6,7
1,Ohio,2001,1,8,9
2,Ohio,2002,2,10,11
3,Nevada,2001,3,0,1


### Join

`pd.DataFrame.join` is like a simpler version of `pd.merge`. By default, it performs a left join on the index.

In [42]:
# Left frame for the join examples: rows "a", "c", "e", nullable-integer columns.
left2 = pd.DataFrame(
    {"Ohio": [1, 3, 5], "Nevada": [2, 4, 6]},
    index=["a", "c", "e"],
).astype("Int64")
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [43]:
# Right frame for the join examples: rows "b".."e", nullable-integer columns.
right2 = pd.DataFrame(
    {"Missouri": [7, 9, 11, 13], "Alabama": [8, 10, 12, 14]},
    index=["b", "c", "d", "e"],
).astype("Int64")
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [44]:
# Outer join on the index: the union of the row labels from both frames.
left2.join(right2, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


Same thing with `pd.merge`:

In [45]:
# Equivalent `pd.merge` call: both sides are joined on their index.
pd.merge(left2, right2, how="outer", left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


We can join multiple DataFrames at once by passing a list of DataFrames.

In [46]:
# A third frame (plain float columns) used to demonstrate multi-frame joins.
another = pd.DataFrame(
    {"New York": [7.0, 9.0, 11.0, 16.0], "Oregon": [8.0, 10.0, 12.0, 17.0]},
    index=["a", "c", "e", "f"],
)

another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [52]:
# Inner join on the index: keep only the row labels present in both frames
# ("c" and "e"). `how` is passed explicitly because `join` defaults to a left
# join, which would also keep row "a" of `left2`.
left2.join(right2, how="inner")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
c,3,4,9,10
e,5,6,13,14


In [47]:
# Passing a list joins several frames at once; with the default `how`, only
# the row labels of `left2` are kept.
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7.0,8.0
c,3,4,9.0,10.0,9.0,10.0
e,5,6,13.0,14.0,11.0,12.0


In [48]:
# Outer join across all three frames: the union of every frame's row labels.
left2.join([right2, another], how="outer")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


### Concatenating Along an Axis

We can also concatenate (stack) arrays along an axis.

In [53]:
# The integers 0..11 laid out as a 3x4 array.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [54]:
# axis=1 places the two copies side by side, giving a 3x8 array.
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [55]:
# axis=0 stacks the two copies vertically, giving a 6x4 array.
np.concatenate([arr, arr], axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])