In [1]:
import pandas as pd
import numpy as np

### Merge DataFrames by columns

In [47]:
# Left-hand frame for the merge examples; 'key' contains repeated labels.
df1 = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [49]:
# Right-hand frame for the merge examples; each key appears once.
df2 = pd.DataFrame(
    {
        'key': ['a', 'b', 'd'],
        'data2': range(3),
    }
)
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [50]:
# Inner join (the default) on the shared 'key' column: keys found in only
# one frame ('c' in df1, 'd' in df2) do not appear in the result.
df1.merge(df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [53]:
# Rename df1's columns so the two frames no longer share a join-column name.
# NOTE: this mutates df1 in place, so the earlier display of df1 is now stale;
# a later cell assigns the original names back.
df1.columns=['key1', 'data1']
df1

Unnamed: 0,key1,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [54]:
# The join columns are named differently on each side, so name them
# explicitly; both key columns are kept in the result.
df1.merge(df2, left_on='key1', right_on='key')

Unnamed: 0,key1,data1,key,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [55]:
# Restore df1's original column names (in-place mutation of df1).
df1.columns = ['key', 'data1']
# With no `on` argument the merge joins on the columns the frames have in
# common (here 'key'). how='outer' keeps keys from both sides and fills the
# missing side with NaN (rows 'c' and 'd' in the output).
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [56]:
# Left frame for the many-to-many example: 'a' and 'b' each occur more than once.
af1 = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)
af1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [57]:
# Right frame for the many-to-many example: 'a' and 'b' are repeated here too.
af2 = pd.DataFrame(
    {
        'key': ['a', 'b', 'a', 'b', 'd'],
        'data2': range(5),
    }
)
af2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [58]:
# Left join: every af1 row is kept. A key repeated on both sides yields one
# output row per pairing, and 'c' (absent from af2) gets NaN for data2.
af1.merge(af2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


### Merge DataFrames by index (row)

In [59]:
# Frame whose 'key' column will be matched against another frame's index.
left1 = pd.DataFrame(
    {
        'key': ['a', 'b', 'a', 'a', 'b', 'c'],
        'value': range(6),
    }
)
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [60]:
# Lookup table: the join key lives in the index ('a', 'b'), not in a column.
right1 = pd.DataFrame(
    {'group_val': [3.5, 7.0]},
    index=list('ab'),
)
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [63]:
# Match left1['key'] against right1's index labels. This is an inner join,
# so the 'c' row (no matching index label) is dropped.
left1.merge(right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [64]:
# Same index-based join, but how='outer' keeps the unmatched 'c' row and
# fills group_val with NaN.
left1.merge(right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


Joining on the index is more involved when the index is hierarchical (a MultiIndex): the other frame must supply a list of columns, one per index level, to match against.

In [65]:
# Left frame for the MultiIndex join: (key1, key2) together identify a row.
lefth = pd.DataFrame(
    {
        'key1': ['ohio'] * 3 + ['nevada'] * 2,
        'key2': [2000, 2001, 2002, 2001, 2002],
        'data': np.arange(5, dtype=float),
    }
)
lefth

Unnamed: 0,key1,key2,data
0,ohio,2000,0.0
1,ohio,2001,1.0
2,ohio,2002,2.0
3,nevada,2001,3.0
4,nevada,2002,4.0


In [66]:
# Right frame with a two-level row index (state, year). Note that
# ('ohio', 2000) appears twice.
righth = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    index=pd.MultiIndex.from_arrays(
        [
            ['nevada', 'nevada', 'ohio', 'ohio', 'ohio', 'ohio'],
            [2001, 2000, 2000, 2000, 2001, 2002],
        ]
    ),
)
righth

Unnamed: 0,Unnamed: 1,0,1
nevada,2001,0,1
nevada,2000,2,3
ohio,2000,4,5
ohio,2000,6,7
ohio,2001,8,9
ohio,2002,10,11


In [67]:
# Match the (key1, key2) column pair against righth's two index levels.
# Left rows with no matching index entry are dropped (inner join), and the
# duplicated ('ohio', 2000) entry yields two output rows.
lefth.merge(righth, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,key1,key2,data,0,1
0,ohio,2000,0.0,4,5
0,ohio,2000,0.0,6,7
1,ohio,2001,1.0,8,9
2,ohio,2002,2.0,10,11
3,nevada,2001,3.0,0,1


### Concatenating along an axis

In [68]:
# 3x4 integer grid used for the NumPy concatenation examples.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [69]:
# axis=1 places the arrays side by side: same rows, twice as many columns.
np.concatenate((arr, arr), axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [70]:
# axis=0 (the default) stacks the arrays vertically: twice as many rows.
np.concatenate((arr, arr), axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [72]:
# First of three Series with non-overlapping index labels.
s1 = pd.Series(data=[0, 1], index=list('ab'))
s1

a    0
b    1
dtype: int64

In [73]:
# Second Series; labels 'c'..'e' do not overlap with s1 or s3.
s2 = pd.Series(data=[2, 3, 4], index=list('cde'))
s2

c    2
d    3
e    4
dtype: int64

In [74]:
# Third Series; labels 'f' and 'g'.
s3 = pd.Series(data=[5, 6], index=list('fg'))
s3

f    5
g    6
dtype: int64

In [75]:
# Default axis (rows): values and index labels are glued end to end,
# giving one longer Series.
pd.concat([s1, s2, s3], axis='index')

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

Concatenating along the columns produces a DataFrame: each Series becomes a column, so the dimension increases from one to two.

In [76]:
# One column per Series; a label missing from a given Series becomes NaN.
pd.concat([s1, s2, s3], axis='columns')

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [77]:
# s4 shares labels 'a' and 'b' with s1 and adds 'f' and 'g' from s3.
s4 = pd.concat([s1, s3], axis='index')
s4

a    0
b    1
f    5
g    6
dtype: int64

In [78]:
# Column-wise concat aligns on the union of index labels; s1 has no 'f' or
# 'g', so its column is NaN there.
pd.concat([s1, s4], axis='columns')

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [80]:
# join='outer' is the default, so this produces the same result as the
# call without `join`: the union of the index labels.
pd.concat([s1, s4], axis='columns', join='outer')

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [83]:
# `keys` labels each input piece; the labels become the outer level of a
# hierarchical index on the result.
scat = pd.concat(
    [s1, s1, s3],
    axis='index',
    keys=['one', 'two', 'three'],
)
scat

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [84]:
# Inspect the resulting MultiIndex: (piece key, original label) pairs.
scat.index

MultiIndex([(  'one', 'a'),
            (  'one', 'b'),
            (  'two', 'a'),
            (  'two', 'b'),
            ('three', 'f'),
            ('three', 'g')],
           )

In [85]:
# Pivot the innermost index level (the original labels) into columns;
# combinations that do not exist become NaN.
scat.unstack(level=-1)

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [86]:
# When concatenating Series along the columns, `keys` become the column names.
pd.concat(
    [s1, s2, s3],
    axis='columns',
    keys=['one', 'two', 'three'],
)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


The same `concat` logic applies to DataFrames.

In [88]:
# 3x2 frame with rows 'a', 'b', 'c'.
bf1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
bf1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [91]:
# 2x2 frame holding 5..8, with rows 'a' and 'c' only (no 'b').
bf2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=list('ac'),
    columns=['three', 'four'],
)
bf2

Unnamed: 0,three,four
a,5,6
c,7,8


Concatenation with axis=1 will add additional columns, joining on the index. Adding keys creates a hierarchical column index.

In [92]:
# Side-by-side concat aligned on the row index; `keys` adds an outer column
# level naming the source frame. Row 'b' is absent from bf2, hence NaN.
pd.concat(
    [bf1, bf2],
    axis='columns',
    keys=['left', 'right'],
)

Unnamed: 0_level_0,left,left,right,right
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


The same result can be obtained by passing a dictionary of frames; the dictionary keys are then used as the `keys`.

In [94]:
# Mapping form: the dict keys play the role of the `keys` argument.
pd.concat({'left': bf1, 'right': bf2}, axis='columns')

Unnamed: 0_level_0,left,left,right,right
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


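In the example below the row index carries no meaningful information, so it is discarded with `ignore_index=True`. The frames are aligned on their column names, and a column missing from one frame is filled with NaN.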

In [95]:
# 3x4 frame of uniform [0, 1) samples. A locally seeded generator is used so
# that Restart & Run All reproduces the same values; the unseeded global
# np.random state gave different numbers on every run.
ef1 = pd.DataFrame(
    np.random.default_rng(0).random((3, 4)),
    columns=['a', 'b', 'c', 'd'],
)
ef1

Unnamed: 0,a,b,c,d
0,0.310619,0.001551,0.369297,0.909171
1,0.02565,0.586633,0.255232,0.382965
2,0.975301,0.862708,0.165969,0.927394


In [96]:
# 2x3 frame of standard-normal samples with columns in a different order
# ('b', 'd', 'a') and no 'c'. A locally seeded generator is used so that
# Restart & Run All reproduces the same values.
ef2 = pd.DataFrame(
    np.random.default_rng(1).standard_normal((2, 3)),
    columns=list('bda'),
)
ef2

Unnamed: 0,b,d,a
0,0.31284,0.328919,-0.069365
1,0.893547,0.722481,1.479387


In [97]:
# Stack the rows and renumber them 0..n-1 instead of keeping each frame's
# own index. Columns are aligned by name; ef2 has no 'c', so it is NaN there.
pd.concat([ef1, ef2], axis='index', ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.310619,0.001551,0.369297,0.909171
1,0.02565,0.586633,0.255232,0.382965
2,0.975301,0.862708,0.165969,0.927394
3,-0.069365,0.31284,,0.328919
4,1.479387,0.893547,,0.722481


### Combining data with overlap

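Two data sets whose indexes overlap, each with gaps the other can fill, cannot be combined cleanly with a merge or a concatenation: either would leave you with multiple columns that have to be reconciled afterwards. `combine_first`, shown below, patches the missing values in one step, which saves processing on large data sets.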

In [98]:
# Series with gaps (NaN) at labels 'a' and 'f'.
a = pd.Series(
    data=[np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
    index=['a', 'b', 'c', 'd', 'e', 'f'],
)
a

a    NaN
b    2.5
c    0.0
d    3.5
e    4.5
f    NaN
dtype: float64

In [99]:
# Series on the same labels with gaps at 'b', 'd' and 'e'.
b = pd.Series(
    data=[0.0, np.nan, 2.0, np.nan, np.nan, 5.0],
    index=['a', 'b', 'c', 'd', 'e', 'f'],
)
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [100]:
# Values from b take priority; wherever b is NaN the value from a at the
# same label is used (e.g. 'c' stays 2.0 from b, 'b' becomes 2.5 from a).
b.combine_first(a)

a    0.0
b    2.5
c    2.0
d    3.5
e    4.5
f    5.0
dtype: float64

`combine_first` also works on DataFrames, where it is applied column by column.