# 数据连接 merge

In [2]:
import pandas as pd
import numpy as np

### Inner, Outer, Left, Right Join

In [3]:
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2' : np.random.randint(0,10,3)})

print(df_obj1)
print(df_obj2)

   data1 key
0      7   b
1      2   b
2      1   a
3      5   c
4      8   a
5      8   a
6      8   b
   data2 key
0      5   a
1      8   b
2      8   d


In [4]:
# 默认将重叠列的列名作为“外键”进行连接

pd.merge(df_obj1, df_obj2)

Unnamed: 0,data1,key,data2
0,7,b,8
1,2,b,8
2,8,b,8
3,1,a,5
4,8,a,5
5,8,a,5


In [5]:
# on显示指定“外键”

pd.merge(df_obj1, df_obj2, on='key')

Unnamed: 0,data1,key,data2
0,7,b,8
1,2,b,8
2,8,b,8
3,1,a,5
4,8,a,5
5,8,a,5


In [6]:
# 更改列名，导致没有相同的key

df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})

In [7]:
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')

Unnamed: 0,data1,key1,data2,key2
0,7,b,8,b
1,2,b,8,b
2,8,b,8,b
3,1,a,5,a
4,8,a,5,a
5,8,a,5,a


In [8]:
# “外连接” -- 并集

pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')

Unnamed: 0,data1,key1,data2,key2
0,7.0,b,8.0,b
1,2.0,b,8.0,b
2,8.0,b,8.0,b
3,1.0,a,5.0,a
4,8.0,a,5.0,a
5,8.0,a,5.0,a
6,5.0,c,,
7,,,8.0,d


In [9]:
# 左连接

pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')

Unnamed: 0,data1,key1,data2,key2
0,7,b,8.0,b
1,2,b,8.0,b
2,1,a,5.0,a
3,5,c,,
4,8,a,5.0,a
5,8,a,5.0,a
6,8,b,8.0,b


In [11]:
# 右连接

pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')

Unnamed: 0,data1,key1,data2,key2
0,7.0,b,8,b
1,2.0,b,8,b
2,8.0,b,8,b
3,1.0,a,5,a
4,8.0,a,5,a
5,8.0,a,5,a
6,,,8,d


### 处理重复列名

In [12]:

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data' : np.random.randint(0,10,3)})

pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))

Unnamed: 0,data_left,key,data_right
0,6,b,4
1,9,b,4
2,9,b,4
3,3,a,3
4,2,a,3
5,4,a,3


In [13]:
# 按索引连接

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])

In [13]:
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)

Unnamed: 0,data1,key,data2
0,6,b,5
1,7,b,5
6,9,b,5
2,2,a,3
4,4,a,3
5,4,a,3
