## 五、合并数据

* 构造一个生产DataFrame的函数

In [9]:
import pandas as pd
import numpy as np

def make_df(cols, ind):
    """Build a small demo DataFrame whose cells read '<column><row label>'.

    Args:
        cols: Iterable of column labels (a string yields one column per
            character).
        ind: Re-iterable collection of row labels. It is walked once per
            column and is also used as the frame's index.

    Returns:
        A ``pandas.DataFrame`` indexed by ``ind`` in which the cell at
        (row ``i``, column ``c``) is the string ``str(c) + str(i)``.
    """
    columns = {}
    for label in cols:
        columns[label] = [f"{label!s}{row!s}" for row in ind]
    return pd.DataFrame(columns, index=ind)

# Demo: columns A, B, C over row labels 0-2; each cell is "<column><row label>".
make_df(cols="ABC", ind=range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


* 垂直合并

In [35]:
# Two frames with the same columns (A, B) but disjoint row labels: 1-2 and 3-4.
df_1 = make_df(cols="AB", ind=[1, 2])
df_2 = make_df(cols="AB", ind=[3, 4])
print(df_1, df_2, sep="\n")

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4


In [36]:
# Vertical concatenation (axis=0 is the default): rows of df_2 are appended
# below df_1 and the original row labels 1-4 are kept.
pd.concat([df_1, df_2], axis=0)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


* 水平合并

In [37]:
# Two frames with the same row labels (0, 1) but different columns: A/B and C/D.
df_3 = make_df(cols="AB", ind=[0, 1])
df_4 = make_df(cols="CD", ind=[0, 1])
print(df_3, df_4, sep="\n")

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1


In [51]:
# Horizontal concatenation: df_4's columns are placed to the right of df_3's,
# with rows lined up by their shared labels 0 and 1.
pd.concat([df_3, df_4], axis="columns")

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


* 索引重叠

行重叠

In [39]:
# Two identical frames, so their row labels (1, 2) overlap completely.
df_5 = make_df(cols="AB", ind=[1, 2])
df_6 = make_df(cols="AB", ind=[1, 2])
print(df_5, df_6, sep="\n")

    A   B
1  A1  B1
2  A2  B2
    A   B
1  A1  B1
2  A2  B2


In [40]:
# Plain vertical concat keeps the overlapping labels, so the result index
# contains duplicates (1, 2, 1, 2).
pd.concat([df_5, df_6], axis=0)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
1,A1,B1
2,A2,B2


In [41]:
# ignore_index=True discards the original row labels and renumbers the
# stacked rows 0..3, which removes the duplicates.
pd.concat([df_5, df_6], axis=0, ignore_index=True)

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
2,A1,B1
3,A2,B2


列重叠

In [10]:
# Two frames on the same rows (1, 2) whose column sets overlap on B and C.
df_7 = make_df(cols="ABC", ind=[1, 2])
df_8 = make_df(cols="BCD", ind=[1, 2])
print(df_7, df_8, sep="\n")

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
1  B1  C1  D1
2  B2  C2  D2


In [11]:
# Horizontal concat places all six columns side by side; the overlapping
# B and C columns are not combined, so each appears twice.
pd.concat([df_7, df_8], axis="columns")

Unnamed: 0,A,B,C,B,C,D
1,A1,B1,C1,B1,C1,D1
2,A2,B2,C2,B2,C2,D2


In [12]:
# With axis=1, ignore_index=True replaces the column labels (not the row
# labels) with 0..5, so no column name is repeated.
pd.concat([df_7, df_8], axis="columns", ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
1,A1,B1,C1,B1,C1,D1
2,A2,B2,C2,B2,C2,D2


* 对齐合并merge()

In [61]:
# Two frames that share exactly one column, B, holding the same values.
df_9 = make_df(cols="AB", ind=[1, 2])
df_10 = make_df(cols="BC", ind=[1, 2])
print(df_9, df_10, sep="\n")

    A   B
1  A1  B1
2  A2  B2
    B   C
1  B1  C1
2  B2  C2


In [62]:
# No key is given, so merge() joins on the column the frames have in common
# (B). how="inner" is the default; the result gets a fresh 0-based index.
pd.merge(df_9, df_10, how="inner")

Unnamed: 0,A,B,C
0,A1,B1,C1
1,A2,B2,C2


In [65]:
# Rebuild the pair, this time listing df_10's columns (C, B) and rows (2, 1)
# in a different order from df_9's.
df_9 = make_df(cols="AB", ind=[1, 2])
df_10 = make_df(cols="CB", ind=[2, 1])
print(df_9, df_10, sep="\n")

    A   B
1  A1  B1
2  A2  B2
    C   B
2  C2  B2
1  C1  B1


In [67]:
# Rows are matched by the values in the shared column B, not by position,
# so the result is the same as before despite df_10's different ordering.
df_9.merge(df_10)

Unnamed: 0,A,B,C
0,A1,B1,C1
1,A2,B2,C2


【例】 合并城市信息

In [77]:
# Population table: one row per city, with columns "city" and "pop".
population_dict = dict(
    city=("BeiJing", "HangZhou", "ShenZhen"),
    pop=(2154, 981, 1303),
)
population = pd.DataFrame(data=population_dict)
population

Unnamed: 0,city,pop
0,BeiJing,2154
1,HangZhou,981
2,ShenZhen,1303


In [82]:
# GDP table: one row per city, with columns "city" and "GDP". Its city list
# only partly overlaps the population table's (ShangHai instead of ShenZhen).
GDP_dict = dict(
    city=("BeiJing", "ShangHai", "HangZhou"),
    GDP=(30320, 32680, 13468),
)
GDP = pd.DataFrame(data=GDP_dict)
GDP

Unnamed: 0,city,GDP
0,BeiJing,30320
1,ShangHai,32680
2,HangZhou,13468


In [86]:
# Inner join (the default) on the shared "city" column: only cities present
# in both tables survive (BeiJing and HangZhou).
city_info = pd.merge(population, GDP, how="inner")
city_info

Unnamed: 0,city,pop,GDP
0,BeiJing,2154,30320
1,HangZhou,981,13468


In [87]:
# Outer join on the shared "city" column: every city from either table is
# kept, and a value that one table lacks is left missing (NaN).
city_info = population.merge(GDP, how="outer")
city_info

Unnamed: 0,city,pop,GDP
0,BeiJing,2154.0,30320.0
1,HangZhou,981.0,13468.0
2,ShenZhen,1303.0,
3,ShangHai,,32680.0
