In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [6]:
# Demo score table: one row per student, one column per course,
# filled with random integers drawn from [0, 100).
score = pd.DataFrame(
    np.random.randint(low=0, high=100, size=(5, 3)),
    index=['tom', 'jack', 'mery', 'lucy', 'lili'],
    columns=['python', 'java', 'php'],
)

In [9]:
# Show the freshly built score table.
score

Unnamed: 0,python,java,php
tom,88,61,18
jack,16,88,25
mery,2,50,83
lucy,37,98,7
lili,98,17,72


In [11]:
# Blank out a single cell to simulate a missing score. As the output shows,
# pandas stores it as NaN and the whole 'java' column is displayed as float.
score.loc['tom','java'] = None
score

Unnamed: 0,python,java,php
tom,88,,18
jack,16,88.0,25
mery,2,50.0,83
lucy,37,98.0,7
lili,98,17.0,72


In [12]:
# pandas aggregations skip missing values automatically (skipna is the
# default behaviour; it is spelled out here only to make the point visible).
# This makes pandas structures a better fit for real-world business data.
score.sum(axis=0, skipna=True)

python    241.0
java      253.0
php       205.0
dtype: float64

In [20]:
# The same kind of data as a plain NumPy array, for comparison with pandas.
score2 = np.random.randint(low=0, high=100, size=(5, 3))
# Column totals (reduce over the row axis).
np.sum(score2, axis=0)

array([269, 305, 309])

In [24]:
# Float copy of score2: an integer array cannot hold NaN.
score3 = score2.astype(np.float32)

In [26]:
# Inject one missing value into the float array.
score3[0,0] = np.nan

In [28]:
# Plain ndarray.sum propagates NaN: one missing value makes the total nan.
score3.sum()

nan

In [29]:
# np.nansum treats NaN as zero, so the remaining values are still summed.
np.nansum(score3)

842.0

In [31]:
# Underlying NumPy array of the DataFrame; the output shows every column as
# float because of the NaN inserted earlier.
score.values

array([[88., nan, 18.],
       [16., 88., 25.],
       [ 2., 50., 83.],
       [37., 98.,  7.],
       [98., 17., 72.]])

In [33]:
# Load the four relation tables with a single pass over the workbook.
# Passing a list to sheet_name makes read_excel return a dict keyed by the
# requested sheet positions, so the file is opened and parsed once instead
# of four times. Positions are zero-based, so the first sheet (position 0)
# is intentionally not loaded, exactly as before.
relation_sheets = pd.read_excel('关系表.xlsx', sheet_name=[1, 2, 3, 4])
table1, table2, table3, table4 = (relation_sheets[pos] for pos in (1, 2, 3, 4))

In [34]:
# Sheet 1: reference price per phone model.
table1

Unnamed: 0,手机型号,参考价格
0,windowsPhone,2500
1,iPhone,7500
2,Android,4000


In [35]:
# Sheet 2: weight per phone model; it has an extra 'other' row that table1 lacks.
table2

Unnamed: 0,手机型号,重量
0,windowsPhone,0.5
1,iPhone,0.4
2,Android,0.45
3,other,0.6


In [37]:
# Sheet 3: dealers with their shipping region and phone model.
table3

Unnamed: 0,经销商,发货地区,手机型号
0,dancer,beijing,iPhone
1,lucy,beijing,Android
2,tom,guangzhou,iPhone
3,petter,shenzhen,windowsPhone
4,mery,guangzhou,Android


In [38]:
# Sheet 4: price for each (shipping region, phone model) pair.
table4

Unnamed: 0,发货地区,手机型号,价格
0,beijing,iPhone,7000
1,beijing,windowsPhone,2300
2,beijing,Android,3600
3,guangzhou,iPhone,7600
4,guangzhou,windowsPhone,2800
5,guangzhou,Android,4200
6,shenzhen,iPhone,7400
7,shenzhen,windowsPhone,2750
8,shenzhen,Android,3900


In [39]:
# One-to-one merge on the only shared column. The default inner join drops
# the 'other' row, which exists in table2 but not in table1.
table1.merge(table2)

Unnamed: 0,手机型号,参考价格,重量
0,windowsPhone,2500,0.5
1,iPhone,7500,0.4
2,Android,4000,0.45


In [40]:
# One-to-many merge: each model in table1 matches every dealer row in table3
# that sells it, so the price is repeated per dealer.
table1.merge(table3)

Unnamed: 0,手机型号,参考价格,经销商,发货地区
0,windowsPhone,2500,petter,shenzhen
1,iPhone,7500,dancer,beijing
2,iPhone,7500,tom,guangzhou
3,Android,4000,lucy,beijing
4,Android,4000,mery,guangzhou


In [43]:
# Merge on the phone model only. The shipping-region column exists in both
# tables but is not a join key, so it is kept twice and told apart by suffix.
table3.merge(table4, on='手机型号', suffixes=('_A', '_B'))

Unnamed: 0,经销商,发货地区_A,手机型号,发货地区_B,价格
0,dancer,beijing,iPhone,beijing,7000
1,dancer,beijing,iPhone,guangzhou,7600
2,dancer,beijing,iPhone,shenzhen,7400
3,tom,guangzhou,iPhone,beijing,7000
4,tom,guangzhou,iPhone,guangzhou,7600
5,tom,guangzhou,iPhone,shenzhen,7400
6,lucy,beijing,Android,beijing,3600
7,lucy,beijing,Android,guangzhou,4200
8,lucy,beijing,Android,shenzhen,3900
9,mery,guangzhou,Android,beijing,3600


In [None]:
# Quick reference (kept as comments so the cell does not raise SyntaxError
# when the notebook is run top to bottom):
# replace - substitute values
# rename  - relabel the index / column labels
# map     - map the values of one row or one column (usually a column)

In [45]:
# Copy of table1 whose key column is renamed, so the two tables no longer
# share a column label.
table5 = table1.rename({'手机型号': '型号'}, axis='columns')

In [47]:
# Render both frames from one cell to compare their key column names.
display(table2,table5)

Unnamed: 0,手机型号,重量
0,windowsPhone,0.5
1,iPhone,0.4
2,Android,0.45
3,other,0.6


Unnamed: 0,型号,参考价格
0,windowsPhone,2500
1,iPhone,7500
2,Android,4000


In [48]:
# The key columns have different names, so each side names its own key.
# Both key columns are kept in the result.
table5.merge(table2, left_on='型号', right_on='手机型号')

Unnamed: 0,型号,参考价格,手机型号,重量
0,windowsPhone,2500,windowsPhone,0.5
1,iPhone,7500,iPhone,0.4
2,Android,4000,Android,0.45


In [50]:
# Variant of table1 with the phone model moved into the row index.
table6 = table1.set_index('手机型号')

In [58]:
# Left operand of the index-based merge: the model is an ordinary column.
table2

Unnamed: 0,手机型号,重量
0,windowsPhone,0.5
1,iPhone,0.4
2,Android,0.45
3,other,0.6


In [59]:
# Right operand of the index-based merge: the model is the row index.
table6

Unnamed: 0_level_0,参考价格
手机型号,Unnamed: 1_level_1
windowsPhone,2500
iPhone,7500
Android,4000


In [60]:
# right_index (bool): use the right frame's row index as its join key.
# Here table2's model column is matched against table6's index.
table2.merge(table6, left_on='手机型号', right_index=True)

Unnamed: 0,手机型号,重量,参考价格
0,windowsPhone,0.5,2500
1,iPhone,0.4,7500
2,Android,0.45,4000


In [63]:
# Concatenation demo, first operand: a 3x3 frame of random integers in
# [0, 100) with string row labels '0'..'2'.
df1 = pd.DataFrame(
    np.random.randint(low=0, high=100, size=(3, 3)),
    index=['0', '1', '2'],
    columns=['A', 'B', 'C'],
)
df1

Unnamed: 0,A,B,C
0,48,88,12
1,30,79,74
2,44,71,91


In [64]:
# Concatenation demo, second operand: a 5x5 frame of random integers in
# [0, 100) that overlaps the first one on rows '0'..'2' and columns A..C.
df2 = pd.DataFrame(
    np.random.randint(low=0, high=100, size=(5, 5)),
    index=['0', '1', '2', '3', '4'],
    columns=['A', 'B', 'C', 'D', 'E'],
)
df2

Unnamed: 0,A,B,C,D,E
0,30,60,57,73,50
1,77,4,97,5,85
2,53,61,95,93,67
3,94,33,21,23,33
4,71,30,91,65,88


In [72]:
# Column-wise concat restricted to rows '1' and '2'.
# The `join_axes` argument was deprecated in pandas 0.25 and removed in 1.0,
# so the row selection is done with reindex instead. Reindexing the inputs
# (rather than the outer-joined result) avoids introducing NaN for the rows
# df1 lacks, which keeps df1's integer dtype as in the original output.
selected_rows = pd.Index(['1', '2'])
pd.concat((df1.reindex(selected_rows), df2.reindex(selected_rows)), join='outer', axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1,D,E
1,30,79,74,77,4,97,5,85
2,44,71,91,53,61,95,93,67


In [74]:
# Column-wise concat aligned to df2's row labels; rows missing from df1
# become NaN. `join_axes` was removed in pandas 1.0, so the alignment is
# done by reindexing the concatenated result.
pd.concat((df1, df2), join='outer', axis=1).reindex(df2.index)

Unnamed: 0,A,B,C,A.1,B.1,C.1,D,E
0,48.0,88.0,12.0,30,60,57,73,50
1,30.0,79.0,74.0,77,4,97,5,85
2,44.0,71.0,91.0,53,61,95,93,67
3,,,,94,33,21,23,33
4,,,,71,30,91,65,88


In [75]:
# Try not to destroy the column labels: with axis=1, ignore_index=True
# replaces them with 0..n-1 and the original names are lost.
# `join_axes` was removed in pandas 1.0, so rows are aligned to df2 by
# reindexing the concatenated result.
pd.concat((df1, df2), join='outer', axis=1, ignore_index=True).reindex(df2.index)

Unnamed: 0,0,1,2,3,4,5,6,7
0,48.0,88.0,12.0,30,60,57,73,50
1,30.0,79.0,74.0,77,4,97,5,85
2,44.0,71.0,91.0,53,61,95,93,67
3,,,,94,33,21,23,33
4,,,,71,30,91,65,88


In [76]:
# keys labels each input's block of columns, producing a two-level column
# index (group name, original column). `join_axes` was removed in pandas
# 1.0, so rows are aligned to df2 by reindexing the concatenated result.
pd.concat((df1, df2), join='outer', axis=1, keys=['上学期', '下学期']).reindex(df2.index)

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期,下学期,下学期
Unnamed: 0_level_1,A,B,C,A,B,C,D,E
0,48.0,88.0,12.0,30,60,57,73,50
1,30.0,79.0,74.0,77,4,97,5,85
2,44.0,71.0,91.0,53,61,95,93,67
3,,,,94,33,21,23,33
4,,,,71,30,91,65,88


# 总结

级联参数
1. axis 轴向 0~1
2. join 级联方式 inner  outer
3. join_axes [pd.Index([index1,index2....])] 指定索引为连接对象（注意：该参数在 pandas 0.25 中已弃用、1.0 中已移除，新版本请对级联结果调用 reindex 实现同样效果）
4. ignore_index True\False 开启是否忽略索引
5. keys 设置级联后分区的名称

合并参数
1. on  label，[label1,label2...] 设置合并参考列
2. left_on\right_on  label分别设置左右两个表的合并参考列，一般用于不存在相同列标签的情况
3. left_index\right_index  True,False 分别设置是否开启以左\右行索引作为合并参考列
4. how 指定合并方式 inner outer left right
5. suffixes [suffix1,suffix2] 设置没有参与合并的相同的列标签的后缀