In [12]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

两个主要的数据结构Series和DataFrame

# Series

类似于一维数组的对象，由一组数据（values）和与之对应的索引（index）组成；未指定索引时，默认索引为 0 到 N-1 的整数。

In [4]:
# Build a Series from a plain list; no index is given, so pandas supplies
# the default integer index.
obj = pd.Series([4, 7, -5, 3])

In [5]:
# Display the Series: index labels on the left, values on the right.
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
# .values exposes the data of the Series as a NumPy array.
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [7]:
# .index exposes the index object; the default index is a RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# Pass `index=` to label each value explicitly instead of relying on the
# default integer index. The `pd.` prefix matches the other constructor calls.
obj = pd.Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])

In [9]:
# The custom labels now appear in place of the default integer index.
obj

a    4
b    7
c   -5
d    3
dtype: int64

In [10]:
# Indexing with a list of labels selects several entries at once.
obj[['a', 'd']]

a    4
d    3
dtype: int64

In [11]:
# Arithmetic can be applied directly to a Series; it works element-wise and
# keeps the index.
obj * 2

a     8
b    14
c   -10
d     6
dtype: int64

In [13]:
# NumPy functions also operate element-wise on a Series and keep its index.
np.exp(obj)

a      54.598150
b    1096.633158
c       0.006738
d      20.085537
dtype: float64

In [14]:
# A dict can be passed straight to the Series constructor: the keys become
# the index and the values become the data.
sdata = {'Ohio': 35, 'Texas': 71, 'Oregon': 16, 'Utah': 50}
obj3 = pd.Series(sdata)
obj3

Ohio      35
Texas     71
Oregon    16
Utah      50
dtype: int64

In [15]:
# An explicit index can be passed together with the dict: values are taken in
# index order, a label with no matching key ('C') becomes NaN, and a key that
# is not listed ('Oregon') is left out.
states = ['C', 'Ohio', 'Utah', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

C         NaN
Ohio     35.0
Utah     50.0
Texas    71.0
dtype: float64

In [16]:
# Missing data: pd.isnull flags the NaN entries element-wise.
pd.isnull(obj4)

C         True
Ohio     False
Utah     False
Texas    False
dtype: bool

In [17]:
# pd.notnull is the element-wise opposite of pd.isnull.
pd.notnull(obj4)

C        False
Ohio      True
Utah      True
Texas     True
dtype: bool

In [18]:
# Arithmetic between two Series aligns on index labels; a label that is
# missing (or NaN) in either operand yields NaN in the result.
obj3 + obj4

C           NaN
Ohio       70.0
Oregon      NaN
Texas     142.0
Utah      100.0
dtype: float64

In [21]:
# The index of a Series can be replaced in place by assignment; the values
# keep their positions and only the labels change.
obj.index = ['b', 'c', 'a', 'd']
obj

b    4
c    7
a   -5
d    3
dtype: int64

# DataFrame

表格型数据结构，具有行索引和列索引

In [22]:
# Build a DataFrame from a dict of equal-length lists; each key becomes a
# column and the rows get the default integer index.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2003, 2001, 2002, 2003],
    'pop': [1, 2, 3, 4, 5, 6],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1
1,Ohio,2001,2
2,Ohio,2003,3
3,Nevada,2001,4
4,Nevada,2002,5
5,Nevada,2003,6


In [23]:
# head() shows only the first five rows.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1
1,Ohio,2001,2
2,Ohio,2003,3
3,Nevada,2001,4
4,Nevada,2002,5


In [24]:
# `columns` fixes the column order and `index` sets the row labels; a
# requested column that is not present in the data ('debt') is filled with NaN.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['1', '2', '3', '4', '5', '6'],
)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1,
2,2001,Ohio,2,
3,2003,Ohio,3,
4,2001,Nevada,4,
5,2002,Nevada,5,
6,2003,Nevada,6,


In [25]:
# .columns exposes the column labels as an Index object.
frame.columns

Index(['state', 'year', 'pop'], dtype='object')

In [27]:
# Dict-style access by column name returns that column as a Series that
# carries the frame's row index.
frame2['state']

1      Ohio
2      Ohio
3      Ohio
4    Nevada
5    Nevada
6    Nevada
Name: state, dtype: object

In [28]:
# Attribute-style access (frame.column) only works when the column name is a
# valid Python variable name.

In [29]:
# Rows are selected by label with the .loc attribute; the row comes back as a
# Series indexed by the column names.
frame2.loc['3']

year     2003
state    Ohio
pop         3
debt      NaN
Name: 3, dtype: object

In [30]:
# Nested dicts:
# the outer keys become the column labels and the inner keys the row labels.

# 索引对象

Index 对象不可变，因此可以在多个数据结构之间安全共享。

# 5.2 基本功能

In [32]:
# Reindexing: reindex() returns a new object arranged according to the given
# labels; a label absent from the original ('e') is filled with NaN.
print(obj)
obj2 = obj.reindex(list('abcde'))
obj2


b    4
c    7
a   -5
d    3
dtype: int64


a   -5.0
b    4.0
c    7.0
d    3.0
e    NaN
dtype: float64

In [33]:
# A small Series of strings with integer labels 1..3, used in the
# fill-on-reindex example.
obj3 = pd.Series(['b', 'p', 'y'], index=[1, 2, 3])
obj3

1    b
2    p
3    y
dtype: object

In [34]:
# Reindex and forward-fill the gaps: method='ffill' carries the last valid
# value forward, which is typically useful for ordered data such as time
# series. Label 0 has no earlier value to copy, so it stays NaN.
obj3.reindex(range(6), method='ffill')

0    NaN
1      b
2      p
3      y
4      y
5      y
dtype: object

In [35]:
# With a DataFrame, reindex can change the row index, the columns, or both.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['o', 't', 'c'],
)
frame

Unnamed: 0,o,t,c
a,0,1,2
c,3,4,5
d,6,7,8


In [36]:
# By default reindex() works on the row index; the new label 'b' gets NaN in
# every column.
frame2 = frame.reindex(list('abcd'))
frame2

Unnamed: 0,o,t,c
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [37]:
# Use the `columns` keyword to reindex the columns instead of the rows.
frame.reindex(columns=['o', 't', 'u'])

Unnamed: 0,o,t,u
a,0,1,
c,3,4,
d,6,7,


In [38]:
# Dropping entries from an axis:
# drop() returns a new object with the given labels removed from that axis.
obj = pd.Series(np.arange(5.0), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [40]:
# Drop a single label; the result is bound to a new name.
obj2 = obj.drop('c')
obj2

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [41]:
# Pass a list to drop several labels in one call.
obj3 = obj.drop(['a', 'b'])
obj3

c    2.0
d    3.0
e    4.0
dtype: float64

In [43]:
# A DataFrame can drop labels from either axis.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=list('ocun'),
    columns=list('1234'),
)
data

Unnamed: 0,1,2,3,4
o,0,1,2,3
c,4,5,6,7
u,8,9,10,11
n,12,13,14,15


In [44]:
# With the default axis, drop() removes rows by label.
data.drop(['c'])

Unnamed: 0,1,2,3,4
o,0,1,2,3
u,8,9,10,11
n,12,13,14,15


In [45]:
# The original frame still contains row 'c': drop() returned a new object.
data

Unnamed: 0,1,2,3,4
o,0,1,2,3
c,4,5,6,7
u,8,9,10,11
n,12,13,14,15


In [47]:
# Pass axis=1 or axis='columns' to drop column labels; wrap the labels in a
# list to drop several at once.
data.drop('2', axis='columns')

Unnamed: 0,1,3,4
o,0,2,3
c,4,6,7
u,8,10,11
n,12,14,15


一般情况下，drop 返回的是新对象，不会改变原来的 Series 或 DataFrame 的形状和数据；若需要直接修改原数据，可传入参数 inplace=True。

## 算术运算

In [48]:
# Two frames with different shapes and partly different columns, used to show
# how arithmetic aligns on both the row index and the column labels.
df1 = pd.DataFrame(np.arange(12.0).reshape(3, 4), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.0).reshape(4, 5), columns=list('abcde'))
print(df1, df2)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0       a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [49]:
# Plant a missing value at row 1, column 'b' to show how NaN behaves in the
# arithmetic examples.
df2.loc[1, 'b'] = np.nan

In [50]:
# Row 1, column 'b' is now NaN.
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [51]:
# Adding the frames aligns rows and columns; any position missing from one
# operand (column 'e', row 3) or already NaN becomes NaN in the result.
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [52]:
# Handle the missing values that appear when the frames are added:
# with fill_value=0 a value missing from one operand is treated as 0.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [53]:
# A 3x4 integer array used to illustrate NumPy broadcasting.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [54]:
# Indexing with a single integer returns the first row.
arr[0]

array([0, 1, 2, 3])

In [55]:
# Broadcasting: the first row is subtracted from every row of the array.
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [56]:
# A 4x3 float frame used for the DataFrame/Series broadcasting examples.
frame = pd.DataFrame(
    np.arange(12.0).reshape(4, 3),
    columns=list('bde'),
    index=list('uotc'),
)
frame

Unnamed: 0,b,d,e
u,0.0,1.0,2.0
o,3.0,4.0,5.0
t,6.0,7.0,8.0
c,9.0,10.0,11.0


In [58]:
# Take the first row by position; it comes back as a Series indexed by the
# column labels.
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: u, dtype: float64

In [59]:
# DataFrame - Series matches the Series index against the frame's columns and
# broadcasts the subtraction down the rows.
frame - series

Unnamed: 0,b,d,e
u,0.0,0.0,0.0
o,3.0,3.0,3.0
t,6.0,6.0,6.0
c,9.0,9.0,9.0


In [62]:
# To match on the rows and broadcast across the columns instead, one of the
# arithmetic methods has to be used.
series2 = frame['d']
series2.shape

(4,)

In [66]:
# axis may be given as 0 or 1; the default is 1 (columns).
# NOTE: with the default axis, series2's index (u, o, t, c) is aligned against
# frame's columns (b, d, e). No label matches, so the result is the union of
# both label sets filled entirely with NaN. To match on the rows as intended,
# call frame.sub(series2, axis=0) (equivalently axis='index').
frame.sub(series2)

Unnamed: 0,b,c,d,e,o,t,u
u,,,,,,,
o,,,,,,,
t,,,,,,,
c,,,,,,,


In [67]:
# Function application and mapping.
# NOTE(review): no random seed is set in the visible cells, so these values
# will presumably differ on every run -- confirm whether a seed is wanted.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=list('uotc'),
)
frame

Unnamed: 0,b,d,e
u,-0.155054,0.123979,0.166948
o,-0.034414,1.411601,0.995285
t,-0.607112,0.359924,0.551023
c,-0.579876,0.994882,0.806515


In [68]:
# NumPy element-wise functions work on a whole DataFrame and keep its labels.
np.abs(frame)

Unnamed: 0,b,d,e
u,0.155054,0.123979,0.166948
o,0.034414,1.411601,0.995285
t,0.607112,0.359924,0.551023
c,0.579876,0.994882,0.806515


In [69]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is any object exposing ``max()`` and ``min()`` methods, such as
    the row or column Series that ``DataFrame.apply`` passes in.
    """
    # A named function replaces the lambda assignment so tracebacks and
    # reprs show a real name.
    return x.max() - x.min()

In [71]:
# axis='columns' (the same as axis=1) calls f once per row, giving one value
# for each row label.
frame.apply(f, axis='columns')

u    0.322003
o    1.446015
t    1.158136
c    1.574758
dtype: float64

## 排序和排名