In [1]:
from pandas import Series

In [2]:
import pandas as pd
import numpy as np

In [3]:
s = Series([1,4,'ww','tt'])
s

0     1
1     4
2    ww
3    tt
dtype: object

In [4]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
s.values

array([1, 4, 'ww', 'tt'], dtype=object)

In [6]:
s2 = Series(['wangxing','man',24],index=['name','sex','age'])

In [7]:
s2

name    wangxing
sex          man
age           24
dtype: object

In [8]:
s2['name']

'wangxing'

In [9]:
s2['name'] = 'wuda'
s2

name    wuda
sex      man
age       24
dtype: object

In [10]:
sd = {'python':1,'c++':2,'c#':3}
s3 = Series(sd)
s3

python    1
c++       2
c#        3
dtype: int64

In [11]:
np.exp(s3)

python     2.718282
c++        7.389056
c#        20.085537
dtype: float64

In [12]:
s4 = Series(sd,index=['java','c++','c#'])
s4

java    NaN
c++     2.0
c#      3.0
dtype: float64

In [13]:
print( pd.isnull(s4))
print( s4.isnull())
print( s4.notnull())

java     True
c++     False
c#      False
dtype: bool
java     True
c++     False
c#      False
dtype: bool
java    False
c++      True
c#       True
dtype: bool


In [14]:
s4.index = ['语文','数学','English']
s4

语文         NaN
数学         2.0
English    3.0
dtype: float64

In [15]:
s4.name='student'
s4.index.name='study'
s4

study
语文         NaN
数学         2.0
English    3.0
Name: student, dtype: float64

In [16]:
s4 * 2

study
语文         NaN
数学         4.0
English    6.0
Name: student, dtype: float64

In [17]:
s4[s4 > 2]

study
English    3.0
Name: student, dtype: float64

In [18]:
#可以将Series看成是一个有定长的有序字典，因为它是索引值到数据值的一个映射
#因此，一些字典函数也可以在这里使用：
'English' in s4

True

In [19]:
3 in s4

False

In [20]:
3 in s4.values

True

In [21]:
from pandas import Series,DataFrame

In [22]:
#最常用的一种是直接传入一个由等长列表或numpy数组组成的字典：
data={'names':['Bob','Jane','Jack','Ann'],
       'sex':['M','F','M','F'],
       'age':[21,30,26,28]}

In [23]:
f1 = DataFrame(data)
f1

Unnamed: 0,names,sex,age
0,Bob,M,21
1,Jane,F,30
2,Jack,M,26
3,Ann,F,28


In [24]:
f1 = DataFrame(data,columns=['names','sex','age'])
f1

Unnamed: 0,names,sex,age
0,Bob,M,21
1,Jane,F,30
2,Jack,M,26
3,Ann,F,28


In [25]:
f1=DataFrame(data,columns=['names','sex','age','id'],index=['a','b','c','d'])
f1

Unnamed: 0,names,sex,age,id
a,Bob,M,21,
b,Jane,F,30,
c,Jack,M,26,
d,Ann,F,28,


In [26]:
print(f1['sex'])
print(f1.age)

a    M
b    F
c    M
d    F
Name: sex, dtype: object
a    21
b    30
c    26
d    28
Name: age, dtype: int64


In [27]:
#loc——通过行标签索引行数据
f1.loc['c']

names    Jack
sex         M
age        26
id        NaN
Name: c, dtype: object

In [28]:
#iloc--通过行号获取行数据
f1.iloc[2]

names    Jack
sex         M
age        26
id        NaN
Name: c, dtype: object

In [29]:
#注意：使用行标签时，：两边是全包括，使用行号时是左闭右开
f1.loc['b':'c']

Unnamed: 0,names,sex,age,id
b,Jane,F,30,
c,Jack,M,26,


In [30]:
f1.iloc[2:4]

Unnamed: 0,names,sex,age,id
c,Jack,M,26,
d,Ann,F,28,


In [31]:
print( f1.loc[:,['sex']]) #等效于frame1['sex']
print( f1.iloc[:,[1]] )

  sex
a   M
b   F
c   M
d   F
  sex
a   M
b   F
c   M
d   F


In [32]:
print( f1.loc[:,'sex':])
print( f1.iloc[:,1:4])

  sex  age   id
a   M   21  NaN
b   F   30  NaN
c   M   26  NaN
d   F   28  NaN
  sex  age   id
a   M   21  NaN
b   F   30  NaN
c   M   26  NaN
d   F   28  NaN


In [33]:
f1['id']=np.arange(4) #给列赋值,赋值是列表或数组时，长度必须相匹配
f1

Unnamed: 0,names,sex,age,id
a,Bob,M,21,0
b,Jane,F,30,1
c,Jack,M,26,2
d,Ann,F,28,3


In [34]:
#若赋值的是一个Series，将精确匹配DataFrame的索引，空位将补上缺失值
f1['id'] = Series([11,12],index=['a','c'])
f1

Unnamed: 0,names,sex,age,id
a,Bob,M,21,11.0
b,Jane,F,30,
c,Jack,M,26,12.0
d,Ann,F,28,


In [35]:
#为不存在的列赋一个新列
f1['Female'] = f1.sex=='F'
f1

Unnamed: 0,names,sex,age,id,Female
a,Bob,M,21,11.0,False
b,Jane,F,30,,True
c,Jack,M,26,12.0,False
d,Ann,F,28,,True


In [36]:
#删除列数据
del f1['Female']
f1

Unnamed: 0,names,sex,age,id
a,Bob,M,21,11.0
b,Jane,F,30,
c,Jack,M,26,12.0
d,Ann,F,28,


In [37]:
pop={'Nevada':{2001:2.4,2002:2.9},'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame2=DataFrame(pop)

In [38]:
frame2

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [39]:
frame2.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [40]:
frame2.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [41]:
obj = Series([1.2,3.4,-7.8,5.6],index = ['d','a','c','b'])
obj

d    1.2
a    3.4
c   -7.8
b    5.6
dtype: float64

In [42]:
#rreindex会根据新索引重新排列，若索引值不存在，则引入缺省值
obj.reindex(['a','b','c','d','e']) 

a    3.4
b    5.6
c   -7.8
d    1.2
e    NaN
dtype: float64

In [43]:
obj.reindex( ['a','b','c','d','e'], fill_value = 0) #结果中的NaN将变成0

a    3.4
b    5.6
c   -7.8
d    1.2
e    0.0
dtype: float64

In [44]:
obj1 = Series(['yellow','pink','blue'],index=[0,2,4])
obj1

0    yellow
2      pink
4      blue
dtype: object

In [45]:
obj1.reindex(range(6),method='ffill')

0    yellow
1    yellow
2      pink
3      pink
4      blue
5      blue
dtype: object

In [46]:
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','c'],\
                columns=['US','UK','CN'])
frame

Unnamed: 0,US,UK,CN
a,0,1,2
b,3,4,5
c,6,7,8


In [47]:
#使用columns关键字即可重新索引列
frame1 = frame.reindex(['a','b','c','e'])
frame1

Unnamed: 0,US,UK,CN
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
e,,,


In [48]:
states=['US','HK','CN']
frame.reindex( columns = states)
frame

Unnamed: 0,US,UK,CN
a,0,1,2
b,3,4,5
c,6,7,8


In [49]:
#可以同时对行和列进行索引，但插值只能按照行（即轴0）应用
frame.reindex( index = ['a','b','c','e'], method = 'ffill',columns = states)

Unnamed: 0,US,HK,CN
a,0,1,2
b,3,4,5
c,6,7,8
e,6,7,8


In [50]:
obj = Series([1.2,3.4,-7.8,5.6],index=['d','a','c','b'])
obj

d    1.2
a    3.4
c   -7.8
b    5.6
dtype: float64

In [51]:
obj.drop('c')               #返回的是新对象，obj没变

d    1.2
a    3.4
b    5.6
dtype: float64

In [52]:
obj.drop(['a','d'])

c   -7.8
b    5.6
dtype: float64

In [53]:
#对于DataFrame，可以删除任意轴上的索引值：
frame=DataFrame(np.arange(18).reshape((6,3)),index=['a','b','c','a','b','c'],columns=['US','UK','CN'])
frame

Unnamed: 0,US,UK,CN
a,0,1,2
b,3,4,5
c,6,7,8
a,9,10,11
b,12,13,14
c,15,16,17


In [54]:
print(type(frame))
print(type(frame.UK))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [55]:
frame.head() #显示前五行

Unnamed: 0,US,UK,CN
a,0,1,2
b,3,4,5
c,6,7,8
a,9,10,11
b,12,13,14


In [56]:
frame.tail() #显示后五行

Unnamed: 0,US,UK,CN
b,3,4,5
c,6,7,8
a,9,10,11
b,12,13,14
c,15,16,17


In [57]:
frame.drop('a')   #默认axis=0  ==  axis=1

Unnamed: 0,US,UK,CN
b,3,4,5
c,6,7,8
b,12,13,14
c,15,16,17


In [58]:
frame.drop(['US','UK'],axis=1) #axis= 'columns'

Unnamed: 0,CN
a,2
b,5
c,8
a,11
b,14
c,17


In [59]:
frame

Unnamed: 0,US,UK,CN
a,0,1,2
b,3,4,5
c,6,7,8
a,9,10,11
b,12,13,14
c,15,16,17


In [60]:
frame.drop(['a'], inplace = True)

In [61]:
frame

Unnamed: 0,US,UK,CN
b,3,4,5
c,6,7,8
b,12,13,14
c,15,16,17


In [62]:
s1 = Series([1.2,3.4,-7.8,5.6],index=['d','a','c','b'])
s2 = Series([1.1,2.2,3.3,4.4,5.5,6.6],index=['a','b','c','d','e','f'])

s1+s2     #在不重叠的地方引入了NA值，且缺失值会在算术运算过程中传播

a    4.5
b    7.8
c   -4.5
d    5.6
e    NaN
f    NaN
dtype: float64

In [63]:
newdata = {"lang":{"firstline":"python","secondline":"java"},\
           "price":{"firstline":8000}} 
f4 = DataFrame(newdata) 
f4 

Unnamed: 0,lang,price
firstline,python,8000.0
secondline,java,


In [64]:
DataFrame(newdata, index=["firstline","secondline","thirdline"])

Unnamed: 0,lang,price
firstline,python,8000.0
secondline,java,
thirdline,,


In [65]:
newdata1 = {'username':{'first':'wangxing','second':'dadiao'},\
            'age':{'first':24,'second':25}}
f6 = DataFrame(newdata1,columns=['username','age','sex'])
f6

Unnamed: 0,username,age,sex
first,wangxing,24,
second,dadiao,25,


In [66]:
f6['sex'] = 'man'
f6

Unnamed: 0,username,age,sex
first,wangxing,24,man
second,dadiao,25,man


In [67]:
ssex = Series(['男','女'],index=['first','second'])
ssex

first     男
second    女
dtype: object

In [68]:
f6['sex'] = ssex
f6

Unnamed: 0,username,age,sex
first,wangxing,24,男
second,dadiao,25,女


In [69]:
f6['age']['second'] = 30
f6

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f6['age']['second'] = 30


Unnamed: 0,username,age,sex
first,wangxing,24,男
second,dadiao,30,女


In [70]:
data = pd.DataFrame(
    np.arange(16).reshape((4,4)),
    index=['湖南','湖北','山东','山西'],
    columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
湖南,0,1,2,3
湖北,4,5,6,7
山东,8,9,10,11
山西,12,13,14,15


In [71]:
data['two']

湖南     1
湖北     5
山东     9
山西    13
Name: two, dtype: int32

In [72]:
data[['two','three']]

Unnamed: 0,two,three
湖南,1,2
湖北,5,6
山东,9,10
山西,13,14


In [73]:
data[:2]

Unnamed: 0,one,two,three,four
湖南,0,1,2,3
湖北,4,5,6,7


In [74]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
湖北,4,5,6,7
山东,8,9,10,11
山西,12,13,14,15


In [75]:
data['three']>5

湖南    False
湖北     True
山东     True
山西     True
Name: three, dtype: bool

In [76]:
data>5

Unnamed: 0,one,two,three,four
湖南,False,False,False,False
湖北,False,False,True,True
山东,True,True,True,True
山西,True,True,True,True


In [77]:
data[data<5] = 0
data

Unnamed: 0,one,two,three,four
湖南,0,0,0,0
湖北,0,5,6,7
山东,8,9,10,11
山西,12,13,14,15


In [78]:
data.loc['山西',['two','three']] # arr[[],[]]

two      13
three    14
Name: 山西, dtype: int32

In [79]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: 山东, dtype: int32

In [80]:
data.iloc[2] # arr[[2][:]]

one       8
two       9
three    10
four     11
Name: 山东, dtype: int32

In [81]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
湖北,7,0,5
山东,11,8,9


In [82]:
data.loc[:'山东','two']

湖南    0
湖北    5
山东    9
Name: two, dtype: int32

In [83]:
data.iloc[:,:3][data.three>5]

Unnamed: 0,one,two,three
湖北,0,5,6
山东,8,9,10
山西,12,13,14


In [84]:
data.iloc[:,:3]

Unnamed: 0,one,two,three
湖南,0,0,0
湖北,0,5,6
山东,8,9,10
山西,12,13,14


In [85]:
frame = pd.DataFrame(np.random.randn(4,3),columns=list('bde'),\
                     index=['河南','河北','山东','山西'])
frame

Unnamed: 0,b,d,e
河南,1.055745,-0.975849,-0.321863
河北,-0.983724,-0.82395,-2.181121
山东,0.554216,0.313783,-1.328829
山西,2.142301,-1.261388,-1.950661


In [86]:
np.abs(frame)

Unnamed: 0,b,d,e
河南,1.055745,0.975849,0.321863
河北,0.983724,0.82395,2.181121
山东,0.554216,0.313783,1.328829
山西,2.142301,1.261388,1.950661


In [87]:
f = lambda x:x.max()-x.min()
frame.apply(f,axis='index')

b    3.126025
d    1.575171
e    1.859258
dtype: float64

In [88]:
frame.apply(f,axis='columns')

河南    2.031594
河北    1.357171
山东    1.883044
山西    4.092962
dtype: float64

In [89]:
def f(x):
    return pd.Series([x.min(),x.max()],index=['min','max'])

In [90]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.983724,-1.261388,-2.181121
max,2.142301,0.313783,-0.321863


In [91]:
fmt = lambda x:'%.2f' % x

In [92]:
frame.applymap(fmt)

Unnamed: 0,b,d,e
河南,1.06,-0.98,-0.32
河北,-0.98,-0.82,-2.18
山东,0.55,0.31,-1.33
山西,2.14,-1.26,-1.95


In [93]:
frame['e'].map(fmt)

河南    -0.32
河北    -2.18
山东    -1.33
山西    -1.95
Name: e, dtype: object

In [94]:
obj = pd.Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [95]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [96]:
uniques.sort()
uniques

array(['a', 'b', 'c', 'd'], dtype=object)