In [6]:
import pandas as pd
import numpy as np
#Series对象可以通过类似一维数组的结构生成：
a=pd.Series([1,2,3,4])
a

0    1
1    2
2    3
3    4
dtype: int64

左栏是该Series对象的标记，即index参数需要指定的内容；右边是对应的数据。在不指定index参数的情况下，标记默认是RangeIndex(n)，其中n是data的长度。标记可以用.index属性查看：

In [7]:
a.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
#可以用标记来索引对应位置的值：
a[0]

1

In [9]:
#Series对象的标记类似于字典，因此与数组不同的是Series不支持负数索引。类似于字典意味着标记可以不是整数。
a=pd.Series([1,2,3,4],index=['a','b','c','d'])
a

a    1
b    2
c    3
d    4
dtype: int64

In [10]:
a['b']

2

In [11]:
#Series对象类似于字典，所以也可以通过字典生成，在不给定index参数的情况下，标记默认为字典的键，并按照字典中键的顺序进行排列：
d={'c':3,'b':2,'a':1}
pd.Series(d)

c    3
b    2
a    1
dtype: int64

如果指定了index参数，Pandas会按照参数指定的顺序从字典中依次读取相应的值，并让不存在的键对应np.nan：

In [12]:
a=pd.Series(d,index=['c','d','b','e'])
a

c    3.0
d    NaN
b    2.0
e    NaN
dtype: float64

In [13]:
a['c']

3.0

In [14]:
a['d']

nan

In [15]:
type(a['d'])

numpy.float64

In [16]:
pd.Series(5,index=range(3))

0    5
1    5
2    5
dtype: int64

In [17]:
pd.Series(5,index=['a','b','c','d'])

a    5
b    5
c    5
d    5
dtype: int64

In [18]:
#Series对象可以从数组中生成，也支持一些数组的操作。
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
s

a    1.116215
b    0.636358
c   -0.931787
d   -1.351702
e   -0.533844
dtype: float64

In [19]:
#虽然标记不是数字，仍然可以像数组一样按照位置顺序对它进行索引：
s[0]

1.1162150118266738

In [20]:
s[:3]

a    1.116215
b    0.636358
c   -0.931787
dtype: float64

In [21]:
s[s>s.median()]

a    1.116215
b    0.636358
dtype: float64

In [22]:
s[[4,3,1]]

e   -0.533844
d   -1.351702
b    0.636358
dtype: float64

In [23]:
np.exp(s)

a    3.053276
b    1.889586
c    0.393849
d    0.258799
e    0.586346
dtype: float64

In [24]:
s['a']

1.1162150118266738

In [25]:
s['e']=12

In [26]:
'e' in s

True

In [27]:
0 in s

False

In [28]:
s.get('f')

In [29]:
s.get('f',np.nan)

nan

In [30]:
s+s

a     2.232430
b     1.272716
c    -1.863574
d    -2.703404
e    24.000000
dtype: float64

In [31]:
s*2

a     2.232430
b     1.272716
c    -1.863574
d    -2.703404
e    24.000000
dtype: float64

s[1：]的标记为b～e，而s[：-1]的标记为a～d，它们相加时，会先对两个Series中各自独有的部分补上np.nan，然后再相加，从而得到：

In [32]:
s[1:]+s[:-1]

a         NaN
b    1.272716
c   -1.863574
d   -2.703404
e         NaN
dtype: float64

In [33]:
s1=pd.Series([1,2,3],index=['a','b','c'])
s2=pd.Series([1.,2.,3.,4.],index=['a','b','c','d'])
d={'one':s1,'two':s2}
df=pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [34]:
#df的列标记是传入字典的键，可以用属性.columns查看：
df.columns

Index(['one', 'two'], dtype='object')

In [35]:
#行标记是两个Series对象标记的并集，Pandas会自动将两个Series对象的标记对齐：
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [36]:
pd.DataFrame(d,index=['d','b','a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [37]:
pd.DataFrame(d,index=['d','b','a'],columns=['two','three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [38]:
d={'one':[1,2,3,4],'tow':[4,3,2,1]}
pd.DataFrame(d)

Unnamed: 0,one,tow
0,1,4
1,2,3
2,3,2
3,4,1


In [39]:
pd.DataFrame(d,index=['a','b','c','d'])

Unnamed: 0,one,tow
a,1,4
b,2,3
c,3,2
d,4,1


In [40]:
data=[{'a':1,'b':2},{'a':5,'b':10,'c':20}]
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [41]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [42]:
df['three']=df['one']*df['two']
df['flag']=df['one']>2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [43]:
df['four']=4
df

Unnamed: 0,one,two,three,flag,four
a,1.0,1.0,1.0,False,4
b,2.0,2.0,4.0,False,4
c,3.0,3.0,9.0,True,4
d,,4.0,,False,4


In [44]:
del df['two']
three=df.pop('three')
df

Unnamed: 0,one,flag,four
a,1.0,False,4
b,2.0,False,4
c,3.0,True,4
d,,False,4


In [45]:
df['foo']=pd.Series([1,2,3],index=['a','d','e'])
df

Unnamed: 0,one,flag,four,foo
a,1.0,False,4,1.0
b,2.0,False,4,
c,3.0,True,4,
d,,False,4,2.0


In [46]:
df.insert(1,'bar',df['one'])
df

Unnamed: 0,one,bar,flag,four,foo
a,1.0,1.0,False,4,1.0
b,2.0,2.0,False,4,
c,3.0,3.0,True,4,
d,,,False,4,2.0


In [47]:
df.loc['b']

one       2.0
bar       2.0
flag    False
four        4
foo       NaN
Name: b, dtype: object

In [48]:
df.iloc[1]

one       2.0
bar       2.0
flag    False
four        4
foo       NaN
Name: b, dtype: object

In [49]:
df1=pd.DataFrame(np.random.randn(10,4),columns=['A','B','C','D'])
df2=pd.DataFrame(np.random.randn(7,3),columns=['A','B','C'])
df1+df2

Unnamed: 0,A,B,C,D
0,-1.762835,2.394626,-1.575875,
1,0.74955,-0.979762,-0.714256,
2,0.206636,-0.804484,2.808644,
3,-1.539564,0.223176,0.759566,
4,1.717665,-1.659796,0.505994,
5,-0.889913,-1.149184,0.330504,
6,0.849886,-1.587782,-1.206516,
7,,,,
8,,,,
9,,,,


In [50]:
df1-df1.iloc[0]#将df1的所有行都减去df1的第一行df1.iloc[0]：

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,0.907885,-1.508271,-0.997092,2.982307
2,0.268136,-1.468073,1.224291,1.193463
3,1.107913,-0.330962,1.263959,0.899993
4,1.943541,-2.356,0.848792,0.109591
5,1.285741,0.061007,1.277521,-1.034694
6,1.735306,-2.576701,-0.840028,1.049261
7,0.24382,-1.806338,1.417799,-1.81806
8,0.946799,-1.52425,0.662337,-0.240212
9,1.553832,-0.71594,-0.376633,1.207785


In [51]:
#日期时间序列可以用函数pd.date_range()创建：
dates=pd.date_range('1/1/2000',periods=8)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [52]:
df=pd.DataFrame(np.random.randn(8,4),
               index=dates,
               columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,0.343844,0.277137,-0.475641,0.981414
2000-01-02,-0.2208,-0.442019,-1.355112,-0.428871
2000-01-03,0.438719,-0.479836,-2.230672,-0.24241
2000-01-04,0.531423,-1.776272,-1.458472,-0.092026
2000-01-05,0.623262,-0.395045,-0.648811,1.096107
2000-01-06,-1.226499,0.421514,0.385189,0.738806
2000-01-07,-1.67319,0.861694,1.514319,-1.107508
2000-01-08,1.634175,-0.101921,1.199405,-0.26688


In [53]:
df.head()

Unnamed: 0,A,B,C,D
2000-01-01,0.343844,0.277137,-0.475641,0.981414
2000-01-02,-0.2208,-0.442019,-1.355112,-0.428871
2000-01-03,0.438719,-0.479836,-2.230672,-0.24241
2000-01-04,0.531423,-1.776272,-1.458472,-0.092026
2000-01-05,0.623262,-0.395045,-0.648811,1.096107


In [54]:
df.tail(2)

Unnamed: 0,A,B,C,D
2000-01-07,-1.67319,0.861694,1.514319,-1.107508
2000-01-08,1.634175,-0.101921,1.199405,-0.26688


In [55]:
s=df['A']
s[dates[5]]

-1.226498954734058

In [56]:
df[['A','B']]

Unnamed: 0,A,B
2000-01-01,0.343844,0.277137
2000-01-02,-0.2208,-0.442019
2000-01-03,0.438719,-0.479836
2000-01-04,0.531423,-1.776272
2000-01-05,0.623262,-0.395045
2000-01-06,-1.226499,0.421514
2000-01-07,-1.67319,0.861694
2000-01-08,1.634175,-0.101921


In [57]:
df.C

2000-01-01   -0.475641
2000-01-02   -1.355112
2000-01-03   -2.230672
2000-01-04   -1.458472
2000-01-05   -0.648811
2000-01-06    0.385189
2000-01-07    1.514319
2000-01-08    1.199405
Freq: D, Name: C, dtype: float64

In [58]:
sa=pd.Series([1,2,3],index=list('abc'))
sa.a

1

In [59]:
s[:5]

2000-01-01    0.343844
2000-01-02   -0.220800
2000-01-03    0.438719
2000-01-04    0.531423
2000-01-05    0.623262
Freq: D, Name: A, dtype: float64

In [60]:
s[::2]

2000-01-01    0.343844
2000-01-03    0.438719
2000-01-05    0.623262
2000-01-07   -1.673190
Freq: 2D, Name: A, dtype: float64

In [61]:
s[::-1]

2000-01-08    1.634175
2000-01-07   -1.673190
2000-01-06   -1.226499
2000-01-05    0.623262
2000-01-04    0.531423
2000-01-03    0.438719
2000-01-02   -0.220800
2000-01-01    0.343844
Freq: -1D, Name: A, dtype: float64

In [62]:
#DataFrame也支持切片操作，与索引不同的是，DataFrame的切片是对行进行操作：
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,0.343844,0.277137,-0.475641,0.981414
2000-01-02,-0.2208,-0.442019,-1.355112,-0.428871
2000-01-03,0.438719,-0.479836,-2.230672,-0.24241


In [63]:
dates=pd.date_range('1/1/2000',periods=8)
df=pd.DataFrame(np.random.randn(8,4),
               index=dates,
               columns=['A','B','C','D'])
#索引第二行的数据：
df.loc[dates[1]]

A   -1.897348
B   -1.662094
C   -0.379308
D    0.557469
Name: 2000-01-02 00:00:00, dtype: float64

In [64]:
#日期类型还使用一个字符串进行索引，只要这个字符串符合日期的格式
df.loc['20000102']

A   -1.897348
B   -1.662094
C   -0.379308
D    0.557469
Name: 2000-01-02 00:00:00, dtype: float64

In [65]:
df.loc['20000102':'20000104']

Unnamed: 0,A,B,C,D
2000-01-02,-1.897348,-1.662094,-0.379308,0.557469
2000-01-03,-1.145021,1.524567,-0.907224,-1.604204
2000-01-04,-0.117576,0.541529,-0.195564,0.243901


In [66]:
df.loc['20000101','A']

-0.08574397033703259

In [67]:
df.loc['20000101',['A','C']]

A   -0.085744
C   -0.122739
Name: 2000-01-01 00:00:00, dtype: float64

In [68]:
df.loc['20000107':,['A','C']]

Unnamed: 0,A,C
2000-01-07,1.012364,0.125299
2000-01-08,0.80568,0.805166


.loc属性基于标记对DataFrame对象进行索引，而.iloc属性则基于位置对DataFrame对象进行索引.

In [69]:
df.iloc[1]

A   -1.897348
B   -1.662094
C   -0.379308
D    0.557469
Name: 2000-01-02 00:00:00, dtype: float64

In [70]:
df.iloc[1,0]

-1.8973483967162694

In [71]:
df.iloc[1:3,[0,3]]
#与.loc属性不同的是，.iloc的切片不包含最后一个元素。

Unnamed: 0,A,D
2000-01-02,-1.897348,0.557469
2000-01-03,-1.145021,-1.604204


In [72]:
#如果只需要索引单个值，最快速的方法是使用.at索引标记：
df.at[dates[5],'A']

0.060246130533985955

In [73]:
df.iat[5,0]

0.060246130533985955

In [74]:
s=df['A']
s[s>0]

2000-01-06    0.060246
2000-01-07    1.012364
2000-01-08    0.805680
Freq: D, Name: A, dtype: float64

In [75]:
#还可以使用取反符号“～”，如使用取反表达式“～（s>0）”进行索引，得到所有小于或等于0的值：
s[~(s>0)]

2000-01-01   -0.085744
2000-01-02   -1.897348
2000-01-03   -1.145021
2000-01-04   -0.117576
2000-01-05   -0.696077
Freq: D, Name: A, dtype: float64

In [76]:
df[df['A']>0]

Unnamed: 0,A,B,C,D
2000-01-06,0.060246,2.413598,-1.712647,1.713738
2000-01-07,1.012364,1.787156,0.125299,-0.357188
2000-01-08,0.80568,-0.710984,0.805166,0.62691


In [77]:
s=pd.Series([4,4,1,2,3])

In [78]:
#调用.isin()方法可以检查Series对象的每个值是否在给定的序列中：
s.isin([3,4,6])

0     True
1     True
2    False
3    False
4     True
dtype: bool

In [79]:
s[s.isin([3,4,6])]

0    4
1    4
4    3
dtype: int64

In [80]:
df=pd.DataFrame(np.random.randn(4,5),columns=list('abcde'))
df.index=pd.date_range('20000101',periods=4)

In [81]:
df.iloc[[2,3],[3,4]]=np.nan
df

Unnamed: 0,a,b,c,d,e
2000-01-01,-0.629345,-0.927768,1.667299,1.178359,-2.225056
2000-01-02,1.246828,-2.488568,-0.990801,0.536726,-0.566698
2000-01-03,-1.290779,-0.057105,-0.055299,,
2000-01-04,-0.413815,-0.132468,0.86225,,


In [82]:
#可以使用.dropna()方法去掉所有包含缺失值的行
df.dropna(how='any')

Unnamed: 0,a,b,c,d,e
2000-01-01,-0.629345,-0.927768,1.667299,1.178359,-2.225056
2000-01-02,1.246828,-2.488568,-0.990801,0.536726,-0.566698


这里，how参数设为“any”表示只要该行有缺失值就会被去掉，如果换成“all”，则表示只有该行全部缺失时才会被去掉。.dropna()方法还可以通过axis参数指定对行还是对列进行操作，默认值为0，即对行；如果要对列进行操作，可以将axis参数设为1：

In [83]:
df.dropna(axis=1,how='any')

Unnamed: 0,a,b,c
2000-01-01,-0.629345,-0.927768,1.667299
2000-01-02,1.246828,-2.488568,-0.990801
2000-01-03,-1.290779,-0.057105,-0.055299
2000-01-04,-0.413815,-0.132468,0.86225


In [84]:
df.fillna(value=100)

Unnamed: 0,a,b,c,d,e
2000-01-01,-0.629345,-0.927768,1.667299,1.178359,-2.225056
2000-01-02,1.246828,-2.488568,-0.990801,0.536726,-0.566698
2000-01-03,-1.290779,-0.057105,-0.055299,100.0,100.0
2000-01-04,-0.413815,-0.132468,0.86225,100.0,100.0


In [85]:
df

Unnamed: 0,a,b,c,d,e
2000-01-01,-0.629345,-0.927768,1.667299,1.178359,-2.225056
2000-01-02,1.246828,-2.488568,-0.990801,0.536726,-0.566698
2000-01-03,-1.290779,-0.057105,-0.055299,,
2000-01-04,-0.413815,-0.132468,0.86225,,


In [86]:
df=pd.DataFrame(np.random.randn(3,5),
               index=list('abc'),
               columns=list('ABCDE'))
df

Unnamed: 0,A,B,C,D,E
a,-1.611927,0.370541,0.410944,-0.388926,0.036339
b,-1.421891,0.944304,-1.86789,0.179481,-1.038056
c,-0.518938,-0.905686,-0.516223,0.582732,0.867957


In [87]:
df.to_csv('foo.csv')

In [88]:
pd.read_csv('foo.csv')

Unnamed: 0.1,Unnamed: 0,A,B,C,D,E
0,a,-1.611927,0.370541,0.410944,-0.388926,0.036339
1,b,-1.421891,0.944304,-1.86789,0.179481,-1.038056
2,c,-0.518938,-0.905686,-0.516223,0.582732,0.867957


In [89]:
pd.read_csv('foo.csv',index_col=0)
#参数index_col指定行标记所在的列：

Unnamed: 0,A,B,C,D,E
a,-1.611927,0.370541,0.410944,-0.388926,0.036339
b,-1.421891,0.944304,-1.86789,0.179481,-1.038056
c,-0.518938,-0.905686,-0.516223,0.582732,0.867957


In [90]:
#也可以在保存的时候，忽略行标记：
df.to_csv('fo.csv',index=False)
pd.read_csv('fo.csv')

Unnamed: 0,A,B,C,D,E
0,-1.611927,0.370541,0.410944,-0.388926,0.036339
1,-1.421891,0.944304,-1.86789,0.179481,-1.038056
2,-0.518938,-0.905686,-0.516223,0.582732,0.867957


In [91]:
url='http://www.shfe.com.cn/historyData/MarketData_Year_2021.zip'
import urllib.request
urllib.request.urlretrieve(url,'2021.zip')

RemoteDisconnected: Remote end closed connection without response

In [93]:
import zipfile
f=zipfile.ZipFile('2021.zip')
f.namelist()

['╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.01.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.02.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.03.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.04.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.05.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.06.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.07.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.08.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.09.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.10.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.11.xls',
 '╦∙─┌║╧╘╝╨╨╟Θ▒¿▒φ2021.12.xls']

In [94]:
print(f.namelist()[0].encode('cp437').decode('gbk'))

所内合约行情报表2021.01.xls


In [5]:
import pandas as pd
pdi = pd.Index([1,2,3,4,5],name = '001')
print(pdi.values)
print(pdi.dtype)
print(pdi.array)
print(pdi.shape)
lst1 = pdi.to_list()
print(type(lst1))
ps1 = pdi.to_series()
print(ps1)

[1 2 3 4 5]
int64
<PandasArray>
[1, 2, 3, 4, 5]
Length: 5, dtype: int64
(5,)
<class 'list'>
001
1    1
2    2
3    3
4    4
5    5
Name: 001, dtype: int64


In [6]:
df1 = pdi.to_frame()
print(df1)

     001
001     
1      1
2      2
3      3
4      4
5      5


In [7]:
print(df1.shape)

(5, 1)


In [8]:
import numpy as np
import pandas as pd
from pandas import Series
print('空的序列对象：')
emptySeries = pd.Series()
print(emptySeries)

空的序列对象：
Series([], dtype: float64)


  emptySeries = pd.Series()


In [9]:
npArray = np.random.randn(5)
rnds = pd.Series(npArray,index = ['a','b','c','d','e'])
print(rnds)

a    0.895215
b   -0.399217
c   -0.522717
d    1.969658
e    0.165384
dtype: float64


In [10]:
datalist = [81.,77.,99.]
s = pd.Series(datalist)
print(s)

0    81.0
1    77.0
2    99.0
dtype: float64


In [12]:
scores = Series(data=[81.,77.,99.],index=['Math','English','Chinese'])
print(scores)

Math       81.0
English    77.0
Chinese    99.0
dtype: float64


In [13]:
datadict = {'Math':81.,'English':77.,'Chinese':99.}
scores = pd.Series(datadict)
print(scores)

Math       81.0
English    77.0
Chinese    99.0
dtype: float64


In [14]:
subscores = pd.Series(datadict,index=['English','Chinese'])
print(subscores)

English    77.0
Chinese    99.0
dtype: float64


In [16]:
s = pd.Series(5)
print(s)

0    5
dtype: int64


In [17]:
s = pd.Series(5,index = [1,2,3,4,5])
print(s)

1    5
2    5
3    5
4    5
5    5
dtype: int64


In [18]:
ps = pd.Series([11,21,31,41,51],index=['a','b','c','d','e'])
print(ps)

a    11
b    21
c    31
d    41
e    51
dtype: int64


In [19]:
ps['d']

41

In [20]:
ps[3]

41

In [21]:
print(ps[1:4])#和上面不同，这里有索引

b    21
c    31
d    41
dtype: int64


In [22]:
print(ps[[1,3]])

b    21
d    41
dtype: int64


In [23]:
print(ps[['b','d']])

b    21
d    41
dtype: int64


In [24]:
print(ps.loc['a':'c']) #包括结束位置的元素

a    11
b    21
c    31
dtype: int64


In [26]:
print(ps.iloc[1:3]) #不包括结束位置的元素

b    21
c    31
dtype: int64


In [27]:
print(ps[ps>30])

c    31
d    41
e    51
dtype: int64


In [29]:
ps[ps>30] = 0
print(ps)

a    11
b    21
c     0
d     0
e     0
dtype: int64


In [31]:
data = {'one':pd.Series([1.,2.,3.],index=['a','b','c']),
       'two':pd.Series([1.,2.,3.,4.],index=['a','b','c','d'])}
df = pd.DataFrame(data)
print(df)

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0


In [32]:
df = pd.DataFrame(data,index=['d','b','a'])
print(df)

   one  two
d  NaN  4.0
b  2.0  2.0
a  1.0  1.0


In [34]:
df = pd.DataFrame(data,index=['d','b','a'],columns=['two','three'])
print(df)

   two three
d  4.0   NaN
b  2.0   NaN
a  1.0   NaN


In [35]:
data = {'one':[1.,2.,3.,4.],'two':[4.,3.,2.,1]}
df = pd.DataFrame(data)
print(df)

   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0


In [36]:
df = pd.DataFrame(data,index=['a','b','c','d'])
print(df)

   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0


In [37]:
data = np.zeros((3,),dtype=[('A','i4'),('B','f4'),('C','a10')])
print(data) #列名和数据类型

[(0, 0., b'') (0, 0., b'') (0, 0., b'')]


In [38]:
data[:] = [(1,2.,'hello'),(2,3.,'world'),(3,4.,'langh')]
print(data)

[(1, 2., b'hello') (2, 3., b'world') (3, 4., b'langh')]


In [40]:
df = pd.DataFrame(data)
print(df)

   A    B         C
0  1  2.0  b'hello'
1  2  3.0  b'world'
2  3  4.0  b'langh'


In [41]:
df = pd.DataFrame(data,index=['first','second','third'])
print(df)

        A    B         C
first   1  2.0  b'hello'
second  2  3.0  b'world'
third   3  4.0  b'langh'


In [42]:
df = pd.DataFrame(data,columns=['C','A','B'])
print(df)

          C  A    B
0  b'hello'  1  2.0
1  b'world'  2  3.0
2  b'langh'  3  4.0


In [44]:
data = [{'a':1,'b':2},{'a':5,'b':10,'c':20}]
df = pd.DataFrame(data)
print(df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [46]:
df = pd.DataFrame(data,index=['first','second'])
print(df)

        a   b     c
first   1   2   NaN
second  5  10  20.0


In [48]:
df = pd.DataFrame(data,columns=['a','b'])
print(df)

   a   b
0  1   2
1  5  10


In [51]:
df = pd.DataFrame({('a','b'):{('A','B'):1,('A','C'):2},
                  ('a','a'):{('A','C'):3,('A','B'):4},
                  ('a','c'):{('A','B'):5,('A','C'):6},
                  ('b','a'):{('A','C'):7,('A','B'):8},
                  ('b','b'):{('A','D'):9,('A','B'):10}})
print(df)                

       a              b      
       b    a    c    a     b
A B  1.0  4.0  5.0  8.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0


In [52]:
type(df.index)

pandas.core.indexes.multi.MultiIndex

In [53]:
ps = pd.Series([1,2,3],index = ['one','two','three'])
print(ps)

one      1
two      2
three    3
dtype: int64


In [54]:
df = pd.DataFrame(ps,columns=['colName'])
print(df)

       colName
one          1
two          2
three        3


In [55]:
df = pd.DataFrame.from_dict(dict([('A',[1,2,3]),('B',[4,5,6])]))
print(df)

   A  B
0  1  4
1  2  5
2  3  6


In [57]:
df = pd.DataFrame.from_dict(dict([('A',[1,2,3]),('B',[4,5,6])]),orient='index',columns=['one','two','three'])
print(df)         #orient决定键值作为行标签还是列标签

   one  two  three
A    1    2      3
B    4    5      6


In [58]:
data = np.zeros((2,),dtype=[('A','i4'),('B','f4'),('C','a10')])
data[:] = [(1,2.,'hello'),(2,3.,'world')]
df = pd.DataFrame.from_records(data)
print(df)

   A    B         C
0  1  2.0  b'hello'
1  2  3.0  b'world'


In [59]:
df = pd.DataFrame.from_records([(1,3,7,0,3,6),(3,1,4,1,5,9)],
                              columns=list('abcABC'),index=list('abc'))
print(df)

       A  B  C
a b c         
1 3 7  0  3  6
3 1 4  1  5  9


In [62]:
data = {'one':pd.Series([1.,2.,3.],index=['a','b','c']),
       'two':pd.Series([1.,2.,3.,4.],index=['a','b','c','d'])}
df = pd.DataFrame(data)
print(df)

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0


In [63]:
df['three'] = df['one']*df['two']
df['flag'] = df['one']>2
print(df)

   one  two  three   flag
a  1.0  1.0    1.0  False
b  2.0  2.0    4.0  False
c  3.0  3.0    9.0   True
d  NaN  4.0    NaN  False


In [64]:
df['foo'] = 'bar'
print(df)

   one  two  three   flag  foo
a  1.0  1.0    1.0  False  bar
b  2.0  2.0    4.0  False  bar
c  3.0  3.0    9.0   True  bar
d  NaN  4.0    NaN  False  bar


In [67]:
df.insert(1,'ba',df['one'])
print(df)

   one   ba  bar  two  three   flag  foo
a  1.0  1.0  1.0  1.0    1.0  False  bar
b  2.0  2.0  2.0  2.0    4.0  False  bar
c  3.0  3.0  3.0  3.0    9.0   True  bar
d  NaN  NaN  NaN  4.0    NaN  False  bar


In [68]:
del df['two']
df.pop('three')
print(df)

   one   ba  bar   flag  foo
a  1.0  1.0  1.0  False  bar
b  2.0  2.0  2.0  False  bar
c  3.0  3.0  3.0   True  bar
d  NaN  NaN  NaN  False  bar


In [69]:
ps = df.loc['b']  #索引
print(ps)

one       2.0
ba        2.0
bar       2.0
flag    False
foo       bar
Name: b, dtype: object


In [70]:
ps = df.iloc[2]   #位置
print(df)

   one   ba  bar   flag  foo
a  1.0  1.0  1.0  False  bar
b  2.0  2.0  2.0  False  bar
c  3.0  3.0  3.0   True  bar
d  NaN  NaN  NaN  False  bar


In [71]:
from datetime import datetime
dti = pd.to_datetime([datetime(2015,7,3),'4th of July,2015','2015-Jul-6','07-07-2015','20150708'])
print(dti)

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)


In [73]:
#创建时间序列
idx = pd.date_range('2018-01-01',periods=5,freq='H')
print(idx)

DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00',
               '2018-01-01 02:00:00', '2018-01-01 03:00:00',
               '2018-01-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')


In [74]:
ts = pd.Series(range(len(idx)),index=idx)
print(ts)

2018-01-01 00:00:00    0
2018-01-01 01:00:00    1
2018-01-01 02:00:00    2
2018-01-01 03:00:00    3
2018-01-01 04:00:00    4
Freq: H, dtype: int64


In [79]:
dateIndex = pd.date_range(start='6/1/2013',periods=20,freq='D')
dateSale = np.random.randint(12,19,20)
tsSale = pd.Series(dateSale,index=dateIndex)
print(tsSale)

2013-06-01    15
2013-06-02    13
2013-06-03    16
2013-06-04    17
2013-06-05    16
2013-06-06    13
2013-06-07    17
2013-06-08    14
2013-06-09    15
2013-06-10    16
2013-06-11    18
2013-06-12    14
2013-06-13    17
2013-06-14    15
2013-06-15    12
2013-06-16    14
2013-06-17    13
2013-06-18    12
2013-06-19    12
2013-06-20    14
Freq: D, dtype: int32


In [82]:
valCount = tsSale.value_counts(ascending=True) #正序
print(valCount)

18    1
15    3
13    3
16    3
17    3
12    3
14    4
dtype: int64


In [83]:
stat = tsSale.describe()
print(stat)

count    20.000000
mean     14.650000
std       1.843195
min      12.000000
25%      13.000000
50%      14.500000
75%      16.000000
max      18.000000
dtype: float64


In [84]:
sx = pd.Series(range(10,20))
print(sx)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64


In [85]:
sy = pd.Series([2,1,4,5,8,12,18,25,96,48])
print(sy)

0     2
1     1
2     4
3     5
4     8
5    12
6    18
7    25
8    96
9    48
dtype: int64


In [86]:
rpxy = sx.corr(sy)
print(rpxy)

0.7586402890911867


In [87]:
rpyx = sy.corr(sx)
print(rpyx)

0.7586402890911869


In [88]:
rsxy = sx.corr(sy,method='spearman')
print(rsxy)

0.9757575757575757


In [89]:
rkxy = sx.corr(sy,method='kendall')
print(rkxy)

0.911111111111111


In [91]:
rxy = df.corr(method='pearson')
print(rxy)

           one        ba       bar      flag
one   1.000000  1.000000  1.000000  0.866025
ba    1.000000  1.000000  1.000000  0.866025
bar   1.000000  1.000000  1.000000  0.866025
flag  0.866025  0.866025  0.866025  1.000000


  rxy = df.corr(method='pearson')


In [92]:
df = pd.DataFrame([('bird','Falconiformes',389.0),
                  ('bird','Psittsciformes',24.0),
                  ('mammal','Carnivora',80.2),
                  ('mammal','Peimates',np.nan),
                  ('mammal','Carnovara',58.0)],
                 index=['falcon','parrot','lion','monky','leopard'],
                 columns=['class','order','max_speed'])
print(df)

          class           order  max_speed
falcon     bird   Falconiformes      389.0
parrot     bird  Psittsciformes       24.0
lion     mammal       Carnivora       80.2
monky    mammal        Peimates        NaN
leopard  mammal       Carnovara       58.0


In [93]:
grouped = df.groupby('class')
print(grouped.sum())

        max_speed
class            
bird        413.0
mammal      138.2


  print(grouped.sum())


In [96]:
grouped = df.groupby(['class','order'])
print(grouped.sum())

                       max_speed
class  order                    
bird   Falconiformes       389.0
       Psittsciformes       24.0
mammal Carnivora            80.2
       Carnovara            58.0
       Peimates              0.0


In [97]:
arrays = [['falcon','falcon','parrot','parrot'],
         ['captive','wild','captive','wild']]
index = pd.MultiIndex.from_arrays(arrays,names=('animal','type'))
df = pd.DataFrame({'max_speed':[390.,350.,30.,20.]},index=index)
print(df)

                max_speed
animal type              
falcon captive      390.0
       wild         350.0
parrot captive       30.0
       wild          20.0


In [98]:
grouped = df.groupby(level=0).mean()
print(grouped)

        max_speed
animal           
falcon      370.0
parrot       25.0


In [100]:
grouped = df.groupby(level='type').mean()
print(grouped)

         max_speed
type              
captive      210.0
wild         185.0
