In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
pd.set_option('display.width', 200)

In [2]:
# 创建一个以日期为元素的Series
dates = pd.date_range('20170101', periods=5)
print(dates) 

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D')


In [3]:
# 将这个日期Series作为索引赋给一个DataFrame：
df = pd.DataFrame(np.random.randn(5, 4),index=dates,columns=list('ABCD'))
print(df) 

                   A         B         C         D
2017-01-01  0.299966  0.671892  0.933409  0.082688
2017-01-02 -1.324337 -0.757183  0.827981  0.460106
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-04 -0.069098 -0.420785  1.374985  1.053452
2017-01-05  0.908485 -0.970194 -1.229919 -0.943637


In [4]:
# 只要是能转换成Series的对象，都可以用于创建DataFrame：

df2 = pd.DataFrame({ 'A' : 1., 'B': pd.Timestamp('20170214'), 'C': pd.Series(1.6,index=list(range(4)),dtype='float64'), \
                    'D' : np.array([4] * 4, dtype='int64'), 'E' : 'hello pandas!' })
print(df2) 

     A          B    C  D              E
0  1.0 2017-02-14  1.6  4  hello pandas!
1  1.0 2017-02-14  1.6  4  hello pandas!
2  1.0 2017-02-14  1.6  4  hello pandas!
3  1.0 2017-02-14  1.6  4  hello pandas!


In [7]:
print(df.shape) # 数据的大小
# dataframe.head()和dataframe.tail()可以查看数据的头五行和尾五行，若需要改变行数，可在括号内指定
print(df.head()) 
print(df.tail(3)) 

(5, 4)
                   A         B         C         D
2017-01-01  0.299966  0.671892  0.933409  0.082688
2017-01-02 -1.324337 -0.757183  0.827981  0.460106
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-04 -0.069098 -0.420785  1.374985  1.053452
2017-01-05  0.908485 -0.970194 -1.229919 -0.943637
                   A         B         C         D
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-04 -0.069098 -0.420785  1.374985  1.053452
2017-01-05  0.908485 -0.970194 -1.229919 -0.943637


In [8]:
# dataframe.describe()提供了DataFrame中纯数值数据的统计信息：
print(df.describe()) 

              A         B         C         D
count  5.000000  5.000000  5.000000  5.000000
mean   0.063064 -0.265515  0.509299 -0.095237
std    0.852113  0.673187  1.009060  0.928382
min   -1.324337 -0.970194 -1.229919 -1.128794
25%   -0.069098 -0.757183  0.640041 -0.943637
50%    0.299966 -0.420785  0.827981  0.082688
75%    0.500304  0.148695  0.933409  0.460106
max    0.908485  0.671892  1.374985  1.053452


In [9]:
#按行列排序，即按照索引（行名）或者列名进行排序，可调用dataframe.sort_index，指定axis=0表示按索引（行名）排序，axis=1表示按列名排序，并可指定升序或者降序：
print(df.sort_index(axis=1, ascending=False).head()) 

                   D         C         B         A
2017-01-01  0.082688  0.933409  0.671892  0.299966
2017-01-02  0.460106  0.827981 -0.757183 -1.324337
2017-01-03 -1.128794  0.640041  0.148695  0.500304
2017-01-04  1.053452  1.374985 -0.420785 -0.069098
2017-01-05 -0.943637 -1.229919 -0.970194  0.908485


In [13]:
# 第二种排序是按值排序，可指定列名和排序方式，默认的是升序排序：
print(df.sort_values(by='A').head()) 
df = df.sort_values(by=['A', 'C'], ascending=[False, True])
print(df.head()) 

                   A         B         C         D
2017-01-02 -1.324337 -0.757183  0.827981  0.460106
2017-01-04 -0.069098 -0.420785  1.374985  1.053452
2017-01-01  0.299966  0.671892  0.933409  0.082688
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-05  0.908485 -0.970194 -1.229919 -0.943637
                   A         B         C         D
2017-01-05  0.908485 -0.970194 -1.229919 -0.943637
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-01  0.299966  0.671892  0.933409  0.082688
2017-01-04 -0.069098 -0.420785  1.374985  1.053452
2017-01-02 -1.324337 -0.757183  0.827981  0.460106


In [18]:
# :来获取部行或者全部列
print(df.iloc[:][1:3])

                   A         B         C         D
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-01  0.299966  0.671892  0.933409  0.082688


In [19]:
# 特定列大于均值的数据
print(df[df.A > df.A.mean()].head()) 

                   A         B         C         D
2017-01-05  0.908485 -0.970194 -1.229919 -0.943637
2017-01-03  0.500304  0.148695  0.640041 -1.128794
2017-01-01  0.299966  0.671892  0.933409  0.082688


In [22]:
# isin()函数可方便地过滤DataFrame中的数据：
print(df[df['D'].isin(['0.082688', '0.460106'])].head()) 

Empty DataFrame
Columns: [A, B, C, D]
Index: []


In [27]:
# 指定0按列进行，指定1按行进行
print(df.mean(0))
print(df.sum(0))
print(df.sum(1))

A    0.063064
B   -0.265515
C    0.509299
D   -0.095237
dtype: float64
A    0.315320
B   -1.327575
C    2.546497
D   -0.476185
dtype: float64
2017-01-05   -2.235265
2017-01-03    0.160246
2017-01-01    1.987956
2017-01-04    1.938554
2017-01-02   -0.793434
dtype: float64


In [28]:
# value_counts函数可以方便地统计频数：
print(df['C'].value_counts().head()) 

 0.640041    1
-1.229919    1
 1.374985    1
 0.827981    1
 0.933409    1
Name: C, dtype: int64


In [29]:
# Series可以调用map函数来对每个元素应用一个函数，DataFrame可以调用apply函数对每一列（行）应用一个函数
print(df[['B']].apply(lambda x: (x - x.min()) / (x.max() - x.min())).head()) 

                   B
2017-01-05  0.000000
2017-01-03  0.681383
2017-01-01  1.000000
2017-01-04  0.334580
2017-01-02  0.129720


In [30]:
# 数据分组处理
df_grp = df.groupby('D')
grp_mean = df_grp.mean()
print(grp_mean) 

                  A         B         C
D                                      
-1.128794  0.500304  0.148695  0.640041
-0.943637  0.908485 -0.970194 -1.229919
 0.082688  0.299966  0.671892  0.933409
 0.460106 -1.324337 -0.757183  0.827981
 1.053452 -0.069098 -0.420785  1.374985
