# 10 Minutes to pandas

This is a short introduction to pandas, geared mainly toward new users. You can see more complex recipes in the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/cookbook.html#cookbook).


## Object Creation

In [2]:
# Customarily, we import as follows:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Creating a Series by passing a list of values, 
# letting pandas create a default integer index:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [33]:
# Creating a datetime index
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [34]:
# Creating a DataFrame by passing a numpy array, 
# with a datetime index and labeled columns:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.403382,0.002752,-0.34964,1.132727
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582
2013-01-04,0.744772,-2.213077,-0.040606,0.810121
2013-01-05,0.701906,-0.672111,0.899968,0.814862
2013-01-06,-0.565506,-0.382663,0.155713,-2.148247


In [6]:
# Build a DataFrame from a dict: each value must be convertible to
# something Series-like, and scalar values are repeated for every row.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [35]:
# Having specific dtypes  
# 查看不同列的详细数据类型
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data 

In [8]:
# See the top rows of the frame  
# 查看结构顶部的行
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.127322,1.408264,-0.387315,1.240476
2013-01-02,0.500769,0.105346,-0.45098,0.228161
2013-01-03,-0.040311,-1.397404,-1.289349,-0.805045
2013-01-04,-0.885135,-0.59778,-1.282561,-0.704639
2013-01-05,-0.965352,1.634706,-0.923429,-0.355444


In [9]:
# See the bottom rows of the frame  
# 查看结构底部的行
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,0.500769,0.105346,-0.45098,0.228161
2013-01-03,-0.040311,-1.397404,-1.289349,-0.805045
2013-01-04,-0.885135,-0.59778,-1.282561,-0.704639
2013-01-05,-0.965352,1.634706,-0.923429,-0.355444
2013-01-06,0.340763,-0.171868,-0.336571,-1.045246


In [10]:
# Display the index 
# 显示索引
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Display the columns
# 显示列
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# Display the underlying numpy data 
# 显示底层的数据
df.values

array([[ 0.12732173,  1.40826379, -0.3873153 ,  1.24047638],
       [ 0.50076883,  0.105346  , -0.45097958,  0.22816145],
       [-0.04031148, -1.39740371, -1.28934887, -0.80504495],
       [-0.88513532, -0.59777971, -1.28256097, -0.70463881],
       [-0.96535225,  1.63470603, -0.9234287 , -0.35544387],
       [ 0.34076348, -0.17186843, -0.33657082, -1.04524616]])

In [13]:
# describe() shows a quick statistical summary of your data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.153658,0.163544,-0.778367,-0.240289
std,0.625848,1.170103,0.445341,0.849458
min,-0.965352,-1.397404,-1.289349,-1.045246
25%,-0.673929,-0.491302,-1.192778,-0.779943
50%,0.043505,-0.033261,-0.687204,-0.530041
75%,0.287403,1.082534,-0.403231,0.08226
max,0.500769,1.634706,-0.336571,1.240476


In [14]:
# Transposing your data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.127322,0.500769,-0.040311,-0.885135,-0.965352,0.340763
B,1.408264,0.105346,-1.397404,-0.59778,1.634706,-0.171868
C,-0.387315,-0.45098,-1.289349,-1.282561,-0.923429,-0.336571
D,1.240476,0.228161,-0.805045,-0.704639,-0.355444,-1.045246


In [15]:
# Sorting by an axis
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.240476,-0.387315,1.408264,0.127322
2013-01-02,0.228161,-0.45098,0.105346,0.500769
2013-01-03,-0.805045,-1.289349,-1.397404,-0.040311
2013-01-04,-0.704639,-1.282561,-0.59778,-0.885135
2013-01-05,-0.355444,-0.923429,1.634706,-0.965352
2013-01-06,-1.045246,-0.336571,-0.171868,0.340763


In [16]:
# sorting by values
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,-0.040311,-1.397404,-1.289349,-0.805045
2013-01-04,-0.885135,-0.59778,-1.282561,-0.704639
2013-01-06,0.340763,-0.171868,-0.336571,-1.045246
2013-01-02,0.500769,0.105346,-0.45098,0.228161
2013-01-01,0.127322,1.408264,-0.387315,1.240476
2013-01-05,-0.965352,1.634706,-0.923429,-0.355444


## Selection 

Note: While standard Python/NumPy expressions for selection and setting are intuitive and come in handy for interactive work, for production code we recommend the optimized pandas data access methods: .at, .iat, .loc and .iloc. (The older .ix indexer was deprecated in pandas 0.20 and removed in pandas 1.0, so it should no longer be used.)

虽然标准的Python/Numpy的选择和设置表达式都能够直接派上用场，但是作为工程使用的代码，我们推荐使用经过优化的pandas数据访问方式

###  Getting

In [17]:
# Selecting a single column, which yields a Series, equivalent to df.A
# 选择一个单独的列,将会返回一个Series,等同于df.A
df['A']

2013-01-01    0.127322
2013-01-02    0.500769
2013-01-03   -0.040311
2013-01-04   -0.885135
2013-01-05   -0.965352
2013-01-06    0.340763
Freq: D, Name: A, dtype: float64

In [18]:
# Selecting via [], which slices the rows.
# 通过[]进行选择,这将会对行进行切片
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.127322,1.408264,-0.387315,1.240476
2013-01-02,0.500769,0.105346,-0.45098,0.228161
2013-01-03,-0.040311,-1.397404,-1.289349,-0.805045


In [45]:
# You can also slice the rows by index
# 也可以通过[]按索引对行进行切片
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582
2013-01-04,0.744772,-2.213077,-0.040606,0.810121


### Selection by Label

In [42]:
# For getting a cross section using a label  
# 使用标签来获取一个交叉的区域
df.loc[dates[0]]

A    1.403382
B    0.002752
C   -0.349640
D    1.132727
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
# Selecting on a multi-axis by label  
# 通过标签来在多个轴上进行选择
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.127322,1.408264
2013-01-02,0.500769,0.105346
2013-01-03,-0.040311,-1.397404
2013-01-04,-0.885135,-0.59778
2013-01-05,-0.965352,1.634706
2013-01-06,0.340763,-0.171868


In [47]:
# Showing label slicing, both endpoints are included  
# 显示标签切片, 包括两个端点
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,1.391821,-1.306427
2013-01-03,-0.060376,-0.078067
2013-01-04,0.744772,-2.213077


In [25]:
# Reduction in the dimensions of the returned object
# 对返回的对象进行维度缩减
df.loc['20130102', ['A', 'B']]

A    0.500769
B    0.105346
Name: 2013-01-02 00:00:00, dtype: float64

In [27]:
# For getting a scalar value
# 获取一个标量值
df.loc[dates[0], 'A']

0.12732173273040512

In [30]:
# For getting fast access to a scalar (equivalent to the prior method)
# 快速获取一个标量值(与上一个方法等价)
df.at[dates[0], 'A']

0.12732173273040512

###  Selection by Position

In [49]:
# Select via the position of the passed integers
# 通过整数进行位置选择（选择的是行）
df.iloc[3]

A    0.744772
B   -2.213077
C   -0.040606
D    0.810121
Name: 2013-01-04 00:00:00, dtype: float64

In [51]:
# By integer slices, acting similar to numpy/python
# 通过数值进行切片，与numpy/python中的情况类似
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.744772,-2.213077
2013-01-05,0.701906,-0.672111


In [52]:
# For slicing rows explicitly
# 对行进行切片
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582


In [53]:
# For slicing columns explicitly
# 对列进行切片
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.002752,-0.34964
2013-01-02,-1.306427,-1.161965
2013-01-03,-0.078067,0.239518
2013-01-04,-2.213077,-0.040606
2013-01-05,-0.672111,0.899968
2013-01-06,-0.382663,0.155713


In [54]:
# For getting a value explicitly
# 获取一个特定的值
df.iloc[1, 1]

-1.3064268596953375

In [56]:
# For getting fast access to a scalar (equivalent to the prior method)
# 快速访问获取一个特定的值(相当于前一个方法)
df.iat[1, 1]

-1.3064268596953375

### Boolean Indexing

In [58]:
# Using a single column's values to select data.
# 使用一个单独列的值来选择数据
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.403382,0.002752,-0.34964,1.132727
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212
2013-01-04,0.744772,-2.213077,-0.040606,0.810121
2013-01-05,0.701906,-0.672111,0.899968,0.814862


In [59]:
# Selecting values from a DataFrame where a boolean condition is met.
# 从一个Bool条件满足的情况下,从DataFrame中选择数据.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.403382,0.002752,,1.132727
2013-01-02,1.391821,,,
2013-01-03,,,0.239518,
2013-01-04,0.744772,,,0.810121
2013-01-05,0.701906,,0.899968,0.814862
2013-01-06,,,0.155713,


In [60]:
# Using the isin() method for filtering
# 使用isin()方法过滤
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.403382,0.002752,-0.34964,1.132727,one
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212,one
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582,two
2013-01-04,0.744772,-2.213077,-0.040606,0.810121,three
2013-01-05,0.701906,-0.672111,0.899968,0.814862,four
2013-01-06,-0.565506,-0.382663,0.155713,-2.148247,three


In [61]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582,two
2013-01-05,0.701906,-0.672111,0.899968,0.814862,four


###  Setting

In [62]:
# Setting a new column automatically aligns the data by the indexes
# 设置一个新的列,按索引自动对齐
# Six consecutive integers labelled with six daily dates from 2013-01-01.
s1 = pd.Series(
    range(1, 7),
    index=pd.date_range(start='20130101', periods=6),
)
s1

2013-01-01    1
2013-01-02    2
2013-01-03    3
2013-01-04    4
2013-01-05    5
2013-01-06    6
Freq: D, dtype: int64

In [72]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403382,0.002752,-0.34964,1.132727,1
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212,2
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582,3
2013-01-04,0.744772,-2.213077,-0.040606,0.810121,4
2013-01-05,0.701906,-0.672111,0.899968,0.814862,5
2013-01-06,-0.565506,-0.382663,0.155713,-2.148247,6


In [66]:
# Setting values by label
# 按照标签设置值
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.002752,-0.34964,1.132727,1
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212,2
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582,3
2013-01-04,0.744772,-2.213077,-0.040606,0.810121,4
2013-01-05,0.701906,-0.672111,0.899968,0.814862,5
2013-01-06,-0.565506,-0.382663,0.155713,-2.148247,6


In [73]:
# Setting values by position
# 按照位置设置值
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.34964,1.132727,1
2013-01-02,1.391821,-1.306427,-1.161965,-0.230212,2
2013-01-03,-0.060376,-0.078067,0.239518,-1.169582,3
2013-01-04,0.744772,-2.213077,-0.040606,0.810121,4
2013-01-05,0.701906,-0.672111,0.899968,0.814862,5
2013-01-06,-0.565506,-0.382663,0.155713,-2.148247,6


In [75]:
# Setting by assigning with a numpy array
# 通过numpy数组设定新值
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.34964,5,1
2013-01-02,1.391821,-1.306427,-1.161965,5,2
2013-01-03,-0.060376,-0.078067,0.239518,5,3
2013-01-04,0.744772,-2.213077,-0.040606,5,4
2013-01-05,0.701906,-0.672111,0.899968,5,5
2013-01-06,-0.565506,-0.382663,0.155713,5,6


In [84]:
# A where operation with setting
# 通过where操作来设置新的值
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.34964,-5,-1
2013-01-02,-1.391821,-1.306427,-1.161965,-5,-2
2013-01-03,-0.060376,-0.078067,-0.239518,-5,-3
2013-01-04,-0.744772,-2.213077,-0.040606,-5,-4
2013-01-05,-0.701906,-0.672111,-0.899968,-5,-5
2013-01-06,-0.565506,-0.382663,-0.155713,-5,-6


## Missing Data

pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations.

pandas首选np.nan表示缺失数据.它默认不会参与到计算中.