# 10 Minutes to pandas

This is a short introduction to pandas, geared mainly for new users. You can see more complex recipes in the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/cookbook.html#cookbook).


## Object Creation

In [131]:
# Customarily, we import as follows:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [132]:
# Creating a Series by passing a list of values, 
# letting pandas create a default integer index:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [133]:
# Creating a datetime index
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [134]:
# Creating a DataFrame by passing a numpy array, 
# with a datetime index and labeled columns:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.741997,-1.27476,1.193085,-0.767934
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-03,-2.003783,-1.286573,0.468985,0.58772
2013-01-04,0.931235,0.597973,-0.070079,0.513918
2013-01-05,-0.35121,-0.475518,0.561652,0.726096
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774


In [135]:
# Creating a DataFrame by passing a dict of objects 
# that can be converted to series-like.
df2 = pd.DataFrame({'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D' : np.array([3] * 4, dtype='int32'),
                    'E' : pd.Categorical(['test', 'train', 'test', 'train']),
                    'F' : 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [136]:
# Having specific dtypes  
# 查看不同列的详细数据类型
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data 

In [137]:
# See the top rows of the frame  
# 查看结构顶部的行
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.741997,-1.27476,1.193085,-0.767934
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-03,-2.003783,-1.286573,0.468985,0.58772
2013-01-04,0.931235,0.597973,-0.070079,0.513918
2013-01-05,-0.35121,-0.475518,0.561652,0.726096


In [138]:
# See the bottom rows of the frame  
# 查看结构底部的行
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-03,-2.003783,-1.286573,0.468985,0.58772
2013-01-04,0.931235,0.597973,-0.070079,0.513918
2013-01-05,-0.35121,-0.475518,0.561652,0.726096
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774


In [139]:
# Display the index 
# 显示索引
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [140]:
# Display the columns
# 显示列
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [141]:
# Display the underlying numpy data 
# 显示底层的数据
df.values

array([[ 0.74199717, -1.27475957,  1.19308515, -0.76793431],
       [ 0.4895686 , -0.12214325,  0.52169685,  0.54976301],
       [-2.00378344, -1.28657337,  0.4689854 ,  0.58772022],
       [ 0.93123468,  0.59797323, -0.07007888,  0.51391779],
       [-0.35121   , -0.47551761,  0.56165242,  0.72609636],
       [-0.98154565,  0.24935735, -0.01917555, -0.0377399 ]])

In [142]:
# Describe shows a quick statistic summary of your data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.195623,-0.385277,0.442694,0.261971
std,1.142782,0.781081,0.460528,0.568921
min,-2.003783,-1.286573,-0.070079,-0.767934
25%,-0.823962,-1.074949,0.102865,0.100175
50%,0.069179,-0.29883,0.495341,0.53184
75%,0.67889,0.156482,0.551664,0.578231
max,0.931235,0.597973,1.193085,0.726096


In [143]:
# Transposing your data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.741997,0.489569,-2.003783,0.931235,-0.35121,-0.981546
B,-1.27476,-0.122143,-1.286573,0.597973,-0.475518,0.249357
C,1.193085,0.521697,0.468985,-0.070079,0.561652,-0.019176
D,-0.767934,0.549763,0.58772,0.513918,0.726096,-0.03774


In [144]:
# sort by an axis
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.767934,1.193085,-1.27476,0.741997
2013-01-02,0.549763,0.521697,-0.122143,0.489569
2013-01-03,0.58772,0.468985,-1.286573,-2.003783
2013-01-04,0.513918,-0.070079,0.597973,0.931235
2013-01-05,0.726096,0.561652,-0.475518,-0.35121
2013-01-06,-0.03774,-0.019176,0.249357,-0.981546


In [145]:
# sorting by values
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,-2.003783,-1.286573,0.468985,0.58772
2013-01-01,0.741997,-1.27476,1.193085,-0.767934
2013-01-05,-0.35121,-0.475518,0.561652,0.726096
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774
2013-01-04,0.931235,0.597973,-0.070079,0.513918


## Selection 

Note: While standard Python/Numpy expressions for selection and setting are intuitive and come in handy for interactive work, for production code we recommend the optimized pandas data access methods: .at, .iat, .loc, .iloc and .ix (note that .ix was deprecated in pandas 0.20 and removed in pandas 1.0; prefer .loc and .iloc).

虽然标准的Python/Numpy的选择和设置表达式都能够直接派上用场，但是作为工程使用的代码，我们推荐使用经过优化的pandas数据访问方式

###  Getting

In [146]:
# Selecting a single column, which yields a Series, equivalent to df.A
# 选择一个单独的列,将会返回一个Series,等同于df.A
df['A']

2013-01-01    0.741997
2013-01-02    0.489569
2013-01-03   -2.003783
2013-01-04    0.931235
2013-01-05   -0.351210
2013-01-06   -0.981546
Freq: D, Name: A, dtype: float64

In [147]:
# Selecting via [], which slices the rows.
# 通过[]进行选择,这将会对行进行切片
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.741997,-1.27476,1.193085,-0.767934
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-03,-2.003783,-1.286573,0.468985,0.58772


In [148]:
# You can also slice the rows by index
# 也可以通过[]按索引对行进行切片
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-03,-2.003783,-1.286573,0.468985,0.58772
2013-01-04,0.931235,0.597973,-0.070079,0.513918


### Selection by Label

In [149]:
# For getting a cross section using a label  
# 使用标签来获取一个交叉的区域
df.loc[dates[0]]

A    0.741997
B   -1.274760
C    1.193085
D   -0.767934
Name: 2013-01-01 00:00:00, dtype: float64

In [150]:
# Selecting on a multi-axis by label  
# 通过标签来在多个轴上进行选择
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.741997,-1.27476
2013-01-02,0.489569,-0.122143
2013-01-03,-2.003783,-1.286573
2013-01-04,0.931235,0.597973
2013-01-05,-0.35121,-0.475518
2013-01-06,-0.981546,0.249357


In [151]:
# Showing label slicing, both endpoints are included  
# 显示标签切片, 包括两个端点
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.489569,-0.122143
2013-01-03,-2.003783,-1.286573
2013-01-04,0.931235,0.597973


In [152]:
# Reduction in the dimensions of the returned object
# 对返回的对象进行维度缩减
df.loc['20130102', ['A', 'B']]

A    0.489569
B   -0.122143
Name: 2013-01-02 00:00:00, dtype: float64

In [153]:
# For getting a scalar value
# 获取一个标量值
df.loc[dates[0], 'A']

0.74199717277805721

In [154]:
# For getting fast access to a scalar (equivalent to the prior method)
# 快速获取一个标量值(与上一个方法等价)
df.at[dates[0], 'A']

0.74199717277805721

###  Selection by Position

In [155]:
# Select via the position of the passed integers
# 通过整数进行位置选择（选择的是行）
df.iloc[3]

A    0.931235
B    0.597973
C   -0.070079
D    0.513918
Name: 2013-01-04 00:00:00, dtype: float64

In [156]:
# By integer slices, acting similar to numpy/python
# 通过数值进行切片，与numpy/python中的情况类似
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.931235,0.597973
2013-01-05,-0.35121,-0.475518


In [157]:
# For slicing rows explicitly
# 对行进行切片
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-03,-2.003783,-1.286573,0.468985,0.58772


In [158]:
# For slicing columns explicitly
# 对列进行切片
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-1.27476,1.193085
2013-01-02,-0.122143,0.521697
2013-01-03,-1.286573,0.468985
2013-01-04,0.597973,-0.070079
2013-01-05,-0.475518,0.561652
2013-01-06,0.249357,-0.019176


In [159]:
# For getting a value explicitly
# 获取一个特定的值
df.iloc[1, 1]

-0.12214325451411531

In [160]:
# For getting fast access to a scalar (equivalent to the prior method)
# 快速访问获取一个特定的值(相当于前一个方法)
df.iat[1, 1]

-0.12214325451411531

### Boolean Indexing

In [161]:
# Using a single column's values to select data.
# 使用一个单独列的值来选择数据
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.741997,-1.27476,1.193085,-0.767934
2013-01-02,0.489569,-0.122143,0.521697,0.549763
2013-01-04,0.931235,0.597973,-0.070079,0.513918


In [162]:
# Selecting values from a DataFrame where a boolean condition is met.
# 从DataFrame中选择满足布尔条件的值.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.741997,,1.193085,
2013-01-02,0.489569,,0.521697,0.549763
2013-01-03,,,0.468985,0.58772
2013-01-04,0.931235,0.597973,,0.513918
2013-01-05,,,0.561652,0.726096
2013-01-06,,0.249357,,


In [163]:
# Using the isin() method for filtering
# 使用isin()方法过滤
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.741997,-1.27476,1.193085,-0.767934,one
2013-01-02,0.489569,-0.122143,0.521697,0.549763,one
2013-01-03,-2.003783,-1.286573,0.468985,0.58772,two
2013-01-04,0.931235,0.597973,-0.070079,0.513918,three
2013-01-05,-0.35121,-0.475518,0.561652,0.726096,four
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774,three


In [164]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-2.003783,-1.286573,0.468985,0.58772,two
2013-01-05,-0.35121,-0.475518,0.561652,0.726096,four


###  Setting

In [165]:
# Setting a new column automatically aligns the data by the indexes
# 设置一个新的列,按索引自动对齐
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
s1

2013-01-01    1
2013-01-02    2
2013-01-03    3
2013-01-04    4
2013-01-05    5
2013-01-06    6
Freq: D, dtype: int64

In [166]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.741997,-1.27476,1.193085,-0.767934,1
2013-01-02,0.489569,-0.122143,0.521697,0.549763,2
2013-01-03,-2.003783,-1.286573,0.468985,0.58772,3
2013-01-04,0.931235,0.597973,-0.070079,0.513918,4
2013-01-05,-0.35121,-0.475518,0.561652,0.726096,5
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774,6


In [167]:
# Setting values by label
# 按照标签设置值
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-1.27476,1.193085,-0.767934,1
2013-01-02,0.489569,-0.122143,0.521697,0.549763,2
2013-01-03,-2.003783,-1.286573,0.468985,0.58772,3
2013-01-04,0.931235,0.597973,-0.070079,0.513918,4
2013-01-05,-0.35121,-0.475518,0.561652,0.726096,5
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774,6


In [168]:
# Setting values by position
# 按照位置设置值
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.193085,-0.767934,1
2013-01-02,0.489569,-0.122143,0.521697,0.549763,2
2013-01-03,-2.003783,-1.286573,0.468985,0.58772,3
2013-01-04,0.931235,0.597973,-0.070079,0.513918,4
2013-01-05,-0.35121,-0.475518,0.561652,0.726096,5
2013-01-06,-0.981546,0.249357,-0.019176,-0.03774,6


In [169]:
# Setting by assigning with a numpy array
# 通过numpy数组设定新值
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.193085,5,1
2013-01-02,0.489569,-0.122143,0.521697,5,2
2013-01-03,-2.003783,-1.286573,0.468985,5,3
2013-01-04,0.931235,0.597973,-0.070079,5,4
2013-01-05,-0.35121,-0.475518,0.561652,5,5
2013-01-06,-0.981546,0.249357,-0.019176,5,6


In [170]:
# A where operation with setting
# 通过where操作来设置新的值
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.193085,-5,-1
2013-01-02,-0.489569,-0.122143,-0.521697,-5,-2
2013-01-03,-2.003783,-1.286573,-0.468985,-5,-3
2013-01-04,-0.931235,-0.597973,-0.070079,-5,-4
2013-01-05,-0.35121,-0.475518,-0.561652,-5,-5
2013-01-06,-0.981546,-0.249357,-0.019176,-5,-6


## Missing Data

pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations.

pandas主要使用np.nan表示缺失数据.它默认不会参与到计算中.

In [171]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.193085,5,1,1.0
2013-01-02,0.489569,-0.122143,0.521697,5,2,1.0
2013-01-03,-2.003783,-1.286573,0.468985,5,3,
2013-01-04,0.931235,0.597973,-0.070079,5,4,


In [172]:
# To drop any rows that have missing data
# 删除任何含有丢失数据的行
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.193085,5,1,1.0
2013-01-02,0.489569,-0.122143,0.521697,5,2,1.0


In [173]:
# Filling missing data
# 填充缺失数据
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.193085,5,1,1.0
2013-01-02,0.489569,-0.122143,0.521697,5,2,1.0
2013-01-03,-2.003783,-1.286573,0.468985,5,3,5.0
2013-01-04,0.931235,0.597973,-0.070079,5,4,5.0


In [174]:
# To get the boolean mask where values are nan
# 获取标记值是否为nan的布尔掩码
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## Operations

Operations in general exclude missing data.  
一般情况下,统计数据不包括缺失数据

### Stats

In [175]:
# Performing a descriptive statistic
# 执行描述性统计
df.mean()

A   -0.319289
B   -0.172817
C    0.442694
D    5.000000
F    3.500000
dtype: float64

In [176]:
# same operation on the other axis
# 在其他轴上进行相同的操作
df.mean(1)

2013-01-01    1.438617
2013-01-02    1.577824
2013-01-03    1.035726
2013-01-04    2.091826
2013-01-05    1.946985
2013-01-06    2.049727
Freq: D, dtype: float64

In [177]:
# Operating with objects that have different dimensionality and need alignment. 
# In addition, pandas automatically broadcasts along the specified dimension.
# 对于拥有不同维度，需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行广播：
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [178]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-3.003783,-2.286573,-0.531015,4.0,2.0
2013-01-04,-2.068765,-2.402027,-3.070079,2.0,1.0
2013-01-05,-5.35121,-5.475518,-4.438348,0.0,0.0
2013-01-06,,,,,


### Apply

In [179]:
# Applying functions to the data
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.193085,5,1
2013-01-02,0.489569,-0.122143,1.714782,10,3
2013-01-03,-1.514215,-1.408717,2.183767,15,6
2013-01-04,-0.58298,-0.810743,2.113689,20,10
2013-01-05,-0.93419,-1.286261,2.675341,25,15
2013-01-06,-1.915736,-1.036904,2.656165,30,21


### Histogramming

In [180]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    1
1    3
2    5
3    0
4    6
5    1
6    5
7    6
8    1
9    6
dtype: int32

In [181]:
s.value_counts()

6    3
1    3
5    2
3    1
0    1
dtype: int64

### String Methods

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. 

Series对象在其str属性中配备了一组字符串处理方法，可以很容易的应用到数组中的每个元素。

In [182]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge

pandas provides various facilities for easily combining together Series, DataFrame, and Panel objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.

Pandas提供了大量的方法能够轻松的对Series，DataFrame和Panel对象进行各种符合各种逻辑关系的合并操作。

In [183]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.541681,-2.102198,-1.69883,0.665019
1,0.290848,-0.602589,-1.046177,1.126889
2,0.713463,0.029484,-1.454709,0.742726
3,0.831769,-0.018202,-0.681315,-0.77731
4,2.132826,-0.6485,-0.192385,-1.13593
5,-0.551579,-0.365502,-1.000481,-0.585818
6,-0.733541,-1.105228,-0.062693,0.09754
7,1.427908,-0.515324,1.501006,0.820532
8,-0.969312,0.333002,-0.002874,-0.412955
9,-0.184261,0.144791,-1.243331,-0.944108


In [184]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0 -0.541681 -2.102198 -1.698830  0.665019
 1  0.290848 -0.602589 -1.046177  1.126889
 2  0.713463  0.029484 -1.454709  0.742726,
           0         1         2         3
 3  0.831769 -0.018202 -0.681315 -0.777310
 4  2.132826 -0.648500 -0.192385 -1.135930
 5 -0.551579 -0.365502 -1.000481 -0.585818
 6 -0.733541 -1.105228 -0.062693  0.097540,
           0         1         2         3
 7  1.427908 -0.515324  1.501006  0.820532
 8 -0.969312  0.333002 -0.002874 -0.412955
 9 -0.184261  0.144791 -1.243331 -0.944108]

In [185]:
# Concatenating pandas objects together with concat()
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.541681,-2.102198,-1.69883,0.665019
1,0.290848,-0.602589,-1.046177,1.126889
2,0.713463,0.029484,-1.454709,0.742726
3,0.831769,-0.018202,-0.681315,-0.77731
4,2.132826,-0.6485,-0.192385,-1.13593
5,-0.551579,-0.365502,-1.000481,-0.585818
6,-0.733541,-1.105228,-0.062693,0.09754
7,1.427908,-0.515324,1.501006,0.820532
8,-0.969312,0.333002,-0.002874,-0.412955
9,-0.184261,0.144791,-1.243331,-0.944108


### Join

In [186]:
# SQL style merges.
# Join 类似于SQL类型的合并
left = pd.DataFrame({'key' : ['foo', 'foo'], 'lval' : [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [187]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval' : [4, 5]})
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [188]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [189]:
# Another example that can be given is
left = pd.DataFrame({'key' : ['foo', 'bar'], 'lval' : [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [190]:
right = pd.DataFrame({'key' : ['foo', 'bar'], 'rval' : [4, 5]})
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [191]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


### Append

In [192]:
# Append rows to a dataframe.
# 将一行连接到一个DataFrame上
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-1.119657,-0.989774,0.211051,-0.084762
1,1.019798,-1.280051,0.341084,0.730505
2,1.187597,0.117826,1.713219,-0.779308
3,1.653507,0.542068,0.52705,0.279163
4,0.846026,1.099225,-2.603981,-0.344621
5,-0.303694,-0.415198,0.3198,0.63023
6,0.679226,-0.386371,0.769352,0.36451
7,-1.110075,-0.012175,0.360929,-0.057613


In [193]:
s = df.iloc[3]
s

A    1.653507
B    0.542068
C    0.527050
D    0.279163
Name: 3, dtype: float64

In [194]:
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,-1.119657,-0.989774,0.211051,-0.084762
1,1.019798,-1.280051,0.341084,0.730505
2,1.187597,0.117826,1.713219,-0.779308
3,1.653507,0.542068,0.52705,0.279163
4,0.846026,1.099225,-2.603981,-0.344621
5,-0.303694,-0.415198,0.3198,0.63023
6,0.679226,-0.386371,0.769352,0.36451
7,-1.110075,-0.012175,0.360929,-0.057613
8,1.653507,0.542068,0.52705,0.279163


## Grouping

By “group by” we are referring to a process involving one or more of the following steps:

- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

对于”group by”操作，我们通常是指以下一个或多个操作步骤：

- （Splitting）按照一些规则将数据分为不同的组；
- （Applying）对于每组数据分别执行一个函数；
- （Combining）将结果组合到一个数据结构中；

In [195]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-2.703256,-0.470528
1,bar,one,0.2224,1.278075
2,foo,two,0.836924,0.791173
3,bar,three,0.946781,0.688094
4,foo,two,0.439966,0.881052
5,bar,two,0.633502,-0.081956
6,foo,one,0.947666,-1.894756
7,foo,three,-0.062891,-0.847751


In [196]:
# Grouping and then applying a function sum to the resulting groups.
# 分组并对每个分组执行sum函数：
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.802682,1.884214
foo,-0.541592,-1.54081


In [197]:
# Grouping by multiple columns forms a hierarchical index, to which we then apply the function.
# 通过多个列进行分组形成一个层次索引，然后执行函数：
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.2224,1.278075
bar,three,0.946781,0.688094
bar,two,0.633502,-0.081956
foo,one,-1.755591,-2.365284
foo,three,-0.062891,-0.847751
foo,two,1.27689,1.672225


## Reshaping

### Stack

In [198]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                      'foo', 'foo', 'qux', 'qux'],
                     ['one', 'two', 'one', 'two',
                      'one', 'two', 'one', 'two']]))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [199]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.031252,1.290668
bar,two,-1.136786,0.392706
baz,one,1.427254,-0.139385
baz,two,1.236617,0.220561


In [200]:
# The stack() method “compresses” a level in the DataFrame’s columns.
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.031252
               B    1.290668
       two     A   -1.136786
               B    0.392706
baz    one     A    1.427254
               B   -0.139385
       two     A    1.236617
               B    0.220561
dtype: float64

In [201]:
# With a “stacked” DataFrame or Series (having a MultiIndex as the index), 
# the inverse operation of stack() is unstack(), which by default unstacks the last level:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.031252,1.290668
bar,two,-1.136786,0.392706
baz,one,1.427254,-0.139385
baz,two,1.236617,0.220561


In [202]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.031252,-1.136786
bar,B,1.290668,0.392706
baz,A,1.427254,1.236617
baz,B,-0.139385,0.220561


In [203]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.031252,1.427254
one,B,1.290668,-0.139385
two,A,-1.136786,1.236617
two,B,0.392706,0.220561


### Pivot Tables

In [204]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.830916,-0.824909
1,one,B,foo,1.68496,-1.420044
2,two,C,foo,-1.036665,-0.164879
3,three,A,bar,0.38416,-1.054139
4,one,B,bar,0.951341,-1.462585
5,one,C,bar,-1.259763,0.130984
6,two,A,foo,0.316663,0.876511
7,three,B,foo,0.037682,0.762675
8,one,C,foo,0.577732,0.506467
9,one,A,bar,-2.681893,0.473418


In [205]:
# We can produce pivot tables from this data very easily:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-2.681893,-0.830916
one,B,0.951341,1.68496
one,C,-1.259763,0.577732
three,A,0.38416,
three,B,,0.037682
three,C,1.48711,
two,A,,0.316663
two,B,0.099608,
two,C,,-1.036665
