In [5]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## 一、数据结构

### Series
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

#### 定义
* 包含索引（Index）的一维数组（值序列）

#### 语法
```python
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
```
**参数：**
* **data: array-like, Iterable, dict, or scalar value**
* **index: array-like or Index (1d)**
* **dtype: str, numpy.dtype, or ExtensionDtype, optional**
* **name: Hashable, default None**
* **copy: bool, default False**

In [2]:
import pandas as pd

# Series built from a list of values plus an explicit label index;
# dtype=float stores the integers 1..5 as float64.
obj_1 = pd.Series(
    data=list(range(1, 6)),
    index=list('abcde'),
    dtype=float,
    name='obj_1',
)
obj_1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: obj_1, dtype: float64

In [41]:
# Series built from values only: no index is passed, so the labels are 0..9.
obj_2 = pd.Series(
    data=[n * n for n in range(10)],
    dtype=float,
    name='obj_2',
)
obj_2

0     0.0
1     1.0
2     4.0
3     9.0
4    16.0
5    25.0
6    36.0
7    49.0
8    64.0
9    81.0
Name: obj_2, dtype: float64

In [42]:
# Series built from a dict: the keys become the index labels.
obj_3 = pd.Series(
    data=dict(zip('abcde', range(1, 6))),
    dtype=float,
    name='obj_3',
)
obj_3

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: obj_3, dtype: float64

### DataFrame  
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html

#### 定义
* 矩阵形式的二维数据表，既有行索引（index），也有列索引（columns）

#### 语法
```python
pandas.DataFrame(data=None, index: Optional[Collection] = None, columns: Optional[Collection] = None, dtype: Optional[Union[str, numpy.dtype, ExtensionDtype]] = None, copy: bool = False)
```
* **data: ndarray (structured or homogeneous), Iterable, dict, or DataFrame**
* **index: Index or array-like**
* **columns: Index or array-like**
* **dtype: dtype, default None**
* **copy: bool, default False**

In [43]:
# DataFrame built from a dict of columns.
# `index=` supplies the row labels and `columns=` fixes the column order.
city_names = ['北京', '上海', '武汉', '深圳', '广州', '天津']
df_1 = pd.DataFrame({'城市': city_names,
                     # Population is kept as integers (not strings) so the column
                     # has a numeric dtype and can be summed or compared.
                     '人口': [1000, 1200, 1200, 1100, 1000, 1300],
                     '拼音名': ['beijing', 'shanghai', 'wuhan', 'shenzhen', 'guangzhou', 'tianjin']},
                    index=city_names,
                    columns=['人口', '城市', '拼音名'])
df_1

Unnamed: 0,人口,城市,拼音名
北京,1000,北京,beijing
上海,1200,上海,shanghai
武汉,1200,武汉,wuhan
深圳,1100,深圳,shenzhen
广州,1000,广州,guangzhou
天津,1300,天津,tianjin


## 二、索引

### Series

In [44]:
# Series of the squares 0..81, labelled 'a'..'j'.
obj_1 = pd.Series([i**2 for i in range(10)], index=list('abcdefghij'))
print(obj_1)
print('-'*20)

# A single label returns a scalar.
print(obj_1['b'])
print('-'*20)

# Several labels must be passed as a list (note the inner brackets).
print(obj_1[['a', 'b', 'j']])
print('-'*20)

# Label slice: both end points are included in the result.
print(obj_1['a':'e'])
print('-'*20)

# Positional access goes through .iloc. Using a bare integer such as obj_1[3]
# on a label-indexed Series is deprecated, because integer keys are meant to
# be treated as labels.
print(obj_1.iloc[3])
print('-'*20)

# Positional slice: the end position is excluded.
print(obj_1.iloc[:3])
print('-'*20)

# Several positions at once.
print(obj_1.iloc[[1, 3, 4]])
print('-'*20)

# Boolean mask: keep only the elements greater than 9.
print(obj_1[obj_1 > 9])

a     0
b     1
c     4
d     9
e    16
f    25
g    36
h    49
i    64
j    81
dtype: int64
--------------------
1
--------------------
a     0
b     1
j    81
dtype: int64
--------------------
a     0
b     1
c     4
d     9
e    16
dtype: int64
--------------------
9
--------------------
a    0
b    1
c    4
dtype: int64
--------------------
b     1
d     9
e    16
dtype: int64
--------------------
e    16
f    25
g    36
h    49
i    64
j    81
dtype: int64


### DataFrame

#### 一般方式

In [7]:
import numpy as np
df_2 = pd.DataFrame(np.arange(25).reshape(5, 5),
                   index=['index_1', 'index_2', 'index_3', 'index_4', 'index_5'],
                   columns=['col_1', 'col_2', 'col_3', 'col_4', 'col_5'])
print(df_2)
print('-'*20)

# Column selection: one label returns a Series, a list of labels a DataFrame.
print(df_2['col_1'])
print('-'*20)

print(df_2[['col_1', 'col_2']])
print('-'*20)


# Row selection: a slice inside [] is applied to the rows.
# A label slice includes both end points.
print(df_2['index_2':'index_4'])
print('-'*20)

# A positional slice excludes the end position.
print(df_2[:2])
print('-'*20)


# Boolean indexing: build a True/False mask, then keep the rows where it is True.
print(df_2['col_3'] >= 7)
print(df_2[df_2['col_3'] >= 7])

# Tip: conditional assignment.
df_1 = pd.DataFrame(np.arange(1, 10).reshape(3, 3),
             index=['a', 'b', 'c'],
             columns=['x', 'y', 'z'])

# Select the rows and the column in ONE .loc call. Chained forms such as
# df_1['y'][mask] = 999 or df_1[mask]['y'] = 999 assign into an intermediate
# object: they raise SettingWithCopyWarning and may leave df_1 unchanged.
df_1.loc[df_1['x'] == 4, 'y'] = 999

df_1

         col_1  col_2  col_3  col_4  col_5
index_1      0      1      2      3      4
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
index_5     20     21     22     23     24
--------------------
index_1     0
index_2     5
index_3    10
index_4    15
index_5    20
Name: col_1, dtype: int32
--------------------
         col_1  col_2
index_1      0      1
index_2      5      6
index_3     10     11
index_4     15     16
index_5     20     21
--------------------
         col_1  col_2  col_3  col_4  col_5
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
--------------------
         col_1  col_2  col_3  col_4  col_5
index_1      0      1      2      3      4
index_2      5      6      7      8      9
--------------------
index_1    False
index_2     True
index_3     True
index_4     True
index_5     True
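Name: col_3, dtype: bool
         col_1  col_2  col_3  col_4  col_5
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
index_5     20     21     22     23     24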

Unnamed: 0,x,y,z
a,1,2,3
b,4,999,6
c,7,8,9


#### df.loc[ ] 和 df.iloc[ ]
* df.loc[ ] 通过**标签名**索引  *--axis labels*
```python
df.loc[index_start:index_end, columns_start:columns_end]
```
* df.iloc[ ] 通过**整数**索引  *--integers*
```python
df.iloc[ind_start_num:ind_end_num, col_start_num:col_end_num]
```


In [46]:
# df.loc[rows, columns]: selection by label.
rule = '-' * 20
print(df_2, rule, sep='\n')

# Lists of labels pick exactly those rows and columns.
print(df_2.loc[['index_1', 'index_3'], ['col_1', 'col_3']], rule, sep='\n')

# Label slices include both end points.
print(df_2.loc['index_1':'index_2', 'col_2':'col_4'], rule, sep='\n')

# A single row label with a list of columns returns a Series.
print(df_2.loc['index_1', ['col_1', 'col_4']])

         col_1  col_2  col_3  col_4  col_5
index_1      0      1      2      3      4
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
index_5     20     21     22     23     24
--------------------
         col_1  col_3
index_1      0      2
index_3     10     12
--------------------
         col_2  col_3  col_4
index_1      1      2      3
index_2      6      7      8
--------------------
col_1    0
col_4    3
Name: index_1, dtype: int32


In [47]:
# df.iloc[rows, columns]: selection by integer position.
print(df_2)
print('-'*20)

# A single integer returns that row as a Series.
print(df_2.iloc[2])
print('-'*20)

# Positional slices exclude the end position.
print(df_2.iloc[:2, 1:3])
print('-'*20)

# Lists of positions pick exactly those rows and columns.
print(df_2.iloc[[1, 2], [3, 4]])
print('-'*20)

# Boolean filtering of a slice: build the mask from the slice itself so that
# its index matches. A mask taken from the full df_2 has more rows than the
# slice, and pandas has to reindex it (emitting a UserWarning).
top_left = df_2.iloc[:2, 1:3]
print(top_left[top_left.col_2 > 1])
print('-'*20)

         col_1  col_2  col_3  col_4  col_5
index_1      0      1      2      3      4
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
index_5     20     21     22     23     24
--------------------
col_1    10
col_2    11
col_3    12
col_4    13
col_5    14
Name: index_3, dtype: int32
--------------------
         col_2  col_3
index_1      1      2
index_2      6      7
--------------------
         col_4  col_5
index_2      8      9
index_3     13     14
--------------------
         col_2  col_3
index_2      6      7
--------------------


  


## 三、基本运算

### （一）Series 与 Series

In [48]:
# Two Series whose indexes only partly overlap (shared labels: c, d, e).
s1 = pd.Series(list(range(5)), index=list('abcde'))
s2 = pd.Series(list(range(5)), index=list('cdefj'))
rule = '-' * 50

print('s1', s1, rule, sep='\n')
print('s2', s2, rule, rule, sep='\n')


# ========= Addition =========
print('加法')

# Series + Series: the result covers the union of both indexes; labels found
# on only one side become NaN, shared labels are added.
print('Series + Series', s1 + s2, rule, sep='\n')

# Series.add(other) is equivalent to Series + other; fill_value stands in for
# a value that is missing on one side.
print('Series.add()', s1.add(s2, fill_value=3), rule, sep='\n')

# Series.radd(other) is equivalent to other + Series.
print('Series.radd()', s1.radd(s2, fill_value=3), rule, rule, sep='\n')


# ========= Subtraction =========
print('减法')

# Series - Series
print('Series - Series', s1 - s2, rule, sep='\n')

# Series.sub(other) is equivalent to Series - other.
print('Series.sub()', s1.sub(s2, fill_value=3), rule, sep='\n')

# Series.rsub(other) is equivalent to other - Series.
print('Series.rsub()', s1.rsub(s2, fill_value=3))

s1
a    0
b    1
c    2
d    3
e    4
dtype: int64
--------------------------------------------------
s2
c    0
d    1
e    2
f    3
j    4
dtype: int64
--------------------------------------------------
--------------------------------------------------
加法
Series + Series
a    NaN
b    NaN
c    2.0
d    4.0
e    6.0
f    NaN
j    NaN
dtype: float64
--------------------------------------------------
Series.add()
a    3.0
b    4.0
c    2.0
d    4.0
e    6.0
f    6.0
j    7.0
dtype: float64
--------------------------------------------------
Series.radd()
a    3.0
b    4.0
c    2.0
d    4.0
e    6.0
f    6.0
j    7.0
dtype: float64
--------------------------------------------------
--------------------------------------------------
减法
Series - Series
a    NaN
b    NaN
c    2.0
d    2.0
e    2.0
f    NaN
j    NaN
dtype: float64
--------------------------------------------------
Series.sub()
a   -3.0
b   -2.0
c    2.0
d    2.0
e    2.0
f    0.0
j   -1.0
dtype: float64
----------------------

In [49]:
rule = '-' * 50

print('s1', s1, rule, sep='\n')
print('s2', s2, rule, rule, sep='\n')


# ========= Multiplication =========
print('乘法')

# Series * Series
print('Series * Series', s1 * s2, rule, sep='\n')

# Series.mul(other) is equivalent to Series * other.
print('Series.mul()', s1.mul(s2, fill_value=2), rule, sep='\n')

# Series.rmul(other) is equivalent to other * Series.
print('Series.rmul()', s1.rmul(s2, fill_value=2), rule, rule, sep='\n')


# ========= Division =========
print('除法')

# Series / Series
print('Series / Series', s1 / s2, rule, sep='\n')

# Series.div(other) is equivalent to Series / other.
print('Series.div()', s1.div(s2, fill_value=2), rule, sep='\n')

# Series.rdiv(other) is equivalent to other / Series.
print('Series.rdiv()', s1.rdiv(s2, fill_value=2))

s1
a    0
b    1
c    2
d    3
e    4
dtype: int64
--------------------------------------------------
s2
c    0
d    1
e    2
f    3
j    4
dtype: int64
--------------------------------------------------
--------------------------------------------------
乘法
Series * Series
a    NaN
b    NaN
c    0.0
d    3.0
e    8.0
f    NaN
j    NaN
dtype: float64
--------------------------------------------------
Series.mul()
a    0.0
b    2.0
c    0.0
d    3.0
e    8.0
f    6.0
j    8.0
dtype: float64
--------------------------------------------------
Series.rmul()
a    0.0
b    2.0
c    0.0
d    3.0
e    8.0
f    6.0
j    8.0
dtype: float64
--------------------------------------------------
--------------------------------------------------
除法
Series / Series
a    NaN
b    NaN
c    inf
d    3.0
e    2.0
f    NaN
j    NaN
dtype: float64
--------------------------------------------------
Series.div()
a    0.000000
b    0.500000
c         inf
d    3.000000
e    2.000000
f    0.666667
j    0.500000
dt

### （二）DataFrame 与 DataFrame

In [50]:
# Two 5x5 frames whose row and column labels only partly overlap
# (shared rows: index_3..index_5, shared columns: col_3..col_5).
df_1 = pd.DataFrame(np.arange(25).reshape(5, 5),
                    index=list(map('index_{}'.format, range(1, 6))),
                    columns=list(map('col_{}'.format, range(1, 6))))

df_2 = pd.DataFrame(np.arange(25).reshape(5, 5),
                    index=list(map('index_{}'.format, range(3, 8))),
                    columns=list(map('col_{}'.format, range(3, 8))))

rule = '-' * 50
print(df_1, rule, df_2, rule, rule, sep='\n')


# ========= Addition =========
print('加法')

# df + df: the result covers the union of labels; cells that exist in only
# one frame become NaN.
print('df + df', df_1 + df_2, rule, sep='\n')

# df.add(other) is equivalent to df + other. fill_value replaces a value that
# is missing in one frame; a cell missing in BOTH frames stays NaN.
print('df.add()', df_1.add(df_2, fill_value=2), rule, sep='\n')

# df.radd(other) is equivalent to other + df.
print('df.radd()', df_1.radd(df_2, fill_value=2), rule, rule, sep='\n')


# ========= Subtraction =========
print('减法')

# df - df
print('df - df', df_1 - df_2, rule, sep='\n')

# df.sub(other) is equivalent to df - other.
print('df.sub()', df_1.sub(df_2, fill_value=2), rule, sep='\n')

# df.rsub(other) is equivalent to other - df.
print('df.rsub()', df_1.rsub(df_2, fill_value=2))

         col_1  col_2  col_3  col_4  col_5
index_1      0      1      2      3      4
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
index_5     20     21     22     23     24
--------------------------------------------------
         col_3  col_4  col_5  col_6  col_7
index_3      0      1      2      3      4
index_4      5      6      7      8      9
index_5     10     11     12     13     14
index_6     15     16     17     18     19
index_7     20     21     22     23     24
--------------------------------------------------
--------------------------------------------------
加法
df + df
         col_1  col_2  col_3  col_4  col_5  col_6  col_7
index_1    NaN    NaN    NaN    NaN    NaN    NaN    NaN
index_2    NaN    NaN    NaN    NaN    NaN    NaN    NaN
index_3    NaN    NaN   12.0   14.0   16.0    NaN    NaN
index_4    NaN    NaN   22.0   24.0   26.0    NaN    NaN
index_5    NaN    NaN   32.0   34.0

In [51]:
rule = '-' * 50

# ========= Multiplication =========
print('乘法')

# df * df
print('df * df', df_1 * df_2, rule, sep='\n')

# df.mul(other) is equivalent to df * other.
print('df.mul()', df_1.mul(df_2, fill_value=2), rule, sep='\n')

# df.rmul(other) is equivalent to other * df.
print('df.rmul()', df_1.rmul(df_2, fill_value=2), rule, rule, sep='\n')


# ========= Division =========
print('除法')

# df / df
print('df / df', df_1 / df_2, rule, sep='\n')

# df.div(other) is equivalent to df / other.
print('df.div()', df_1.div(df_2, fill_value=2), rule, sep='\n')

# df.rdiv(other) is equivalent to other / df.
print('df.rdiv()', df_1.rdiv(df_2, fill_value=2))

乘法
df * df
         col_1  col_2  col_3  col_4  col_5  col_6  col_7
index_1    NaN    NaN    NaN    NaN    NaN    NaN    NaN
index_2    NaN    NaN    NaN    NaN    NaN    NaN    NaN
index_3    NaN    NaN    0.0   13.0   28.0    NaN    NaN
index_4    NaN    NaN   85.0  108.0  133.0    NaN    NaN
index_5    NaN    NaN  220.0  253.0  288.0    NaN    NaN
index_6    NaN    NaN    NaN    NaN    NaN    NaN    NaN
index_7    NaN    NaN    NaN    NaN    NaN    NaN    NaN
--------------------------------------------------
df.mul()
         col_1  col_2  col_3  col_4  col_5  col_6  col_7
index_1    0.0    2.0    4.0    6.0    8.0    NaN    NaN
index_2   10.0   12.0   14.0   16.0   18.0    NaN    NaN
index_3   20.0   22.0    0.0   13.0   28.0    6.0    8.0
index_4   30.0   32.0   85.0  108.0  133.0   16.0   18.0
index_5   40.0   42.0  220.0  253.0  288.0   26.0   28.0
index_6    NaN    NaN   30.0   32.0   34.0   36.0   38.0
index_7    NaN    NaN   40.0   42.0   44.0   46.0   48.0
-----------------

### （三）DataFrame 与 Series

In [52]:
# A 5x5 frame and a Series labelled with four of the frame's column names.
df_1 = pd.DataFrame(np.arange(25).reshape(5, 5),
                    index=list(map('index_{}'.format, range(1, 6))),
                    columns=list(map('col_{}'.format, range(1, 6))))

s_1 = pd.Series(list(range(4)), index=list(map('col_{}'.format, range(1, 5))))

rule = '-' * 50
print(df_1, rule, s_1, rule, rule, sep='\n')


# ========= Addition =========
# df + s: the Series labels are matched against the column labels and the
# Series is added to every row; col_5 has no partner in s_1 and becomes NaN.
print(df_1 + s_1, rule, sep='\n')

# The same rule applies to a row slice of the frame.
print(df_1['index_1':'index_2'] + s_1, rule, sep='\n')

# df.add(s, axis='index' or 'columns') chooses which axis of the frame the
# Series labels are matched against.
# NOTE(review): s_1 carries column names, so with axis='index' no row label
# is expected to match -- confirm against the printed result.
print(df_1.add(s_1, axis='index'), rule, sep='\n')
print(df_1.add(s_1, axis='columns'))


# ========= The other operators work analogously =========

         col_1  col_2  col_3  col_4  col_5
index_1      0      1      2      3      4
index_2      5      6      7      8      9
index_3     10     11     12     13     14
index_4     15     16     17     18     19
index_5     20     21     22     23     24
--------------------------------------------------
col_1    0
col_2    1
col_3    2
col_4    3
dtype: int64
--------------------------------------------------
--------------------------------------------------
         col_1  col_2  col_3  col_4  col_5
index_1    0.0    2.0    4.0    6.0    NaN
index_2    5.0    7.0    9.0   11.0    NaN
index_3   10.0   12.0   14.0   16.0    NaN
index_4   15.0   17.0   19.0   21.0    NaN
index_5   20.0   22.0   24.0   26.0    NaN
--------------------------------------------------
         col_1  col_2  col_3  col_4  col_5
index_1    0.0    2.0    4.0    6.0    NaN
index_2    5.0    7.0    9.0   11.0    NaN
--------------------------------------------------
         col_1  col_2  col_3  col_4  col_5


### （四）更多运算可参考官方文档
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html

## 四、排序与排名

### （一）Series

In [53]:
s_1 = pd.Series([1, 4, 8, 32, 4], index=list('cdeab'))

# Small helpers for the printed headings.
banner = ('-' * 9 + '{}' + '-' * 9).format
section_break = '=' * 50

# Series.sort_index(): order by label, ascending by default.
print('Series.sort_index()', banner('before'), s_1, sep='\n')
print(banner('after_1'), s_1.sort_index(), sep='\n')
print(banner('after_2'), s_1.sort_index(ascending=False), sep='\n')

print(section_break)

# Series.sort_values(): order by value, here from largest to smallest.
print('Series.sort_values()', banner('before'), s_1, sep='\n')
print(banner('after'), s_1.sort_values(ascending=False), sep='\n')

print(section_break)

# Series.rank(): rank of each value.
print('Series.rank()', banner('before'), s_1, sep='\n')
# method='average': tied values share the mean of their ranks.
print(banner('after_1'), s_1.rank(method='average'), sep='\n')
# method='first': tied values are ranked in order of appearance.
print(banner('after_2'), s_1.rank(method='first'), sep='\n')

Series.sort_index()
---------before---------
c     1
d     4
e     8
a    32
b     4
dtype: int64
---------after_1---------
a    32
b     4
c     1
d     4
e     8
dtype: int64
---------after_2---------
e     8
d     4
c     1
b     4
a    32
dtype: int64
Series.sort_values()
---------before---------
c     1
d     4
e     8
a    32
b     4
dtype: int64
---------after---------
a    32
e     8
b     4
d     4
c     1
dtype: int64
Series.rank()
---------before---------
c     1
d     4
e     8
a    32
b     4
dtype: int64
---------after_1---------
c    1.0
d    2.5
e    4.0
a    5.0
b    2.5
dtype: float64
---------after_2---------
c    1.0
d    2.0
e    4.0
a    5.0
b    3.0
dtype: float64


In [54]:
df_1 = pd.DataFrame(
    {
        'col_1': [4, 3, 2, 3, 2, 1],
        'col_2': [5, 4, 3, 4, 3, 2],
        'col_3': [5, 4, 2, 3, 2, 1],
        'col_4': [7, 6, 4, 5, 5, 6],
    },
    index=list(map('index_{}'.format, range(1, 7))),
)

# Small helpers for the printed headings.
banner = ('-' * 9 + '{}' + '-' * 9).format
section_break = '=' * 50

# df.sort_index()
print('df.sort_index()', banner('before'), df_1, sep='\n')
# axis=1 orders the column labels.
print(banner('after_1'), df_1.sort_index(axis=1, ascending=False), sep='\n')
# axis=0 orders the row labels.
print(banner('after_2'), df_1.sort_index(axis=0, ascending=False), sep='\n')

print(section_break)

# df.sort_values()
print('df.sort_values()', banner('before'), df_1, sep='\n')
# Order the rows by col_4, using col_2 to break ties.
print(banner('after_1'), df_1.sort_values(by=['col_4', 'col_2'], axis=0, ascending=False), sep='\n')
# Order the columns by the values found in row index_2.
print(banner('after_2'), df_1.sort_values(by='index_2', axis=1, ascending=False), sep='\n')

print(section_break)

# df.rank()
print('df.rank()', banner('before'), df_1, sep='\n')
# method='first': tied values are ranked in order of appearance.
print(banner('after_1'), df_1.rank(axis=1, method='first'), sep='\n')
# method='min': tied values all receive the lowest rank of the group.
print(banner('after_2'), df_1.rank(axis=0, method='min'), sep='\n')
# method='max': tied values all receive the highest rank of the group.
print(banner('after_3'), df_1.rank(axis=0, method='max'), sep='\n')
# method='average': tied values all receive the mean rank of the group.
print(banner('after_4'), df_1.rank(axis=0, method='average'), sep='\n')

df.sort_index()
---------before---------
         col_1  col_2  col_3  col_4
index_1      4      5      5      7
index_2      3      4      4      6
index_3      2      3      2      4
index_4      3      4      3      5
index_5      2      3      2      5
index_6      1      2      1      6
---------after_1---------
         col_4  col_3  col_2  col_1
index_1      7      5      5      4
index_2      6      4      4      3
index_3      4      2      3      2
index_4      5      3      4      3
index_5      5      2      3      2
index_6      6      1      2      1
---------after_2---------
         col_1  col_2  col_3  col_4
index_6      1      2      1      6
index_5      2      3      2      5
index_4      3      4      3      5
index_3      2      3      2      4
index_2      3      4      4      6
index_1      4      5      5      7
df.sort_values()
---------before---------
         col_1  col_2  col_3  col_4
index_1      4      5      5      7
index_2      3      4      4      6
i

## 五、描述性统计

In [55]:
# 9x9 frame holding 0..80 row by row; it is the sample data for the statistics below.
df_1 = pd.DataFrame(
    np.arange(9 * 9).reshape(9, 9),
    index=list(map('index_{}'.format, range(1, 10))),
    columns=list(map('col_{}'.format, range(1, 10))),
)
print(df_1)

         col_1  col_2  col_3  col_4  col_5  col_6  col_7  col_8  col_9
index_1      0      1      2      3      4      5      6      7      8
index_2      9     10     11     12     13     14     15     16     17
index_3     18     19     20     21     22     23     24     25     26
index_4     27     28     29     30     31     32     33     34     35
index_5     36     37     38     39     40     41     42     43     44
index_6     45     46     47     48     49     50     51     52     53
index_7     54     55     56     57     58     59     60     61     62
index_8     63     64     65     66     67     68     69     70     71
index_9     72     73     74     75     76     77     78     79     80


In [56]:
# Dimensions of the frame as a (rows, columns) tuple.
df_1.shape

(9, 9)

In [57]:
# Preview the data: show the first 6 rows.
df_1.head(6)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
index_1,0,1,2,3,4,5,6,7,8
index_2,9,10,11,12,13,14,15,16,17
index_3,18,19,20,21,22,23,24,25,26
index_4,27,28,29,30,31,32,33,34,35
index_5,36,37,38,39,40,41,42,43,44
index_6,45,46,47,48,49,50,51,52,53


In [58]:
# Summary of the frame: index range, columns, non-null counts, dtypes and memory usage.
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, index_1 to index_9
Data columns (total 9 columns):
col_1    9 non-null int32
col_2    9 non-null int32
col_3    9 non-null int32
col_4    9 non-null int32
col_5    9 non-null int32
col_6    9 non-null int32
col_7    9 non-null int32
col_8    9 non-null int32
col_9    9 non-null int32
dtypes: int32(9)
memory usage: 396.0+ bytes


In [59]:
# Sums: the default call and axis=0 both add down each column,
# while axis=1 adds across each row.
rule = '-' * 50
print(df_1.sum(), rule, sep='\n')
print(df_1.sum(axis=0), rule, sep='\n')
print(df_1.sum(axis=1))

col_1    324
col_2    333
col_3    342
col_4    351
col_5    360
col_6    369
col_7    378
col_8    387
col_9    396
dtype: int64
--------------------------------------------------
col_1    324
col_2    333
col_3    342
col_4    351
col_5    360
col_6    369
col_7    378
col_8    387
col_9    396
dtype: int64
--------------------------------------------------
index_1     36
index_2    117
index_3    198
index_4    279
index_5    360
index_6    441
index_7    522
index_8    603
index_9    684
dtype: int64


In [60]:
# Column means, computed two ways: with mean() and by hand as sum / count.
print(df_1.mean(), '-' * 50, df_1.sum() / df_1.count(), sep='\n')

col_1    36.0
col_2    37.0
col_3    38.0
col_4    39.0
col_5    40.0
col_6    41.0
col_7    42.0
col_8    43.0
col_9    44.0
dtype: float64
--------------------------------------------------
col_1    36.0
col_2    37.0
col_3    38.0
col_4    39.0
col_5    40.0
col_6    41.0
col_7    42.0
col_8    43.0
col_9    44.0
dtype: float64


In [61]:
# Count of non-NA values in each column.
df_1.count()

col_1    9
col_2    9
col_3    9
col_4    9
col_5    9
col_6    9
col_7    9
col_8    9
col_9    9
dtype: int64

In [62]:
# Common descriptive statistics per column: count, mean, std, min, quartiles and max.
df_1.describe()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0
std,24.647515,24.647515,24.647515,24.647515,24.647515,24.647515,24.647515,24.647515,24.647515
min,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
25%,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0
50%,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0
75%,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0
max,72.0,73.0,74.0,75.0,76.0,77.0,78.0,79.0,80.0


In [63]:
# Pairwise correlation coefficients between the columns.
df_1.corr()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
col_1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
col_9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
# Pairwise covariance between the columns.
df_1.cov()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
col_1,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_2,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_3,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_4,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_6,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_7,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_8,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5
col_9,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5,607.5


In [65]:
# Correlation coefficient of every column with one chosen column (here col_5).
df_1.corrwith(df_1['col_5'])

col_1    1.0
col_2    1.0
col_3    1.0
col_4    1.0
col_5    1.0
col_6    1.0
col_7    1.0
col_8    1.0
col_9    1.0
dtype: float64

## 六、apply函数初识

**语法：**
```python
DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwds)
```

In [66]:
# 9x9 frame holding 0..80 row by row; it is the sample data for the apply() examples.
df_1 = pd.DataFrame(
    np.arange(9 * 9).reshape(9, 9),
    index=list(map('index_{}'.format, range(1, 10))),
    columns=list(map('col_{}'.format, range(1, 10))),
)
print(df_1)

         col_1  col_2  col_3  col_4  col_5  col_6  col_7  col_8  col_9
index_1      0      1      2      3      4      5      6      7      8
index_2      9     10     11     12     13     14     15     16     17
index_3     18     19     20     21     22     23     24     25     26
index_4     27     28     29     30     31     32     33     34     35
index_5     36     37     38     39     40     41     42     43     44
index_6     45     46     47     48     49     50     51     52     53
index_7     54     55     56     57     58     59     60     61     62
index_8     63     64     65     66     67     68     69     70     71
index_9     72     73     74     75     76     77     78     79     80


In [67]:
# Example 1: spread (max - min) of every row.
def func(x):
    """Return the largest value of ``x`` minus its smallest value."""
    return x.max() - x.min()


# axis=1 hands each row to func.
df_1.apply(func, axis=1)

index_1    8
index_2    8
index_3    8
index_4    8
index_5    8
index_6    8
index_7    8
index_8    8
index_9    8
dtype: int64

In [8]:
# Example 2: a function that returns a Series produces one result row per label.
def func(x):
    """Return the maximum and minimum of ``x`` as a Series labelled 'max' and 'min'."""
    highest, lowest = x.max(), x.min()
    return pd.Series([highest, lowest], index=['max', 'min'])


# With the default axis each column is handed to func.
df_1.apply(func)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
max,72,73,74,75,76,77,78,79,80
min,0,1,2,3,4,5,6,7,8


In [12]:
# Example 3: set one column of every row from a condition on another column.
df_1 = pd.DataFrame(np.arange(1, 10).reshape(3, 3),
                    index=['a', 'b', 'c'],
                    columns=['x', 'y', 'z'])
df_1


def func(df):
    """Return a copy of one row with 'y' set to 999 when 'x' > 4, otherwise 666.

    Parameters
    ----------
    df : pandas.Series
        A single row of the frame. Despite its name it is not a DataFrame:
        the function is applied with ``axis=1``, so it receives one row at a time.

    Returns
    -------
    pandas.Series
        A new row; the object that was passed in is left untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    row = df.copy()
    row['y'] = 999 if row['x'] > 4 else 666
    return row


# apply() returns a new frame; df_1 itself keeps its original 'y' values.
df_1.apply(func, axis=1)

Unnamed: 0,x,y,z
a,1,2,3
b,4,5,6
c,7,8,9


Unnamed: 0,x,y,z
a,1,666,3
b,4,666,6
c,7,999,9
