In [3]:
import numpy as np
import pandas as pd
import re

In [4]:
# 显示全部结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## 一、缺失值的处理

In [377]:
# 生成一个有缺失值的DataFrame
df_1 = pd.DataFrame(np.array([np.random.choice((np.random.randint(1, 10),np.nan))  for i in range(1000)]).reshape(50,20), 
                    columns=['col_{}'.format(i) for i in range(1,21)],
                    index=['index_{}'.format(j) for j in range(1,51)])

df_2 = pd.DataFrame(np.array([np.random.randint(1,10) for i in range(400)]).reshape(20,20), 
                    columns=['col_{}'.format(i) for i in range(1,21)],
                    index=['index_{}'.format(j) for j in range(51,71)])

df_3 = df_1.append(df_2)

df_3.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,7.0,6.0,2.0,,,3.0,,8.0,,,2.0,3.0,,,,1.0,,4.0,7.0,
index_2,6.0,4.0,,,4.0,,1.0,3.0,6.0,,,4.0,,7.0,9.0,,6.0,9.0,7.0,
index_3,,,2.0,3.0,,3.0,9.0,,,,,6.0,,8.0,,,4.0,,,
index_4,2.0,,,,1.0,,3.0,,,2.0,4.0,,,3.0,,,5.0,7.0,,
index_5,,8.0,8.0,,8.0,3.0,2.0,,,,,5.0,9.0,,4.0,5.0,8.0,,3.0,


### （一）isnull()

* 判断是否为空值，返回bool值

In [378]:
df_isnull = df_3.isnull()
df_isnull.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,False,False,False,True,True,False,True,False,True,True,False,False,True,True,True,False,True,False,False,True
index_2,False,False,True,True,False,True,False,False,False,True,True,False,True,False,False,True,False,False,False,True
index_3,True,True,False,False,True,False,False,True,True,True,True,False,True,False,True,True,False,True,True,True
index_4,False,True,True,True,False,True,False,True,True,False,False,True,True,False,True,True,False,False,True,True
index_5,True,False,False,True,False,False,False,True,True,True,True,False,False,True,False,False,False,True,False,True


In [379]:
# 计算缺失率
df_isnull.sum()/df_isnull.count()

col_1     0.342857
col_2     0.414286
col_3     0.314286
col_4     0.371429
col_5     0.328571
col_6     0.328571
col_7     0.428571
col_8     0.328571
col_9     0.357143
col_10    0.342857
col_11    0.357143
col_12    0.314286
col_13    0.357143
col_14    0.357143
col_15    0.371429
col_16    0.357143
col_17    0.414286
col_18    0.342857
col_19    0.371429
col_20    0.357143
dtype: float64

### （二）notnull()

* isnull()的反函数

In [380]:
df_3.notnull().head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,True,True,True,False,False,True,False,True,False,False,True,True,False,False,False,True,False,True,True,False
index_2,True,True,False,False,True,False,True,True,True,False,False,True,False,True,True,False,True,True,True,False
index_3,False,False,True,True,False,True,True,False,False,False,False,True,False,True,False,False,True,False,False,False
index_4,True,False,False,False,True,False,True,False,False,True,True,False,False,True,False,False,True,True,False,False
index_5,False,True,True,False,True,True,True,False,False,False,False,True,True,False,True,True,True,False,True,False


### （三）dropna()

```python
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
```
> * **axis：** 确定是删除包含缺失值的行还是列。axis=0 删除行；axis=1 删除列；默认为0；
* **how：** 是删除包含缺失值的行（列）还是删除全部是缺失值的行（列）。how='any' 删除包含缺失值的行（列）；how='all' 删除全部是缺失值的行（列）；默认为'any'；
* **thresh：** 保留那些至少有指定整数个非NaN的行（列）；
* **subset：** 保留指定列均不为NaN的行；
* **inplace：** 是否在原始数据上修改。True 在原始数据上直接修改；False 不在原始数据上直接修改；默认为False。

In [381]:
df_3.dropna(axis=0, how='any', thresh=3, subset=['col_1', 'col_2', 'col_3'],inplace=False)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,7.0,6.0,2.0,,,3.0,,8.0,,,2.0,3.0,,,,1.0,,4.0,7.0,
index_32,3.0,3.0,2.0,8.0,,,3.0,8.0,1.0,,7.0,5.0,,8.0,,,9.0,,,1.0
index_34,4.0,6.0,1.0,3.0,,,,,,2.0,3.0,,,5.0,,,6.0,8.0,,3.0
index_49,3.0,8.0,2.0,1.0,,,,4.0,,6.0,,3.0,,,7.0,,,,2.0,6.0
index_51,2.0,8.0,9.0,4.0,4.0,5.0,9.0,1.0,5.0,1.0,3.0,6.0,3.0,4.0,4.0,7.0,1.0,2.0,8.0,5.0
index_52,3.0,5.0,4.0,4.0,8.0,9.0,3.0,9.0,1.0,3.0,6.0,8.0,9.0,7.0,1.0,5.0,1.0,9.0,9.0,4.0
index_53,4.0,3.0,6.0,9.0,7.0,1.0,1.0,8.0,9.0,8.0,2.0,2.0,8.0,2.0,1.0,8.0,1.0,5.0,8.0,6.0
index_54,2.0,7.0,8.0,3.0,2.0,3.0,2.0,2.0,6.0,5.0,4.0,4.0,3.0,2.0,4.0,1.0,6.0,9.0,4.0,3.0
index_55,6.0,8.0,6.0,4.0,2.0,1.0,7.0,4.0,2.0,2.0,9.0,2.0,7.0,5.0,5.0,4.0,1.0,4.0,4.0,5.0
index_56,3.0,7.0,4.0,8.0,9.0,9.0,5.0,9.0,4.0,1.0,4.0,1.0,9.0,9.0,8.0,4.0,7.0,2.0,5.0,1.0


### （四）fillna()

```python
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)
```
> * **value：** 填充值。可以为：标量、字典、Series、DataFrame；
* **method：** 填充方法。'backfill' 用后面的值向前填充； 'bfill' 用后面的值向前填充； 'pad' 用前面的值向后填充； 'ffill' 用前面的值向后填充； None；
* **axis：** 进行填充的方向轴。0 index；1 columns；默认0；
* **inplace：** 是否在原始数据上修改。True 在原始数据上直接修改；False 不在原始数据上直接修改；默认为False；
* **limit：** 如果指定了method，则这是要向前/向后填充的连续NaN值的最大数量。换句话说，如果存在连续的NaN数量大于此数量，则只会部分填充该数量的NaN。如果未指定method，则这是整个轴上将填充NaN的最大条目数。如果不为None，则必须大于0；
* **downcast：** 略。

In [382]:
df_3.head(10)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,7.0,6.0,2.0,,,3.0,,8.0,,,2.0,3.0,,,,1.0,,4.0,7.0,
index_2,6.0,4.0,,,4.0,,1.0,3.0,6.0,,,4.0,,7.0,9.0,,6.0,9.0,7.0,
index_3,,,2.0,3.0,,3.0,9.0,,,,,6.0,,8.0,,,4.0,,,
index_4,2.0,,,,1.0,,3.0,,,2.0,4.0,,,3.0,,,5.0,7.0,,
index_5,,8.0,8.0,,8.0,3.0,2.0,,,,,5.0,9.0,,4.0,5.0,8.0,,3.0,
index_6,,,,,8.0,5.0,,8.0,,4.0,,,,6.0,3.0,,3.0,,7.0,8.0
index_7,4.0,9.0,,,,,,,,1.0,5.0,7.0,6.0,,,,8.0,6.0,5.0,6.0
index_8,,,1.0,4.0,2.0,1.0,,,5.0,3.0,,,,,9.0,,,,8.0,4.0
index_9,,,8.0,7.0,,,3.0,,,,,,,3.0,,9.0,,5.0,,5.0
index_10,5.0,2.0,,4.0,9.0,7.0,7.0,,7.0,4.0,5.0,,4.0,,,8.0,,3.0,2.0,5.0


In [383]:
df_3_1 = df_3.fillna(df_3.mean(), axis=0, inplace=False, limit=6)
df_3_1.head(10)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,7.0,6.0,2.0,5.431818,4.87234,3.0,5.025,8.0,4.955556,4.26087,2.0,3.0,5.533333,5.488889,5.545455,1.0,5.073171,4.0,7.0,4.511111
index_2,6.0,4.0,5.0625,5.431818,4.0,4.021277,1.0,3.0,6.0,4.26087,5.2,4.0,5.533333,7.0,9.0,4.955556,6.0,9.0,7.0,4.511111
index_3,4.304348,5.0,2.0,3.0,4.87234,3.0,9.0,4.893617,4.955556,4.26087,5.2,6.0,5.533333,8.0,5.545455,4.955556,4.0,5.434783,5.363636,4.511111
index_4,2.0,5.0,5.0625,5.431818,1.0,4.021277,3.0,4.893617,4.955556,2.0,4.0,4.770833,5.533333,3.0,5.545455,4.955556,5.0,7.0,5.363636,4.511111
index_5,4.304348,8.0,8.0,5.431818,8.0,3.0,2.0,4.893617,4.955556,4.26087,5.2,5.0,9.0,5.488889,4.0,5.0,8.0,5.434783,3.0,4.511111
index_6,4.304348,5.0,5.0625,5.431818,8.0,5.0,5.025,8.0,4.955556,4.0,5.2,4.770833,5.533333,6.0,3.0,4.955556,3.0,5.434783,7.0,8.0
index_7,4.0,9.0,5.0625,5.431818,4.87234,4.021277,5.025,4.893617,4.955556,1.0,5.0,7.0,6.0,5.488889,5.545455,4.955556,8.0,6.0,5.0,6.0
index_8,4.304348,5.0,1.0,4.0,2.0,1.0,5.025,4.893617,5.0,3.0,5.2,4.770833,5.533333,5.488889,9.0,4.955556,5.073171,5.434783,8.0,4.0
index_9,4.304348,5.0,8.0,7.0,4.87234,4.021277,3.0,4.893617,,4.26087,5.2,4.770833,,3.0,5.545455,9.0,5.073171,5.0,5.363636,5.0
index_10,5.0,2.0,5.0625,4.0,9.0,7.0,7.0,,7.0,4.0,5.0,4.770833,4.0,5.488889,5.545455,8.0,5.073171,3.0,2.0,5.0


In [384]:
df_3_2 = df_3.fillna(method='bfill', axis=0, inplace=False, limit=4)
df_3_2.head(10)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
index_1,7.0,6.0,2.0,3.0,4.0,3.0,1.0,8.0,6.0,2.0,2.0,3.0,9.0,7.0,9.0,1.0,6.0,4.0,7.0,
index_2,6.0,4.0,2.0,3.0,4.0,3.0,1.0,3.0,6.0,2.0,4.0,4.0,9.0,7.0,9.0,5.0,6.0,9.0,7.0,8.0
index_3,2.0,8.0,2.0,3.0,1.0,3.0,9.0,8.0,,2.0,4.0,6.0,9.0,8.0,4.0,5.0,4.0,7.0,3.0,8.0
index_4,2.0,8.0,8.0,4.0,1.0,3.0,3.0,8.0,5.0,2.0,4.0,5.0,9.0,3.0,4.0,5.0,5.0,7.0,3.0,8.0
index_5,4.0,8.0,8.0,4.0,8.0,3.0,2.0,8.0,5.0,4.0,5.0,5.0,9.0,6.0,4.0,5.0,8.0,6.0,3.0,8.0
index_6,4.0,9.0,1.0,4.0,8.0,5.0,3.0,8.0,5.0,4.0,5.0,7.0,6.0,6.0,3.0,9.0,3.0,6.0,7.0,8.0
index_7,4.0,9.0,1.0,4.0,2.0,1.0,3.0,2.0,5.0,1.0,5.0,7.0,6.0,3.0,9.0,9.0,8.0,6.0,5.0,6.0
index_8,5.0,2.0,1.0,4.0,2.0,1.0,3.0,2.0,5.0,3.0,5.0,,4.0,3.0,9.0,9.0,9.0,5.0,8.0,4.0
index_9,5.0,2.0,8.0,7.0,9.0,7.0,3.0,2.0,7.0,4.0,5.0,3.0,4.0,3.0,6.0,9.0,9.0,5.0,2.0,5.0
index_10,5.0,2.0,6.0,4.0,9.0,7.0,7.0,2.0,7.0,4.0,5.0,3.0,4.0,7.0,6.0,8.0,9.0,3.0,2.0,5.0


## 二、数据转换

### （一）去除重复值

#### 1. duplicated()

* 显示重复值

In [385]:
df_1 = pd.DataFrame(np.array([i for i in range(100)]).reshape(10,10),
                    index=['index_{}'.format(i) for i in range(10)],
                    columns=['col_{}'.format(j) for j in range(10)])

df_dup = df_1.append(df_1.iloc[:5, :]).append(df_1.iloc[3:7, :])

df_dup

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
index_0,0,1,2,3,4,5,6,7,8,9
index_1,10,11,12,13,14,15,16,17,18,19
index_2,20,21,22,23,24,25,26,27,28,29
index_3,30,31,32,33,34,35,36,37,38,39
index_4,40,41,42,43,44,45,46,47,48,49
index_5,50,51,52,53,54,55,56,57,58,59
index_6,60,61,62,63,64,65,66,67,68,69
index_7,70,71,72,73,74,75,76,77,78,79
index_8,80,81,82,83,84,85,86,87,88,89
index_9,90,91,92,93,94,95,96,97,98,99


In [386]:
# 随机打乱行
df_dup = df_dup.sample(frac=1)

In [387]:
df_dup.duplicated()

index_4    False
index_2    False
index_0    False
index_1    False
index_3    False
index_3     True
index_5    False
index_6    False
index_4     True
index_8    False
index_3     True
index_9    False
index_6     True
index_4     True
index_0     True
index_7    False
index_5     True
index_1     True
index_2     True
dtype: bool

In [388]:
df_dup.duplicated().sum()/df_dup.count()

col_0    0.473684
col_1    0.473684
col_2    0.473684
col_3    0.473684
col_4    0.473684
col_5    0.473684
col_6    0.473684
col_7    0.473684
col_8    0.473684
col_9    0.473684
dtype: float64

#### 2. drop_duplicates()

```python
drop_duplicates(self, subset, keep, inplace, ignore_index)
```
> * **subset:** 指定要进行去重的列名，默认为所有列；
> * **keep:**
    * 'first': 删除第一次出现的后面的所有重复项；
    * 'last': 删除最后一次出现的前面的所有重复项；
    * False: 删除所有重复项;
    * 默认'first'
> * **inplace:**
    * False: 不改变原数据，返回一个经过副本
    * True: 直接在原数据上进行删除
    * 默认False
> * **ignore_index:** (version 1.0.0新增)
    * False: 不重新恢复索引
    * True: 重新恢复索引
    * 默认False


In [389]:
# pip install --upgrade pandas  

In [390]:
df_dup

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
index_4,40,41,42,43,44,45,46,47,48,49
index_2,20,21,22,23,24,25,26,27,28,29
index_0,0,1,2,3,4,5,6,7,8,9
index_1,10,11,12,13,14,15,16,17,18,19
index_3,30,31,32,33,34,35,36,37,38,39
index_3,30,31,32,33,34,35,36,37,38,39
index_5,50,51,52,53,54,55,56,57,58,59
index_6,60,61,62,63,64,65,66,67,68,69
index_4,40,41,42,43,44,45,46,47,48,49
index_8,80,81,82,83,84,85,86,87,88,89


In [391]:
df_dup_copy = df_dup.drop_duplicates(keep='first')
df_dup_copy

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
index_4,40,41,42,43,44,45,46,47,48,49
index_2,20,21,22,23,24,25,26,27,28,29
index_0,0,1,2,3,4,5,6,7,8,9
index_1,10,11,12,13,14,15,16,17,18,19
index_3,30,31,32,33,34,35,36,37,38,39
index_5,50,51,52,53,54,55,56,57,58,59
index_6,60,61,62,63,64,65,66,67,68,69
index_8,80,81,82,83,84,85,86,87,88,89
index_9,90,91,92,93,94,95,96,97,98,99
index_7,70,71,72,73,74,75,76,77,78,79


* **在删除重复值后要重新设置一下index，以下为重新设置index的步骤**

In [392]:
df_dup_copy.index = ['index_{}'.format(i) for i in range(df_dup_copy.shape[0])]

In [393]:
df_dup_copy

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
index_0,40,41,42,43,44,45,46,47,48,49
index_1,20,21,22,23,24,25,26,27,28,29
index_2,0,1,2,3,4,5,6,7,8,9
index_3,10,11,12,13,14,15,16,17,18,19
index_4,30,31,32,33,34,35,36,37,38,39
index_5,50,51,52,53,54,55,56,57,58,59
index_6,60,61,62,63,64,65,66,67,68,69
index_7,80,81,82,83,84,85,86,87,88,89
index_8,90,91,92,93,94,95,96,97,98,99
index_9,70,71,72,73,74,75,76,77,78,79


### （二）映射 —— Series.map()  

**参考：**https://zhuanlan.zhihu.com/p/100064394

**链接：** 
> * Series.apply() 
> * DataFrame.apply()  
> * DataFrame.applymap()

In [474]:
df_1 = pd.DataFrame({'食物': ['__酱猪肝', '__凉拌猪耳朵', '__辣炒鸡心', '__番茄牛腩', '__麻辣兔头',
                            '__泡椒风爪', '__可乐鸡翅', '__菲力牛排', '__红烧兔肉', '__驴肉火烧', '__牛杂汤'],
                     '价格': [23, 18, 20, 25, 32, 16, 23, 158, 56, 18, 25]})
df_1 = df_1.append(df_1,ignore_index=True).append(df_1,ignore_index=True).append(df_1,ignore_index=True)
df_1.head(10)

Unnamed: 0,食物,价格
0,__酱猪肝,23
1,__凉拌猪耳朵,18
2,__辣炒鸡心,20
3,__番茄牛腩,25
4,__麻辣兔头,32
5,__泡椒风爪,16
6,__可乐鸡翅,23
7,__菲力牛排,158
8,__红烧兔肉,56
9,__驴肉火烧,18


In [395]:
food_to_animal = {'酱猪肝': '猪',
                  '凉拌猪耳朵': '猪',
                  '辣炒鸡心': '鸡',
                  '番茄牛腩': '牛',
                  '麻辣兔头': '兔',
                  '泡椒风爪': '鸡',
                  '可乐鸡翅': '鸡',
                  '菲力牛排': '牛',
                  '红烧兔肉': '兔',
                  '驴肉火烧': '驴',
                  '牛杂汤': '牛'}

df_1_re = df_1['食物'].str.replace('_', '')  # 由于df_1['食物']中的元素前面有'__'的不相关字符，不去除，会导致map映射不上

df_1['动物'] = df_1_re.map(food_to_animal)

df_1.head(10)

Unnamed: 0,食物,价格,动物
0,__酱猪肝,23,猪
1,__凉拌猪耳朵,18,猪
2,__辣炒鸡心,20,鸡
3,__番茄牛腩,25,牛
4,__麻辣兔头,32,兔
5,__泡椒风爪,16,鸡
6,__可乐鸡翅,23,鸡
7,__菲力牛排,158,牛
8,__红烧兔肉,56,兔
9,__驴肉火烧,18,驴


In [396]:
df_1.drop(columns='动物').head(10)  # 将上面的结果先删除

df_1['食物'].map(lambda x: food_to_animal[x.replace('_','')]).head(10)  # 直接将替换'__'的操作放到map函数内部

df_1.head(10)

Unnamed: 0,食物,价格
0,__酱猪肝,23
1,__凉拌猪耳朵,18
2,__辣炒鸡心,20
3,__番茄牛腩,25
4,__麻辣兔头,32
5,__泡椒风爪,16
6,__可乐鸡翅,23
7,__菲力牛排,158
8,__红烧兔肉,56
9,__驴肉火烧,18


0    猪
1    猪
2    鸡
3    牛
4    兔
5    鸡
6    鸡
7    牛
8    兔
9    驴
Name: 食物, dtype: object

Unnamed: 0,食物,价格,动物
0,__酱猪肝,23,猪
1,__凉拌猪耳朵,18,猪
2,__辣炒鸡心,20,鸡
3,__番茄牛腩,25,牛
4,__麻辣兔头,32,兔
5,__泡椒风爪,16,鸡
6,__可乐鸡翅,23,鸡
7,__菲力牛排,158,牛
8,__红烧兔肉,56,兔
9,__驴肉火烧,18,驴


### （三）替代值

In [397]:
df_1 = pd.DataFrame(np.array([2, 3, 4, 5, 999, 999, 999, 998, 998, 6, 7, 8]).reshape(3,4),
                    index=['index_{}'.format(i) for i in range(1, 4)],
                    columns=['col_{}'.format(j) for j in range(1, 5)])
df_1

Unnamed: 0,col_1,col_2,col_3,col_4
index_1,2,3,4,5
index_2,999,999,999,998
index_3,998,6,7,8


In [398]:
df_2 = df_1.replace([999, 998],2)
df_2
print('-'*50)

df_2 = df_1.replace([999, 998],[2, 3])
df_2

Unnamed: 0,col_1,col_2,col_3,col_4
index_1,2,3,4,5
index_2,2,2,2,2
index_3,2,6,7,8


--------------------------------------------------


Unnamed: 0,col_1,col_2,col_3,col_4
index_1,2,3,4,5
index_2,2,2,2,3
index_3,3,6,7,8


### （四）重命名轴索引

In [399]:
df_1 = pd.DataFrame(np.array([2, 3, 4, 5, 999, 999, 999, 998, 998, 6, 7, 8]).reshape(3,4),
                    index=['index_{}'.format(i) for i in range(1, 4)],
                    columns=['col_{}'.format(j) for j in range(1, 5)])
df_1

Unnamed: 0,col_1,col_2,col_3,col_4
index_1,2,3,4,5
index_2,999,999,999,998
index_3,998,6,7,8


In [400]:
df_1.rename(index=str.title, columns=str.upper)

Unnamed: 0,COL_1,COL_2,COL_3,COL_4
Index_1,2,3,4,5
Index_2,999,999,999,998
Index_3,998,6,7,8


In [401]:
df_1.rename(index={'index_1': 'new_index_1'}, columns={'col_2': 'new_col_2'})

Unnamed: 0,col_1,new_col_2,col_3,col_4
new_index_1,2,3,4,5
index_2,999,999,999,998
index_3,998,6,7,8


### （五）离散化与分箱

#### 1. 等频分箱

```python
pandas.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
```
> * **x：**要进行分箱的 1d ndarray 或者 Series
> * **q：**要分的箱数
> * **labels：**生成的箱子的标签，要和箱子数相等
> * **rebins：**
    * False：不同时返回结构为：索引为样本索引，元素为分到的箱子的Series（默认）
    * True：返回
> * **precision：**精度
> * **duplicates:**
    * 'raise'：如果箱子边缘不唯一，报错（默认）
    * 'drop'：如果箱子边缘不唯一，删除重复箱
    
* **returns：**
    * 每个样本属于哪个箱子（Series）
    * 箱子上下界（Array）

In [402]:
# 一个实例
df_1 = pd.DataFrame(np.array([np.random.randint(1,100) for i in range(1000)]).reshape(100,10), 
                    columns=['col_{}'.format(i) for i in range(1,11)],
                    index=['index_{}'.format(j) for j in range(1,101)])
df_1.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
index_1,29,71,92,22,74,53,69,42,2,57
index_2,86,76,15,50,1,28,46,90,18,38
index_3,94,54,12,9,77,65,97,8,78,71
index_4,99,85,21,46,34,99,12,98,61,7
index_5,91,19,23,35,53,92,77,20,62,4


In [403]:
bins_for_col_5 = pd.qcut(df_1.loc[:, 'col_5'], q=4)
bins_for_col_5.value_counts()

(52.0, 76.0]      26
(22.75, 52.0]     25
(0.999, 22.75]    25
(76.0, 98.0]      24
Name: col_5, dtype: int64

#### 2. 等距分箱

**参考：**https://www.cnblogs.com/sench/p/10128216.html

```python
pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True)
```
> * **x：**被切分的类数组（array-like）数据，必须是1维的（不能用DataFrame）；
> * **bins：**被切割后的区间（或者叫“桶”、“箱”、“面元”），有3中形式：一个int型的标量、标量序列（数组）或者pandas.IntervalIndex；
    * 一个int型的标量
        * 当bins为一个int型的标量时，代表将x平分成bins份。x的范围在每侧扩展0.1%，以包括x的最大值和最小值
    * 标量序列
        * 标量序列定义了被分割后每一个bin的区间边缘，此时x没有扩展
    * pandas.IntervalIndex
        * 定义要使用的精确区间
> * **right：**bool型参数，默认为True，表示是否包含区间右部。比如如果bins=[1,2,3]，right=True，则区间为(1,2]，(2,3]；right=False，则区间为(1,2),(2,3)；
> * **lables：**给分割后的bins打标签，比如把年龄x分割成年龄段bins后，可以给年龄段打上诸如青年、中年的标签。labels的长度必须和划分后的区间长度相等，比如bins=[1,2,3]，划分后有2个区间(1,2]，(2,3]，则labels的长度必须为2。如果指定labels=False，则返回x中的数据在第几个bin中（从0开始）；
> * **retbins：**bool型的参数，表示是否将分割后的bins返回，当bins为一个int型的标量时比较有用，这样可以得到划分后的区间，默认为False；
> * **precision：**保留区间小数点的位数，默认为3；
> * **include_lowest：**bool型的参数，表示区间的左边是开还是闭的，默认为false，也就是不包含区间左部（闭）；
> * **duplicates：**是否允许重复区间。有两种选择：raise：不允许，drop：允许；
> * **ordered：**如果为True，将对返回的Categorical进行排序；如果为False，则不排序；默认True；
* **returns：**
    * **out：**一个pandas.Categorical, Series或者ndarray类型的值，代表分区后x中的每个值在哪个bin（区间）中，如果指定了labels，则返回对应的label；
    * **bins：**分隔后的区间，当指定retbins为True时返回。


In [404]:
# 一个实例
df_1 = pd.DataFrame(np.array([np.random.randint(1,100) for i in range(1000)]).reshape(100,10), 
                    columns=['col_{}'.format(i) for i in range(1,11)],
                    index=['index_{}'.format(j) for j in range(1,101)])
df_1.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
index_1,71,59,56,58,59,97,77,78,79,44
index_2,59,63,14,12,7,17,30,7,41,1
index_3,21,11,94,1,56,76,54,55,60,61
index_4,69,2,81,19,14,42,23,96,33,48
index_5,39,50,78,63,79,42,57,72,67,69


In [405]:
bins_for_col_7, bins = pd.cut(df_1.loc[:, 'col_7'], [1,25,50,75,100], retbins=True)
bins_for_col_7
print('-'*100)
bins

index_1      (75, 100]
index_2       (25, 50]
index_3       (50, 75]
index_4        (1, 25]
index_5       (50, 75]
               ...    
index_96     (75, 100]
index_97      (50, 75]
index_98       (1, 25]
index_99       (1, 25]
index_100    (75, 100]
Name: col_7, Length: 100, dtype: category
Categories (4, interval[int64]): [(1, 25] < (25, 50] < (50, 75] < (75, 100]]

----------------------------------------------------------------------------------------------------


array([  1,  25,  50,  75, 100])

### （六）异常值处理

In [406]:
df_1 = pd.DataFrame(np.array([np.random.randint(0, 10) for i in range(500)]).reshape(50,10), 
                    columns=['col_{}'.format(i) for i in range(1,11)],
                    index=['index_{}'.format(j) for j in range(1,51)])

df_2 = pd.DataFrame(np.array([np.random.choice((999, 998, -999, -998)) for i in range(40)]).reshape(4,10),
                    columns=['col_{}'.format(i) for i in range(1,11)],
                    index=['index_{}'.format(j) for j in range(51,55)])

df_3 = df_1.append(df_2).sample(frac=1)


df_3.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
index_53,999,999,-998,999,998,998,-999,998,-998,-999
index_41,8,3,0,9,3,5,5,8,2,5
index_29,4,1,8,8,9,8,7,4,8,4
index_9,4,5,7,1,1,0,6,3,1,1
index_45,7,2,0,0,2,6,6,6,0,9


In [407]:
# 查看数据分布
df_3.describe()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,41.074074,3.925926,3.462963,41.537037,4.240741,3.851852,-69.074074,4.148148,-33.074074,-32.814815
std,271.275871,274.392561,274.187975,271.212005,274.326684,274.325066,265.362113,274.394593,272.176542,272.353618
min,-999.0,-999.0,-998.0,-999.0,-999.0,-999.0,-999.0,-999.0,-998.0,-999.0
25%,2.0,2.0,1.0,2.25,2.0,1.25,3.0,2.0,1.0,1.0
50%,5.0,4.0,3.0,5.5,5.0,3.5,5.5,5.0,4.0,5.0
75%,7.0,7.0,7.0,8.0,7.75,7.0,7.0,7.0,7.0,7.0
max,999.0,999.0,998.0,999.0,999.0,999.0,9.0,999.0,998.0,998.0


In [408]:
# 显示异常值
df_3[(np.abs(df_3) > 100).any(axis=1)]

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
index_53,999,999,-998,999,998,998,-999,998,-998,-999
index_54,-999,999,-998,998,-999,-998,-998,-999,-998,-999
index_52,998,-998,998,-999,-998,-999,-999,999,-998,-998
index_51,999,-999,998,999,999,999,-998,-999,998,998


**非零判断**

##### df.any( )

```python
DataFrame.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs)
```
非零判断。只要有一个非零，就返回True；只有全部为零时，才返回False。

* **参数：**
> * **axis：** 按照指定轴进行判断。1为columns轴；0为index轴；
> * **bool_only：** 仅包含bool列；
> * **skipna：** 排除NA /空值。skipna=True，则将NA视为零；如果skipna=False，则将NA视为非零；
> * **level：** 针对折叠轴
> * ** **kwargs：**

* **返回值：**
> * 如果指定了level，则返回DataFrame；否则，返回Series。

##### df.all( )

**链接：**
```python
DataFrame.all(axis=0, bool_only=None, skipna=True, level=None, **kwargs)
```
非零判断。全为非零，返回True；只要有一个零，返回False。

##### any() 与 all() 对比

In [409]:
# any() 与 all() 对比
df_w_1 = pd.DataFrame(([0,0,0,1],
            [0,0,0,0]))
df_w_1

df_w_1.any(axis=1)
print('-'*10)
df_w_1.all(axis=1)

Unnamed: 0,0,1,2,3
0,0,0,0,1
1,0,0,0,0


0     True
1    False
dtype: bool

----------


0    False
1    False
dtype: bool

### （七）随机抽样

In [410]:
df_1 = pd.DataFrame(np.array(np.arange(25).reshape(5,5)))
df_1

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


#### 1. np.random.permutation()

In [411]:
np.random.permutation(df_1)  # 无放回抽样，不改变原始数据，返回新的数组

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [ 5,  6,  7,  8,  9],
       [20, 21, 22, 23, 24],
       [15, 16, 17, 18, 19]])

#### 2. np.random.shuffle()

In [412]:
df_2 = df_1.copy()

In [413]:
np.random.shuffle(df_2)  # 有放回抽样，直接改变原始数据，无返回值
df_2

Unnamed: 0,0,1,2,3,4
0,0,1,1,0,1
1,5,6,6,5,6
2,10,11,11,10,11
3,15,16,16,15,16
4,20,21,21,20,21


#### 3. df.sample()

In [414]:
df_1.sample(frac=0.6)  # frac=0.6 抽样比例0.6

Unnamed: 0,0,1,2,3,4
1,5,6,7,8,9
4,20,21,22,23,24
0,0,1,2,3,4


#### 4. np.random.choice()

```python
numpy.random.choice(a, size=None, replace=True, p=None)
```
> * **a：**一维待采样序列
> * **size：**采样样本数目
> * **replace：**是否重复采样
> * **p：**序列中每个位置的采样概率，需要与a等长

In [415]:
np.random.choice((1, 2, 3, 4), size=2, replace=True, p=[0.1, 0.2, 0.3, 0.4])

array([3, 1])

#### 5. random.sample()

```python
random.sample(N, m)
```
> * **N：**需要采样的序列
> * **m：**样的样本个数

In [416]:
import random
random.sample((1, 2, 3, 4), 2)

[1, 4]

### （八）虚拟变量转换

#### 1. 单类别转换

```python
pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
```
* **参数：**
> * **data:** 要进行转换的数据，可以是array-like, Series, DataFrame
> * **prefix:** 给生成的虚拟变量列名加前缀，可以是str, list of str, dict of str, default None
> * **prefix_sep:** 前缀连接符号，str, default ‘_’
> * **dummy_na:** 添加一列标识NaN，如果为False，则忽视NaN，bool, default False
> * **columns:** 指定需要实现类别转换的列名，如果不传入，则默认全部列进行类别转换，list-like, default None
> * **sparse:** 伪编码列是否应由SparseArray（True）或常规NumPy数组（False）支持，bool, default False
> * **drop_first:** 获得k中的k-1个类别值，去除第一个，bool, default False
> * **dtype:** 指定新数据的数据类型，dtype, default np.uint8
 
* **返回值：**
> 编码后的数据


In [417]:
df_1 = pd.DataFrame({'姓名': np.random.choice(('张三', '李四', '王五', '小明', '狗蛋'), 50).tolist(),
                     '性别': np.random.choice(('男', '女', np.nan), 50).tolist(),
                     '年龄': np.random.choice((range(1, 80, 7)), 50).tolist(),
                     '月收入': np.random.choice(range(5000, 12000, 100), 50).tolist()})  # array.tolist() 将array转化为list
df_1.head()

Unnamed: 0,姓名,性别,年龄,月收入
0,张三,男,71,7700
1,张三,,22,5400
2,张三,,36,8600
3,王五,,64,8600
4,王五,,43,5800


In [418]:
df_new = pd.get_dummies(df_1['性别'], prefix='gender',prefix_sep='_', dummy_na=True, drop_first=True)
df_new.head()

Unnamed: 0,gender_女,gender_男,gender_nan
0,0,1,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


**pandas.get_dummies 与 pd.cut() 的联合使用**

In [419]:
series = np.array([np.random.randint(1, 20) for i in range(20)])
series

pd.get_dummies(pd.cut(series, [0, 5, 10, 15, 20]))

array([ 4, 12,  6, 15, 18, 12, 18,  4,  3,  4,  2, 16, 10,  4, 13,  2,  7,
        8, 19,  2])

Unnamed: 0,"(0, 5]","(5, 10]","(10, 15]","(15, 20]"
0,1,0,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,0,1
5,0,0,1,0
6,0,0,0,1
7,1,0,0,0
8,1,0,0,0
9,1,0,0,0


#### 2. 多类别转换

In [420]:
# 导入电影数据
movies = pd.read_table('.\\data_for_book\\chapter_07\\movies.dat', sep='::', 
                        header=None, names=['movie_id', 'title', 'genres'])
movies.head(10)

  return read_csv(**locals())


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [421]:
# 提取所有的genres（题材）
# 自己实现的代码
'''
movies_genres = movies['genres'].str.split('|')

all_genres = []
for genres_list in movies_genres:
    for genres in genres_list:
        all_genres.append(genres)
        
all_genres = pd.unique(all_genres)

all_genres
'''

"\nmovies_genres = movies['genres'].str.split('|')\n\nall_genres = []\nfor genres_list in movies_genres:\n    for genres in genres_list:\n        all_genres.append(genres)\n        \nall_genres = pd.unique(all_genres)\n\nall_genres\n"

---
---
***tips:*** append() 与 extend()的区别

In [422]:
# append()
list_append = ['a', 'ab', 'abc']
list_add = ['abcd']
list_append.append(list_add)
list_append

['a', 'ab', 'abc', ['abcd']]

In [423]:
# extend()
list_extend = ['a', 'ab', 'abc']
list_add = ['abcd']
list_extend.extend(list_add)
list_extend

['a', 'ab', 'abc', 'abcd']

---
---

In [424]:
# 提取所有的genres（题材）
all_genres = []

for genres in movies['genres']:
    all_genres.extend(genres.split('|'))

all_genres = pd.unique(all_genres)

all_genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [425]:
# 建一个虚拟变量表结构
zero = np.zeros((len(movies), len(all_genres)))
dummies = pd.DataFrame(zero, columns=all_genres)
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [426]:
for i, gen in enumerate(movies['genres']):
    # 获得gen中每个genre在dummies中对应的的列索引值
    col_num = dummies.columns.get_indexer(gen.split('|'))
    # 将每一行的gen中的genre在dummies中对应列上打标为1，其他打标为0
    dummies.iloc[i, col_num] = 1

dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
---
***tips:*** enumerate() 
```python
enumerate(sequence, [start=0])
```
* **参数：**
> * **sequence:** 一个序列、迭代器或其他支持迭代对象；
> * **start:** 下标起始位置

* **返回值：**
> * 返回 enumerate(枚举) 对象

In [1]:
list_1 = ['a', 'b', 'c', 'd']
list(enumerate(list_1))
list(enumerate(list_1,start=1))

[(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]

In [5]:
# 一个例子
df_1 = pd.DataFrame(np.arange(1, 10).reshape(3, 3),
             index=['a', 'b', 'c'],
             columns=['x', 'y', 'z'])
df_1

def func(df, x, y):
    for i, v in enumerate(df_1['x']):
        if v == 4:
            df_1[y][i] = 999
        else:
            df_1[y][i] = 666
        
func(df_1,x='x',y='y')

df_1

Unnamed: 0,x,y,z
a,1,2,3
b,4,5,6
c,7,8,9


Unnamed: 0,x,y,z
a,1,666,3
b,4,999,6
c,7,666,9


---
---

In [428]:
# 合并
movies_add_dummies = movies.join(dummies.add_prefix('Genre_'))
movies_add_dummies.head()

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 三、字符串操作

### （一） python内建字符串方法

In [429]:
a = "** I love you, I love python more **"

In [430]:
# 计数  
a.count('I')

2

In [431]:
# 求字符串长度
len(a)

36

In [432]:
# 移除
a.strip("**")  # 两侧移除
a.lstrip("** ")  # 左侧移除
a.rstrip(" **")  # 右侧移除

' I love you, I love python more '

'I love you, I love python more **'

'** I love you, I love python more'

In [433]:
# 切分
a.split(" ")  # 切分出来时列表，可以用join()拼合成字符串
# result = a.split(" ")
# " ".join(result)
a.rsplit(" ")  # rsplit() 从右切分，split()是从左切分

['**', 'I', 'love', 'you,', 'I', 'love', 'python', 'more', '**']

['**', 'I', 'love', 'you,', 'I', 'love', 'python', 'more', '**']

In [434]:
# join()拼接
result = a.split(" ")
result
" ".join(result)

['**', 'I', 'love', 'you,', 'I', 'love', 'python', 'more', '**']

'** I love you, I love python more **'

In [435]:
# 成员运算 in 和 not in
" " in a
" " not in a

True

False

In [436]:
# 大小写转换
a.upper()
a.lower()
a.title()

'** I LOVE YOU, I LOVE PYTHON MORE **'

'** i love you, i love python more **'

'** I Love You, I Love Python More **'

In [437]:
# startswith()和endswith()
a.startswith("*")  # 判断以"*"开始
a.endswith("*")  # 判断以"*"结尾

True

True

In [438]:
# 替换
a.replace("*", "-")  # 用"-"替换"*"

'-- I love you, I love python more --'

In [439]:
# isdigit() 判断字符串里是否全是数字
b = "1 a"
b.isdigit()
c = "1"
c.isdigit()

False

True

In [440]:
# 查找索引
a.index("I")

3

### （二）正则表达式
* **参考1：** https://github.com/ziishaned/learn-regex/blob/master/translations/README-cn.md （英语原版）
* **参考2：** https://github.com/cdoco/learn-regex-zh （翻译较好的版本）

#### 1. 常用特殊字符和符号

##### 1.1 元字符

|元字符|描述|
|:-:|:-|
|  .  |句号匹配任意单个字符除了换行符|
|  [ ]  |字符种类。匹配方括号内的任意字符|
|  [^ ]  |否定的字符种类。匹配除了括号里的任意字符|
|  *  |匹配 * 号前的第一个字符或者字符集出现 >= 0 次的字符串|
|  +  |匹配 + 号前的第一个字符或者字符集出现 >= 1 次的字符串|
|  ?  |匹配 ?号前的第一个字符或者字符集出现 0 或 1 次的字符串|
|  {n,m}  |匹配 {n,m} 前面的第一个字符或者字符集出现 >= n, <= m 次的字符串|
|  (xyz)  |字符集，匹配与 xyz 完全相等的字符串|
|  \|  |或运算符，匹配符号前或后的字符|
|  \  |转义字符,用于匹配一些保留的字符 [ ] ( ) { } . * + ? ^ $ \ \||
|  ^  |检查字符串是否以^号后面的单字符或字符集开头|
|  \$  |检查字符串是否以\$号前面的单字符或字符集结尾|

**1.1.1 点运算符```.```**  

* ```.```匹配任意单个字符（除了换行符）。

In [441]:
str_1 = 'The car parked in the garage.'

re.findall('.ar', str_1)

['car', 'par', 'gar']

**1.1.2 字符集```[ ]```**

* 匹配方括号内的任意字符，在方括号中的字符集不关心顺序。

In [442]:
str_1 = 'The car parked in the garage.'
re.findall('[cp]ar', str_1)
re.findall('[gp]ar', str_1)

['car', 'par']

['par', 'gar']

**1.1.3 否定字符集```[^ ]```**

* 匹配除了方括号里的任意字符。

In [443]:
str_1 = 'The car parked in the garage.'
re.findall('[^g]ar', str_1)

['car', 'par']

**1.1.4 ```*```号**

* 匹配 ``` * ``` 号前的第一个字符或者字符集出现 >= 0 次的字符串。

In [444]:
str_1 = 'abc  bc a_a_a_abc abbbc'
re.findall('ab*', str_1)
re.findall('abb*', str_1)

['ab', 'a', 'a', 'a', 'ab', 'abbb']

['ab', 'ab', 'abbb']

**```.*```搭配使用**
* 可以匹配所有的字符。

In [445]:
str_1 = 'abc  bcd a_a_a_abc abbbc'
re.findall('a.*d', str_1)

['abc  bcd']

**1.1.5 ```+```号**

* 匹配 ```+``` 号前的第一个字符或者字符集出现 >= 1 次的字符串。

In [446]:
str_1 = 'abc  bc a_a_a_abcc abbbc'
re.findall('ab+', str_1)
re.findall('abb+', str_1)

['ab', 'ab', 'abbb']

['abbb']

**1.1.6 ```?```号**

* 匹配 ```?```号前的第一个字符或者字符集出现 0 或 1 次的字符串。

In [447]:
# 1
str_1 = 'abc  bc a_a_a_abcc abbbc'
re.findall('abb?', str_1)
re.findall('abb*', str_1)

['ab', 'ab', 'abb']

['ab', 'ab', 'abbb']

In [448]:
# 2
str_1 = 'abc  bc a_a_a_abcc abbc'
re.findall('ab[bc]?', str_1)

['abc', 'abc', 'abb']

**1.1.7 ```{n,m}```**

* 匹配 {n,m} 前面的第一个字符或者字符集出现 >= n, <= m 次的字符串

In [449]:
str_1 = 'The number was 9.9997 but we rounded it off to 10.0.'
re.findall('[0-9]{2,3}', str_1)

['999', '10']

**1.1.8 ```( )```**

* ```( )``` 中包含的内容将会被看成一个整体。例如, 表达式 ```(ab) *``` 匹配连续出现 0 或更多个 ab。如果没有使用 ```( )``` ，那么表达式 ```ab *``` 将匹配连续出现 0 或更多个 b 。

In [450]:
str_1 = 'abk acbk ababbk abbksdf dfgsfabbbkdlfll'
re.findall('(ab)*bk', str_1)
re.findall('ab*bk', str_1)

['', '', 'ab', 'ab', '']

['abk', 'abbk', 'abbk', 'abbbk']

**1.1.9 ```|``` 或运算符**
* 匹配符号前或后的字符。

In [451]:
str_1 = 'The car is parked in the garage.'
re.findall('the|car', str_1)

['car', 'the']

**1.1.10```\``` 转义字符**

* 用于转码紧跟其后的字符。如果想要匹配```{ } [ ] / \ + * . $ ^ | ?```这些特殊字符则要在其前面加上反斜线 ```\```。

In [452]:
str_1 = 'i love you......, do you ... love me?'
re.findall('\.+', str_1)

['......', '...']

**1.1.11 ```^```号**
* 检查字符串是否以```^```号后面的单字符或字符集开头。
(The caret symbol ```^``` is used to check if a matching character is the first character of the input string.)

In [453]:
str_1 = 'The car is parked in the garage'
re.findall('^(T|t)he', str_1)

['T']

**1.1.12 ```$```号**
* 检查字符串是否以```$```号前面的单字符或字符集结尾。(The dollar sign ```$``` is used to check if a matching character is the last character in the string. )

In [454]:
str_1 = 'The car is parked in the garage'
re.findall('e$', str_1)

['e']

##### 1.2 简写字符集

|简写|描述|
|:-:|:-|
| .|除换行符外的所有字符|
| \w |匹配所有字母数字，等同于 [a-zA-Z0-9_]|
| \W |匹配所有非字母数字，即符号，等同于： [^\w]|
| \d |匹配数字： [0-9]|
| \D |匹配非数字： [^\d]|
| \s |匹配所有空格字符，等同于： [\t\n\f\r\p{Z}]|
| \S |匹配所有非空格字符： [^\s]|
| \f |匹配一个换页符|
| \n |匹配一个换行符|
| \r |匹配一个回车符|
| \t |匹配一个制表符| 
| \v |匹配一个垂直制表符|
| \p |匹配 CR/LF（等同于 \r\n），用来匹配 DOS 行终止符|

**1.2.1 点运算符```.```**  

* ```.```匹配任意单个字符（除了换行符）。

In [455]:
str_1 = 'The car parked in the garage.'

re.findall('.ar', str_1)

['car', 'par', 'gar']

**1.2.2 ```\w```**
* 匹配所有字母数字，等同于 ```[a-zA-Z0-9_]```

In [456]:
str_1 = '????? **** i love yoU 1000 遍'
re.findall('\w', str_1)

['i', 'l', 'o', 'v', 'e', 'y', 'o', 'U', '1', '0', '0', '0', '遍']

**1.2.3 ```\W```**
* 匹配所有非字母数字，即符号，等同于： ```[^\w]```

In [457]:
str_1 = '????? **** i love yoU 1000 遍'
re.findall('\W', str_1) 

['?', '?', '?', '?', '?', ' ', '*', '*', '*', '*', ' ', ' ', ' ', ' ', ' ']

**1.2.4 ```\d```**
* 匹配数字： ```[0-9]```

In [458]:
str_1 = '????? **** i love yoU 1000 遍'
re.findall('\d', str_1) 

['1', '0', '0', '0']

**1.2.5 ```\D```**
* 匹配非数字： ```[^\d]```

In [459]:
str_1 = '????? **** i love yoU 1000 遍'
re.findall('\D', str_1) 

['?',
 '?',
 '?',
 '?',
 '?',
 ' ',
 '*',
 '*',
 '*',
 '*',
 ' ',
 'i',
 ' ',
 'l',
 'o',
 'v',
 'e',
 ' ',
 'y',
 'o',
 'U',
 ' ',
 ' ',
 '遍']

**1.2.6 ```\s```**
* 匹配所有空格字符，等同于： ```[\t\n\f\r\p{Z}]```

In [460]:
str_1 = '????? **** i love yoU 1000 遍'
re.findall('\s', str_1) 

[' ', ' ', ' ', ' ', ' ', ' ']

**1.2.7 ```\S```**
* 匹配所有非空格字符： ```[^\s]```

In [461]:
str_1 = '????? **** i love yoU 1000 遍'
re.findall('\S', str_1) 

['?',
 '?',
 '?',
 '?',
 '?',
 '*',
 '*',
 '*',
 '*',
 'i',
 'l',
 'o',
 'v',
 'e',
 'y',
 'o',
 'U',
 '1',
 '0',
 '0',
 '0',
 '遍']

**1.2.8 ```\f```**
* 匹配一个换页符

**1.2.9 ```\n```**
* 匹配一个换行符

**1.2.10 ```\r```**
* 匹配一个回车符

**1.2.11 ```\t```**
* 匹配一个制表符

**1.2.12 ```\v```**
* 匹配一个垂直制表符

**1.2.13 ```\p```**
* 匹配 CR/LF（等同于 \r\n），用来匹配 DOS 行终止符

##### 1.3 Lookarounds

Lookbehinds and lookaheads (also called lookarounds) are specific types of non-capturing groups (used to match a pattern but without including it in the matching list). Lookarounds are used when we a pattern must be preceded or followed by another pattern. For example, imagine we want to get all numbers that are preceded by the \$ character  from the string ```$4.44 and $10.88```. We will use the following regular expression ```(?<=\$)[0-9\.]* ``` which means: get all the numbers which contain the ```.``` character and are preceded by the ```$``` character. These are the lookarounds that are used in regular expressions:

|字符|描述|
|:-:|:-|
| ?= |跟随存在(Positive Lookahead)|
| ?! |跟随排除(Negative Lookahead)|
| ?<= |打头存在(Positive Lookbehind)|
| ?<! |打头排除(Negative Lookbehind)|

**1.3.1 ```?=```**
* The positive lookahead asserts that the first part of the expression must be **followed** by the lookahead expression. 

In [462]:
str_1 = '萝卜 1.20元 一斤，辣椒 1.50元 一斤，西红柿1.30'
re.findall('\d\.\d*(?=元)', str_1)
re.findall('\d\.\d*', str_1)

['1.20', '1.50']

['1.20', '1.50', '1.30']

**1.3.2 ```?!```**
* Negative lookaheads are used when we need to get all matches from an input string that are **not followed** by a certain pattern.

In [463]:
str_1 = '萝卜 1.20元 一斤，辣椒 1.50元 一斤，西红柿1.30'
re.findall('\d\.\d*(?!元)', str_1)
re.findall('\d\.\d*', str_1)

['1.2', '1.5', '1.30']

['1.20', '1.50', '1.30']

**1.3.3 ```?<=```**
* Positive lookbehinds are used to get all the matches that are **preceded** by a specific pattern. 

In [464]:
str_1 = '萝卜 ￥1.20 一斤，辣椒 ￥1.50 一斤，西红柿1.30'
re.findall('(?<=￥)\d*\.\d*', str_1)
re.findall('\d\.\d*', str_1)

['1.20', '1.50']

['1.20', '1.50', '1.30']

**1.3.4 ```?<!```**
* Negative lookbehinds are used to get all the matches that are **not preceded** by a specific pattern.

In [465]:
str_1 = '萝卜 ￥1.20 一斤，辣椒 ￥1.50 一斤，西红柿1.30'
re.findall('(?<!￥)\d*\.\d*', str_1)
re.findall('\d\.\d*', str_1)

['.20', '.50', '1.30']

['1.20', '1.50', '1.30']

#### 2. re模块

##### 2.1 re.findall()
```python
re.findall(pattern, string, flags=0)
```
* **pattern:** 正则表达式；
* **string:** 目标文本；
* **flags:** 匹配模式标志
> * re.I 忽略大小写
> * re.L
> * re.M
> * re.S 使 ```.``` 完全匹配任何字符，包括换行；没有这个标志， ```.``` 匹配除了换行符外的任何字符
> * re.X

In [466]:
# 例子
str_1 = '''"renxfSwitch":"1",
            "snBookPath":"http://snbook.suning.com",
            "itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

re.findall('price.*\d*\.\d*', str_1, flags=re.I)[0].replace('"', '')

'Price:22.80'

##### 2.2 re.match()
```python
re.match(pattern, string, flags=0)
```
If zero or more characters at **the beginning of** string match the regular expression pattern, return a corresponding match object. Return None if the string does not match the pattern; note that this is different from a zero-length match.

In [467]:
# 例子
str_1 = '''"itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

str_2 = '''"isbnCode":"",
"itemPrice":"22.80",
            "cookieDomain": ".suning.com", 
 '''

re.match('.*price.*\d*\.\d*', str_1, flags=re.I)
re.match('.*price.*\d*\.\d*', str_2, flags=re.I)

<re.Match object; span=(0, 18), match='"itemPrice":"22.80'>

##### 2.3 re.search()
```python
re.search(pattern, string, flags=0)
```
**Scan through** string looking for the first location where the regular expression pattern produces a match, and return a corresponding match object. Return None if no position in the string matches the pattern; note that this is different from finding a zero-length match at some point in the string.

In [468]:
# 例子
str_1 = '''"itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

str_2 = '''"isbnCode":"",
"itemPrice":"22.80",
            "cookieDomain": ".suning.com", 
 '''

re.search('.*price.*\d*\.\d*', str_1, flags=re.I)
re.search('.*price.*\d*\.\d*', str_2, flags=re.I)

<re.Match object; span=(0, 18), match='"itemPrice":"22.80'>

<re.Match object; span=(15, 33), match='"itemPrice":"22.80'>

##### 2.4 re.fullmatch()
```python
re.fullmatch(pattern, string, flags=0)
```
If the **whole string** matches the regular expression pattern, return a corresponding match object. Return None if the string does not match the pattern; note that this is different from a zero-length match.

In [469]:
# 例子
str_1 = '''"itemPrice":"22.80",
            "itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

str_2 = '''"itemPrice":"22.80"
'''

print('我是第一个fullmatch')
re.fullmatch('.*price.*\d*\.\d*.', str_1, flags=re.I)

print('我是第二个fullmatch')
re.fullmatch('.*', str_2, flags=re.S)  # 完全匹配string

print('我是match')
re.match('.*price.*\d*\.\d*', str_1, flags=re.I)

我是第一个fullmatch
我是第二个fullmatch


<re.Match object; span=(0, 20), match='"itemPrice":"22.80"\n'>

我是match


<re.Match object; span=(0, 18), match='"itemPrice":"22.80'>

##### 2.5 re.finditer()
```python
re.finditer(pattern, string, flags=0)
```
Return an **iterator** yielding match objects over all non-overlapping matches for the RE pattern in string. The string is scanned **left-to-right**, and matches are returned in the order found. Empty matches are included in the result.

In [470]:
str_1 = '''"itemPrice":"22.80",
            "itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

list(re.finditer('.*price.*\d*\.\d*.', str_1, flags=re.I))

[<re.Match object; span=(0, 19), match='"itemPrice":"22.80"'>,
 <re.Match object; span=(21, 52), match='            "itemPrice":"22.80"'>]

##### 2.6 re.sub()
```python
re.sub(pattern, repl, string, count=0, flags=0)
```
* **pattern:** 正则表达式；
* **repl:** 替换后的字符串；
* **string:** 目标文本；
* **count:** 替换个数。默认为0，表示每个匹配项都替换；
* **flags:** 匹配模式标志。

In [471]:
str_1 = '''"itemPrice":"22.80",
            "itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

re.sub('.*price.*\d*\.\d*.', '-------', str_1, flags=re.I)

'-------,\n-------,\n            "isbnCode":"",\n            "cookieDomain": ".suning.com", \n '

##### 2.7 re.split()
```python
re.split(pattern, string, maxsplit=0, flags=0)
```
* **pattern:** 正则表达式；
* **string:** 目标文本；
* **maxsplit:** 分割的最大次数。默认值为0，表示分割次数无限制，能分几次分几次；取负数，表示不分割；若大于0，表示最多分割maxsplit次；
* **flags:** 匹配模式标志。

In [472]:
str_1 = '''"itemPrice":"22.80",
            "itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''
re.split('\s+', str_1)
re.split('\s+', str_1, maxsplit=3)

['"itemPrice":"22.80",',
 '"itemPrice":"22.80",',
 '"isbnCode":"",',
 '"cookieDomain":',
 '".suning.com",',
 '']

['"itemPrice":"22.80",',
 '"itemPrice":"22.80",',
 '"isbnCode":"",',
 '"cookieDomain": ".suning.com", \n ']

##### 2.8 re.compile()
```python
re.compile(pattern, flags=0)
```
Compile a regular expression pattern into a regular expression object, which can be used for matching using its match(), search() and other methods. （把正则表达式编译成一个正则表达式对象。）

In [475]:
str_1 = '''"itemPrice":"22.80",
            "itemPrice":"22.80",
            "isbnCode":"",
            "cookieDomain": ".suning.com", 
 '''

re_1 = re.compile('.*price.*\d*\.\d*.', re.I)

re_1.findall(str_1)

['"itemPrice":"22.80"', '            "itemPrice":"22.80"']

### （三）pandas中的向量化字符串函数

In [529]:
names = np.random.choice(('__王二狗', '**狗蛋', '小明', '张__三???', '李**四__', '333王多余' ), 20)
ages = np.random.choice((13, 25, 34, 47, 58, 79), 20)
genders =  np.random.choice(('男', '女'), 20)

df_1 = pd.DataFrame({'姓名': list(names), '年龄': list(ages), '性别': list(genders)},
                    index=['index_{}'.format(i) for i in range(1,21)])

df_1.head()

Unnamed: 0,姓名,年龄,性别
index_1,**狗蛋,13,女
index_2,李**四__,47,男
index_3,__王二狗,47,女
index_4,小明,58,女
index_5,**狗蛋,79,男


#### 1. Series.str.contains()

In [530]:
df_1['姓名'].str.contains('.*张.*三.*')
df_1[df_1['姓名'].str.contains('.*张.*三.*')]

index_1     False
index_2     False
index_3     False
index_4     False
index_5     False
index_6     False
index_7      True
index_8     False
index_9     False
index_10    False
index_11    False
index_12    False
index_13    False
index_14    False
index_15    False
index_16    False
index_17    False
index_18    False
index_19     True
index_20    False
Name: 姓名, dtype: bool

Unnamed: 0,姓名,年龄,性别
index_7,张__三???,79,女
index_19,张__三???,79,男


##### 2. Series.str.match()

In [531]:
df_1['姓名'].str.match('.*张.*三.*')
df_1[df_1['姓名'].str.match('.*张.*三.*')]

index_1     False
index_2     False
index_3     False
index_4     False
index_5     False
index_6     False
index_7      True
index_8     False
index_9     False
index_10    False
index_11    False
index_12    False
index_13    False
index_14    False
index_15    False
index_16    False
index_17    False
index_18    False
index_19     True
index_20    False
Name: 姓名, dtype: bool

Unnamed: 0,姓名,年龄,性别
index_7,张__三???,79,女
index_19,张__三???,79,男


##### 3. Series.str.count()

In [537]:
df_1['姓名'].str.count('.*张.*三.*|.*狗.*蛋.*')
df_1[df_1['姓名'].str.count('.*张.*三.*|.*狗.*蛋.*')>= 1]

index_1     1
index_2     0
index_3     0
index_4     0
index_5     1
index_6     1
index_7     1
index_8     0
index_9     0
index_10    0
index_11    0
index_12    0
index_13    0
index_14    0
index_15    0
index_16    0
index_17    0
index_18    0
index_19    1
index_20    0
Name: 姓名, dtype: int64

Unnamed: 0,姓名,年龄,性别
index_1,**狗蛋,13,女
index_5,**狗蛋,79,男
index_6,**狗蛋,58,男
index_7,张__三???,79,女
index_19,张__三???,79,男


##### 4. Series.str.get(i)
获取df中的字符串中索引为i的元素

In [538]:
df_1['姓名'].str.get(0)

index_1     *
index_2     李
index_3     _
index_4     小
index_5     *
index_6     *
index_7     张
index_8     3
index_9     小
index_10    李
index_11    李
index_12    3
index_13    李
index_14    3
index_15    李
index_16    _
index_17    李
index_18    小
index_19    张
index_20    _
Name: 姓名, dtype: object

#### 5. Series.str.cat()

In [542]:
df_2 = df_1.copy()

df_2.head()

df_2['姓名'] = df_2['姓名'].str.replace('[_|*|?|\d]', '')

df_2.head()

df_2['姓名'].str.cat(df_1['性别'], sep='_')

Unnamed: 0,姓名,年龄,性别
index_1,**狗蛋,13,女
index_2,李**四__,47,男
index_3,__王二狗,47,女
index_4,小明,58,女
index_5,**狗蛋,79,男


Unnamed: 0,姓名,年龄,性别
index_1,狗蛋,13,女
index_2,李四,47,男
index_3,王二狗,47,女
index_4,小明,58,女
index_5,狗蛋,79,男


index_1      狗蛋_女
index_2      李四_男
index_3     王二狗_女
index_4      小明_女
index_5      狗蛋_男
index_6      狗蛋_男
index_7      张三_女
index_8     王多余_男
index_9      小明_女
index_10     李四_男
index_11     李四_女
index_12    王多余_男
index_13     李四_女
index_14    王多余_男
index_15     李四_女
index_16    王二狗_女
index_17     李四_男
index_18     小明_女
index_19     张三_男
index_20    王二狗_男
Name: 姓名, dtype: object

**更多函数请参考官方文档或者《利用python进行数据分析》P214-215**