# 数据清洗与准备

In [14]:
import pandas as pd
import numpy as np
import re

## 1. 缺失值处理

![微信图片_20190509094048.png](https://i.loli.net/2019/05/09/5cd385377bcd7.png)

**Series**

In [2]:
ser = pd.Series([1,np.nan,3.5,np.nan,7])
ser

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [3]:
# isnull()函数
ser.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [7]:
#过滤缺失值,两种等价方式：dropna()和布尔索引
ser1 = ser.dropna()
ser1

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
ser2 = ser[ser.notnull()]
ser2

0    1.0
2    3.5
4    7.0
dtype: float64

**DataFrame**

In [12]:
# Build a 7x3 block of standard-normal draws, then blank out selected cells
# so the missing-value examples that follow have something to operate on.
values = np.random.randn(7, 3)
values[:4, 1] = np.nan   # first four rows of column 1
values[:2, 2] = np.nan   # first two rows of column 2
values[1, 0] = np.nan    # together with the two lines above, row 1 is all NaN
df = pd.DataFrame(values)
df

Unnamed: 0,0,1,2
0,-0.576749,,
1,,,
2,0.85103,,-0.166812
3,-0.381627,,-0.284251
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


**删除缺失值，dropna()函数**

- dropna()函数默认会删除所有包含缺失值的行

In [14]:
df.dropna()  

Unnamed: 0,0,1,2
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


- 通过参数 how = 'all'删除所有值均为NA的行

- 通过 axis 参数 可以控制删除列数据，原理同行处理

In [17]:
df.dropna(how = 'all')  

Unnamed: 0,0,1,2
0,-0.576749,,
2,0.85103,,-0.166812
3,-0.381627,,-0.284251
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


- 通过 thresh 参数指定保留一行所需的最少非缺失值个数（例如 thresh = 2 表示至少含有 2 个非 NA 值的行才会被保留）

In [22]:
df.dropna(thresh = 2)

Unnamed: 0,0,1,2
2,0.85103,,-0.166812
3,-0.381627,,-0.284251
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


**补全缺失值，通过 fillna() 函数**

fillna的参数

![微信图片_20190509094057.png](https://i.loli.net/2019/05/09/5cd3853785663.png)

In [23]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.576749,0.0,0.0
1,0.0,0.0,0.0
2,0.85103,0.0,-0.166812
3,-0.381627,0.0,-0.284251
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


In [25]:
df.fillna({0:6,1:5,2:2})

Unnamed: 0,0,1,2
0,-0.576749,5.0,2.0
1,6.0,5.0,2.0
2,0.85103,5.0,-0.166812
3,-0.381627,5.0,-0.284251
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


In [26]:
# method 参数 可传入 'ffill' - 向前填充，'bfill' - 后向填充

In [33]:
# Back-fill: every NaN takes the next valid value found further down its column.
# DataFrame.bfill() is the direct spelling of fillna(method='bfill'), whose
# `method` argument is deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,0,1,2
0,-0.576749,0.124854,-0.166812
1,0.85103,0.124854,-0.166812
2,0.85103,0.124854,-0.166812
3,-0.381627,0.124854,-0.284251
4,0.352118,0.124854,0.161858
5,0.514492,-0.122187,0.175378
6,-0.177364,-0.332114,1.055039


## 2.数据转换

### 2.1 删除重复值

- df.duplicated()
- df.drop_duplicates()

In [35]:
# Two key columns; the final row repeats the row before it, giving one exact duplicate.
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [36]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [37]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [39]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [40]:
data.drop_duplicates(['k1','k2'])

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


### 2.2 替代值

- df.replace()


In [43]:
ser = pd.Series([1, -99, 3, -99,-100, 3])

In [44]:
ser.replace(-99, np.nan)

0      1.0
1      NaN
2      3.0
3      NaN
4   -100.0
5      3.0
dtype: float64

In [45]:
ser.replace([-99,-100],[6,8])

0    1
1    6
2    3
3    6
4    8
5    3
dtype: int64

In [46]:
ser.replace({-99:np.nan,-100:0})

0    1.0
1    NaN
2    3.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 2.3 离散化和分箱

- pd.cut()

In [61]:
# Continuous sample values to be discretised.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges; each consecutive pair of edges delimits one interval.
bins = [18, 25, 35, 60, 100]
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [62]:
cats.categories #类别区间

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [63]:
cats.codes # 样本所属的类别的区间

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [64]:
# Number of samples that fall in each interval. Series.value_counts() replaces the
# top-level pd.value_counts(), which recent pandas releases deprecate.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [67]:
# Human-readable labels, one per bin, in bin order.
group_names = ['young', 'adult', 'med', 'senior']
# right=False closes each bin on its left edge instead of its right edge;
# labels replaces the interval notation with the names above.
left_closed_edges = [18, 26, 36, 61, 100]
pd.cut(ages, left_closed_edges, right=False, labels=group_names)

[young, young, young, adult, young, ..., adult, senior, med, med, adult]
Length: 12
Categories (4, object): [young < adult < med < senior]

In [68]:
#给cut传递整数，均匀切割划分区间
data = np.random.rand(20)
pd.cut(data,4,precision = 2)

[(0.72, 0.94], (0.28, 0.5], (0.72, 0.94], (0.5, 0.72], (0.72, 0.94], ..., (0.28, 0.5], (0.5, 0.72], (0.5, 0.72], (0.72, 0.94], (0.28, 0.5]]
Length: 20
Categories (4, interval[float64]): [(0.058, 0.28] < (0.28, 0.5] < (0.5, 0.72] < (0.72, 0.94]]

- qcut() , 基于样本分位点进行分箱，获得样本数量大致相等的箱（各箱的区间长度通常并不相等）

In [77]:
data = np.random.randn(1000) #正态分布
cats = pd.qcut(data,4)
cats

[(0.675, 2.893], (-0.00855, 0.675], (-0.667, -0.00855], (-0.00855, 0.675], (-0.00855, 0.675], ..., (-3.659, -0.667], (-0.00855, 0.675], (-0.00855, 0.675], (0.675, 2.893], (0.675, 2.893]]
Length: 1000
Categories (4, interval[float64]): [(-3.659, -0.667] < (-0.667, -0.00855] < (-0.00855, 0.675] < (0.675, 2.893]]

In [78]:
# Sample count per quantile bin. Series.value_counts() replaces the top-level
# pd.value_counts(), which recent pandas releases deprecate.
pd.Series(cats).value_counts()

(0.675, 2.893]        250
(-0.00855, 0.675]     250
(-0.667, -0.00855]    250
(-3.659, -0.667]      250
dtype: int64

### 2.4 计算指标、虚拟变量

In [4]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data':range(2,8)})
df

Unnamed: 0,data,key
0,2,b
1,3,b
2,4,a
3,5,c
4,6,a
5,7,b


In [8]:
dumies = pd.get_dummies(df['key'])
dumies

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [10]:
df[['data']].join(dumies)

Unnamed: 0,data,a,b,c
0,2,0,1,0
1,3,0,1,0
2,4,1,0,0
3,5,0,0,1
4,6,1,0,0
5,7,0,1,0


## 3.字符串操作

### 3.1字符串对象方法

![微信图片_20190509143633.png](https://i.loli.net/2019/05/09/5cd3ca8579ef8.png)

![微信图片_20190509143637.png](https://i.loli.net/2019/05/09/5cd3ca8577a29.png)

字符串可以使用 **split** 方法拆分成多块，得到一个字符串列表

In [4]:
# Break a comma-separated string into its individual fields.
val = 'a,b,do'
separator = ','
pieces = val.split(separator)
pieces

['a', 'b', 'do']

**split** 常常和 **strip** 一起使用，用于清除空格/换行符

**join**方法可以将字符串以某种分隔符为间隔串联起来

In [5]:
':'.join(pieces)

'a:b:do'

### 3.2 正则表达式

In [13]:
(1)                 
\d匹配一个数字   eg.3,2,5
\s匹配一个空白字符  eg. ' ','\t'
\w匹配一个字母、数字或下划线
. 可以匹配任意字符

(2)匹配变长的字符
*表示可以匹配任意个字符（包括0个）
+表示可以匹配至少一个字符
?表示可以匹配零个或一个字符

(3)
{m,n} 可以匹配m到n个字符（至少m个，至多n个）
[]表示一个范围
[0-9a-zA-Z\_]可以匹配一个数字、字母或者下划线；
[0-9a-zA-Z\_]+可以匹配至少由一个数字、字母或者下划线组成的字符串
        比如'a100'，'0_Z'，'Py3000'等等；
[a-zA-Z\_][0-9a-zA-Z\_]*可以匹配由字母或下划线开头，后接任意个由一个数字、字母或者下划线组成的字符串，也就是Python合法的变量；
[a-zA-Z\_][0-9a-zA-Z\_]{0,19}更精确地限制了变量的长度是1-20个字符（前面1个字符+后面最多19个字符）

(4)
A|B可以匹配A或B，所以(P|p)ython可以匹配'Python'或者'python'
^表示行的开头,^\d表示必须以数字开头
$表示行的结束，\d$表示必须以数字结束


SyntaxError: unexpected character after line continuation character (<ipython-input-13-131861daedc0>, line 2)

![微信图片_20190509145115.png](https://i.loli.net/2019/05/09/5cd3cdee6ba65.png)

re模块的三个主题：匹配、替代、拆分。

匹配

- match

In [30]:
re.match(r'^\d{3}\-\d{3,8}$', '010-12345')

<_sre.SRE_Match object; span=(0, 9), match='010-12345'>

切分

- split

In [31]:
'a b   c  e'.split(' ')

['a', 'b', '', '', 'c', '', 'e']

In [32]:
re.split(r'\s+','a b   c  e')

['a', 'b', 'c', 'e']

查找、分组

- findall
- search
- groups

In [37]:
# Sample input: one "name address" pair per line.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# Address shape: local part, "@", domain, ".", then a 2-4 letter suffix.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# The character classes list upper-case letters only, so compile case-insensitively.
regex = re.compile(pattern, re.IGNORECASE)

In [39]:
# Collect every address in the text via the pre-compiled pattern's own method.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [40]:
# Locate only the first address in the text.
regex.search(text)

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

除了简单地判断是否匹配之外，正则表达式还有提取子串的强大功能。用()表示的就是要提取的分组（Group）

In [52]:
# Same address pattern, with parentheses splitting it into three capture groups:
# the part before "@", the domain name, and the domain suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

In [51]:
# Match a single address and pull out its capture groups as a tuple.
match = regex.match('dave@google.com')
group = match.groups()
print(group)
# Then show each captured piece on its own line.
for part in group:
    print(part)

('dave', 'google', 'com')
dave
google
com


In [46]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]