### 处理丢失数据
- 有两种丢失数据：
    - None
    - np.nan(NaN)

- 两种丢失数据的区别

In [43]:
import numpy as np
import pandas as pd
from pandas import DataFrame

- 为什么在数据分析中需要用到的是浮点类型的空而不是对象类型？
    - 数据分析中经常需要对原始数据做各种运算，如果原始数据中的空值是NaN的形式，则不会干扰或者中断运算。
    - NaN是浮点类型，可以参与运算（运算结果仍为NaN）。
    - None是对象类型，不可以参与运算（例如 None + 1 会抛出TypeError）。

In [44]:
# Show the Python type behind each kind of missing value: None vs. np.nan.
tuple(map(type, (None, np.nan)))

(NoneType, float)

- 在pandas中，数值类型的列如果遇到了None形式的空值，pandas会将其强转成NaN的形式。

In [45]:
# Build a 6x7 frame of random integers in [0, 100).
df = DataFrame(data=np.random.randint(0,100,size=(6,7)))
# Columns 1, 2 and 3 are about to receive missing values. Cast them to float
# up front: writing None/NaN into an integer column relies on silent upcasting,
# which newer pandas versions deprecate (PDEP-6). The resulting frame is the
# same as before: float columns 1-3, integer columns elsewhere.
df = df.astype({1: float, 2: float, 3: float})
# Both None and np.nan are stored as NaN in a float column.
df.iloc[2,3] = None
df.iloc[4,2] = np.nan
df.iloc[5,1] = None
df

Unnamed: 0,0,1,2,3,4,5,6
0,70,52.0,38.0,3.0,11,22,66
1,89,82.0,37.0,18.0,72,86,86
2,71,83.0,80.0,,63,20,38
3,15,73.0,80.0,96.0,97,65,49
4,44,39.0,,34.0,32,58,83
5,77,,33.0,47.0,5,74,21


### pandas处理空值操作
- isnull
- notnull
- any
- all
- dropna
- fillna

- 方式1：对空值进行过滤（删除空值所在的行数据）
    - 技术：isnull，notnull，any，all

In [46]:
# Element-wise mask: True wherever a cell of df is missing.
pd.isnull(df)

Unnamed: 0,0,1,2,3,4,5,6
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False
5,False,True,False,False,False,False,False


In [47]:
# any() collapses the boolean mask along the chosen axis: a row is flagged
# True as soon as it contains at least one True (i.e. one missing cell).
pd.isnull(df).any(axis="columns")

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool

In [48]:
# Invert the "row has a missing value" mask to keep only the complete rows.
df[~df.isnull().any(axis="columns")]

Unnamed: 0,0,1,2,3,4,5,6
0,70,52.0,38.0,3.0,11,22,66
1,89,82.0,37.0,18.0,72,86,86
3,15,73.0,80.0,96.0,97,65,49


In [49]:
# notnull is the complement of isnull: True wherever a cell holds a value.
pd.notnull(df)

Unnamed: 0,0,1,2,3,4,5,6
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,False,True,True,True
3,True,True,True,True,True,True,True
4,True,True,False,True,True,True,True
5,True,False,True,True,True,True,True


In [50]:
# all() collapses the boolean mask along the chosen axis: a row yields True
# only when every value in it is True; otherwise all() returns False.
df.notnull().all(axis="columns")

0     True
1     True
2    False
3     True
4    False
5    False
dtype: bool

In [51]:
# Keep the rows in which every cell is present.
df[df.notnull().all(axis="columns")]

Unnamed: 0,0,1,2,3,4,5,6
0,70,52.0,38.0,3.0,11,22,66
1,89,82.0,37.0,18.0,72,86,86
3,15,73.0,80.0,96.0,97,65,49


- 方式2：
    - dropna：可以直接将缺失的行或者列进行删除

In [52]:
# For the drop-family functions axis 0 / "index" means rows and axis 1 /
# "columns" means columns; here every row containing a missing value is dropped.
df.dropna(axis="index")

Unnamed: 0,0,1,2,3,4,5,6
0,70,52.0,38.0,3.0,11,22,66
1,89,82.0,37.0,18.0,72,86,86
3,15,73.0,80.0,96.0,97,65,49


- 对缺失值进行覆盖
    - fillna

In [53]:
# Replace every missing cell with the constant 666.
df.fillna(666)

Unnamed: 0,0,1,2,3,4,5,6
0,70,52.0,38.0,3.0,11,22,66
1,89,82.0,37.0,18.0,72,86,86
2,71,83.0,80.0,666.0,63,20,38
3,15,73.0,80.0,96.0,97,65,49
4,44,39.0,666.0,34.0,32,58,83
5,77,666.0,33.0,47.0,5,74,21


In [54]:
# Back-fill along each row (axis=1): a missing cell takes the value of the
# next column to its right. DataFrame.bfill replaces fillna(method='bfill'),
# whose `method` argument is deprecated in recent pandas releases.
df.bfill(axis=1)

Unnamed: 0,0,1,2,3,4,5,6
0,70.0,52.0,38.0,3.0,11.0,22.0,66.0
1,89.0,82.0,37.0,18.0,72.0,86.0,86.0
2,71.0,83.0,80.0,63.0,63.0,20.0,38.0
3,15.0,73.0,80.0,96.0,97.0,65.0,49.0
4,44.0,39.0,34.0,34.0,32.0,58.0,83.0
5,77.0,33.0,33.0,47.0,5.0,74.0,21.0


### 面试题
- 数据说明： 
    - 数据是1个冷库的温度数据，1-7对应7个温度采集设备，1分钟采集一次。

- 数据处理目标：
    - 用1-4对应的4个必须设备，通过建立冷库的温度场关系模型，预估出5-7对应的数据。
    - 最后每个冷库中仅需放置4个设备，取代放置7个设备。
    - f(1-4) --> y(5-7)

- 数据处理过程：
    - 1、原始数据中有丢帧现象，需要做预处理；
    - 2、matplotlib 绘图；
    - 3、建立逻辑回归模型。（注：预测目标是连续的温度值，属于回归问题，实际建模时应使用线性回归等回归模型。）

- 无标准答案，按个人理解操作即可，请把自己的操作过程以文字形式简单描述一下，谢谢配合。

- 测试数据为testData.xlsx


In [55]:
# Load the cold-storage temperature log from the Excel workbook and show it.
df = pd.read_excel(io='data/testData.xlsx')
df

  warn(msg)


Unnamed: 0,time,none,1,2,3,4,none1,5,6,7
0,2019-01-27 17:00:00,,-24.8,-18.2,-20.8,-18.8,,,,
1,2019-01-27 17:01:00,,-23.5,-18.8,-20.5,-19.8,,-15.2,-14.5,-16.0
2,2019-01-27 17:02:00,,-23.2,-19.2,,,,-13.0,,-14.0
3,2019-01-27 17:03:00,,-22.8,-19.2,-20.0,-20.5,,,-12.2,-9.8
4,2019-01-27 17:04:00,,-23.2,-18.5,-20.0,-18.8,,-10.2,-10.8,-8.8
...,...,...,...,...,...,...,...,...,...,...
1055,2019-01-28 10:35:00,,-26.2,-27.2,-28.8,-27.5,,-2.0,,-5.0
1056,2019-01-28 10:36:00,,-26.8,-27.5,-29.0,-27.8,,-2.2,,-5.0
1057,2019-01-28 10:37:00,,-27.2,-27.8,-29.0,-28.0,,-2.2,,-5.0
1058,2019-01-28 10:38:00,,-27.5,-27.0,-29.0,-28.0,,-3.5,-3.2,-5.8


In [56]:
# Remove the two spacer columns ('none', 'none1'); they appear empty in the
# rows displayed when the workbook was loaded.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['none','none1'], errors='ignore')

In [57]:
# Number of rows that contain at least one missing value.
df.isnull().any(axis="columns").sum()

133

In [58]:
# Option A: discard every row in which any reading is missing.
df.dropna(axis="index", how="any")

Unnamed: 0,time,1,2,3,4,5,6,7
1,2019-01-27 17:01:00,-23.5,-18.8,-20.5,-19.8,-15.2,-14.5,-16.0
4,2019-01-27 17:04:00,-23.2,-18.5,-20.0,-18.8,-10.2,-10.8,-8.8
7,2019-01-27 17:07:00,-24.8,-18.0,-17.5,-17.2,-14.2,-14.0,-12.5
10,2019-01-27 17:10:00,-24.5,-18.5,-16.0,-18.5,-17.5,-16.5,-17.2
15,2019-01-27 17:15:00,-23.5,-17.8,-15.0,-18.0,10.5,10.5,10.8
...,...,...,...,...,...,...,...,...
1051,2019-01-28 10:31:00,-24.0,-24.8,-27.8,-25.5,-2.0,-2.0,-5.8
1052,2019-01-28 10:32:00,-24.2,-25.5,-28.0,-26.0,-2.0,-2.0,-5.5
1053,2019-01-28 10:33:00,-25.0,-26.2,-28.2,-26.8,-2.0,-2.0,-5.2
1054,2019-01-28 10:34:00,-25.8,-26.8,-28.5,-27.0,-2.0,-2.2,-5.2


In [59]:
# Option B: fill instead of dropping. Forward-fill down each column so a gap
# takes the previous row's value, then back-fill so gaps in the first row
# (which have no predecessor) take the next available value.
# ffill()/bfill() replace fillna(method=...), whose `method` argument is
# deprecated in recent pandas releases.
df.ffill(axis=0).bfill(axis=0)

Unnamed: 0,time,1,2,3,4,5,6,7
0,2019-01-27 17:00:00,-24.8,-18.2,-20.8,-18.8,-15.2,-14.5,-16.0
1,2019-01-27 17:01:00,-23.5,-18.8,-20.5,-19.8,-15.2,-14.5,-16.0
2,2019-01-27 17:02:00,-23.2,-19.2,-20.5,-19.8,-13.0,-14.5,-14.0
3,2019-01-27 17:03:00,-22.8,-19.2,-20.0,-20.5,-13.0,-12.2,-9.8
4,2019-01-27 17:04:00,-23.2,-18.5,-20.0,-18.8,-10.2,-10.8,-8.8
...,...,...,...,...,...,...,...,...
1055,2019-01-28 10:35:00,-26.2,-27.2,-28.8,-27.5,-2.0,-2.2,-5.0
1056,2019-01-28 10:36:00,-26.8,-27.5,-29.0,-27.8,-2.2,-2.2,-5.0
1057,2019-01-28 10:37:00,-27.2,-27.8,-29.0,-28.0,-2.2,-2.2,-5.0
1058,2019-01-28 10:38:00,-27.5,-27.0,-29.0,-28.0,-3.5,-3.2,-5.8


- 使用列的均值填充缺失值

In [60]:
# Build a fresh 6x7 frame of random integers in [0, 100).
df = DataFrame(data=np.random.randint(0,100,size=(6,7)))
# Columns 1, 2 and 3 are about to receive missing values. Cast them to float
# up front: writing None/NaN into an integer column relies on silent upcasting,
# which newer pandas versions deprecate (PDEP-6). The resulting frame is the
# same as before: float columns 1-3, integer columns elsewhere.
df = df.astype({1: float, 2: float, 3: float})
# Both None and np.nan are stored as NaN in a float column.
df.iloc[2,3] = None
df.iloc[4,2] = np.nan
df.iloc[5,1] = None
df

Unnamed: 0,0,1,2,3,4,5,6
0,19,18.0,83.0,66.0,12,33,1
1,46,44.0,52.0,84.0,6,40,54
2,92,15.0,39.0,,84,25,36
3,72,17.0,48.0,89.0,73,27,23
4,22,6.0,,3.0,52,22,18
5,47,,20.0,98.0,84,96,81


In [61]:
# Before filling: count the rows that contain at least one missing value
# (the frame built above has three such cells, one per row).
df.isnull().any(axis="columns").sum()

3

In [62]:
# Fill each column's missing cells with that column's mean.
for col in df.columns:
    column = df[col]
    if column.isnull().any():
        # This column contains missing values; mean() skips NaN by default.
        mean_value = column.mean()
        # Assign the filled column back to the frame. Calling
        # df[col].fillna(..., inplace=True) would act on a temporary column
        # selection: pandas 2.x warns about it, and under Copy-on-Write the
        # original frame is left unchanged.
        df[col] = column.fillna(value=mean_value)

In [63]:
# Display the frame again to check that the previously missing cells now hold their column means.
df

Unnamed: 0,0,1,2,3,4,5,6
0,19,18.0,83.0,66.0,12,33,1
1,46,44.0,52.0,84.0,6,40,54
2,92,15.0,39.0,68.0,84,25,36
3,72,17.0,48.0,89.0,73,27,23
4,22,6.0,48.4,3.0,52,22,18
5,47,20.0,20.0,98.0,84,96,81


### 处理重复数据（行）

In [64]:
# Build a 7x4 frame of random integers in [0, 100), then overwrite rows
# 1, 3 and 4 with the same values so the frame contains duplicated rows.
df = DataFrame(data=np.random.randint(0, 100, size=(7, 4)))
for row_pos in (1, 3, 4):
    df.iloc[row_pos] = [1, 1, 1, 1]
df

Unnamed: 0,0,1,2,3
0,32,81,76,91
1,1,1,1,1
2,37,42,10,44
3,1,1,1,1
4,1,1,1,1
5,15,58,59,85
6,71,59,72,60


- 基于duplicated和drop进行删除重复行操作

In [65]:
# Flag rows that repeat an earlier row; the first occurrence stays False.
df.duplicated(keep="first")

0    False
1    False
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [66]:
# Collect the index labels of the rows flagged as duplicates.
drop_index = df.index[df.duplicated()]
drop_index

Int64Index([3, 4], dtype='int64')

In [67]:
# Drop the duplicated rows by label; returns a new frame, df itself is untouched.
df.drop(index=drop_index)

Unnamed: 0,0,1,2,3
0,32,81,76,91
1,1,1,1,1
2,37,42,10,44
5,15,58,59,85
6,71,59,72,60


- 简便方式

In [68]:
# One-step equivalent of duplicated() + drop(): keep the first occurrence of each row.
df.drop_duplicates(keep="first")

Unnamed: 0,0,1,2,3
0,32,81,76,91
1,1,1,1,1
2,37,42,10,44
5,15,58,59,85
6,71,59,72,60


### 处理异常数据
- 自定义一个1000行3列（A，B，C）取值范围为0-1的数据源，然后将C列中的值大于其两倍标准差的异常值进行清洗

In [69]:
# 1000 rows x 3 columns (A, B, C) of uniform random floats in [0, 1).
samples = np.random.random(size=(1000, 3))
df = DataFrame(data=samples, columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,0.606702,0.965904,0.073289
1,0.267004,0.297271,0.103582
2,0.730568,0.354576,0.363972
3,0.261358,0.246406,0.244674
4,0.530297,0.729153,0.716680
...,...,...,...
995,0.932048,0.164134,0.116227
996,0.742805,0.064808,0.412816
997,0.896580,0.095089,0.001909
998,0.748352,0.599906,0.460407


In [70]:
# Outlier threshold for this exercise: twice the standard deviation of column C.
std_twice = 2 * df.loc[:, 'C'].std()
std_twice

0.5882374354034698

In [71]:
# Boolean mask: True for rows whose C value exceeds the threshold.
df['C'].gt(std_twice)

0      False
1      False
2      False
3      False
4       True
       ...  
995    False
996    False
997    False
998    False
999    False
Name: C, Length: 1000, dtype: bool

In [72]:
# Rows treated as outliers: C greater than twice its standard deviation.
df[df['C'].gt(std_twice)]

Unnamed: 0,A,B,C
4,0.530297,0.729153,0.716680
9,0.488935,0.315732,0.861962
11,0.072647,0.957808,0.765391
12,0.035336,0.203981,0.940993
13,0.082529,0.027153,0.949178
...,...,...,...
989,0.549878,0.327064,0.868919
990,0.017534,0.798372,0.708233
991,0.808820,0.282274,0.914877
992,0.143062,0.245427,0.903166


In [73]:
# Collect the labels of the outlier rows, then remove them from df in place.
drop_index = df.index[df['C'].gt(std_twice)]
df.drop(index=drop_index, inplace=True)

In [74]:
# Display the frame after the rows whose C value exceeded std_twice were dropped.
df

Unnamed: 0,A,B,C
0,0.606702,0.965904,0.073289
1,0.267004,0.297271,0.103582
2,0.730568,0.354576,0.363972
3,0.261358,0.246406,0.244674
5,0.281030,0.897674,0.362563
...,...,...,...
995,0.932048,0.164134,0.116227
996,0.742805,0.064808,0.412816
997,0.896580,0.095089,0.001909
998,0.748352,0.599906,0.460407
