In [2]:
import os
from urllib.request import urlretrieve
import numpy as np
import pandas as pd

%matplotlib inline

In [8]:
# Base URL of the remote directory that hosts the dataset file.
# NOTE(review): confirm this path is still served before relying on a fresh download.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/'


def maybe_download(filename, base_url=None):
    """Return the local CSV path for ``filename``, downloading it first if needed.

    The local copy is named like ``filename`` with its extension replaced by
    ``.csv``. If that file already exists, nothing is downloaded.

    Parameters
    ----------
    filename : str
        Name of the remote file; it is appended to the base URL.
    base_url : str, optional
        URL prefix to download from. Defaults to the module-level ``url``.

    Returns
    -------
    str
        Path of the local ``.csv`` file.

    Raises
    ------
    OSError
        Errors raised by ``urlretrieve`` (including ``urllib.error.URLError``)
        or by the final rename are propagated unchanged.
    """
    file_csv = os.path.splitext(filename)[0] + '.csv'
    if not os.path.exists(file_csv):
        source = (url if base_url is None else base_url) + filename
        # Download under a temporary name and move it into place only once the
        # transfer has finished. This way a partially written file can never
        # sit at the cache path and be mistaken for a complete copy.
        partial = file_csv + '.part'
        try:
            urlretrieve(source, partial)
            os.replace(partial, file_csv)
        finally:
            # Only true when the download or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)
    return file_csv
    
# Fetch the dataset (or reuse the local CSV copy) and remember its path.
diabetes_csv = maybe_download('pima-indians-diabetes.data')
# header=None: the first line is read as data, so the columns get integer
# labels (0, 1, 2, ...) instead of names.
diabetes = pd.read_csv(diabetes_csv, header=None)

In [23]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): the saved output was produced after the zero -> NaN replacement
# cell had already run (execution count 23 vs. 21), which is why the counts for
# columns 1-5 are below 768. Run top to bottom, this cell executes before that
# replacement, so the output will differ.
diabetes.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [24]:
# Preview the first 20 rows.
# NOTE(review): the saved output already contains NaN because this cell ran
# (execution count 24) after the zero -> NaN replacement cell (21). Run top to
# bottom, those cells would still show 0 here.
diabetes.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


### 统计每一列中丢失值的数量

In [11]:
# Count, for each of columns 1-5, how many rows hold exactly 0.
print(diabetes.loc[:, [1, 2, 3, 4, 5]].eq(0).sum())

1      5
2     35
3    227
4    374
5     11
dtype: int64


从上面的统计可以看出，第1、2、5列只有少量零值，而第3列约有三成、第4列接近一半的行是零值。对这些列来说，0 并不是有效的测量值，因此这里把 0 视为丢失值。
在 Pandas、NumPy 和 scikit-learn 中，我们通常将丢失值标记为 NaN；这样在 sum、count 等操作中，NaN 值会被忽略。
通过pandas dataframe的`replace()`函数，可以将丢失值标记为NaN。
标记了丢失值后，可以使用`isnull()`函数将数据中所有的NaN值标记为真，并获取每列丢失值的计数。

In [21]:
# This notebook treats a 0 in columns 1-5 as "value missing": replace those
# zeros with NaN so pandas handles them as missing data.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 and
# raises AttributeError there.
diabetes[[1, 2, 3, 4, 5]] = diabetes[[1, 2, 3, 4, 5]].replace(0, np.nan)
# Number of missing (NaN) entries per column.
diabetes.isnull().sum()

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64

In [22]:
# Preview the first 20 rows after the zero -> NaN replacement; the former zeros
# in columns 1-5 now show up as NaN.
diabetes.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Take the frame as a NumPy array, then slice it apart:
# columns 0-7 are the model inputs, column 8 is the output.
values = diabetes.values
X, y = values[:, :8], values[:, 8]
