In [1]:
# Load the diabetes dataset (source: https://www.kaggle.com/saurabh00007/diabetescsv/data)
import pandas as pd

df = pd.read_csv('diabetes.csv')

# The last column is the label; every column before it is a feature.
label_pos = -1
X = df.iloc[:, :label_pos]
Y = df.iloc[:, label_pos]

In [2]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [3]:
# Preview the first five labels.
Y.head(n=5)

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [4]:
# Split features and labels into train and test partitions.
from sklearn.model_selection import train_test_split

seed = 7         # random_state passed to the splitter
test_size = 0.2  # fraction of rows held out for testing

# train_test_split returns [X_train, X_test, y_train, y_test] for the two inputs.
split_parts = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)
train_x, test_x, train_y, test_y = split_parts

In [5]:
# Impute missing values.
# `strategy` may also be 'median' or 'mean'.
# Original author's note: filling in missing values improves accuracy; without
# this step accuracy was only 70-something percent.
# For the legacy `axis` argument see the Imputer docs:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# If axis=0, then impute along columns. If axis=1, then impute along rows.
# Missing values are normally filled per column (mean, median or most frequent),
# never per row.
#
# NOTE(review): the X.head() output shows 0 in SkinThickness/Insulin. If those
# zeros encode missing measurements, an imputer that targets NaN will not touch
# them -- confirm how missing values are represented in diabetes.csv.
try:
    # scikit-learn >= 0.20: SimpleImputer always works column-wise and treats
    # NaN as the missing marker by default.
    from sklearn.impute import SimpleImputer

    imputer = SimpleImputer(strategy='most_frequent')
except ImportError:
    # Older scikit-learn: the axis-aware Imputer (removed in 0.22) is all there is.
    from sklearn.preprocessing import Imputer

    imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

# Learn the fill values from the training split only, then reuse them on the
# test split. Refitting on the test split would fill it with its own statistics,
# leaking test information and making the two splits inconsistent.
train_x_imputed = imputer.fit_transform(train_x)
test_x_imputed = imputer.transform(test_x)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest baseline.
# The forest is randomized, so pin random_state to the same `seed` used for the
# train/test split; otherwise the printed accuracy changes on every run.
# NOTE(review): this model is fitted and scored on the raw train_x / test_x,
# not on the imputed arrays -- confirm that the un-imputed baseline is intended.
clf = RandomForestClassifier(random_state=seed)
clf.fit(train_x, train_y)
hyp = clf.predict(test_x)
print('Accuracy: %0.4f' % (accuracy_score(test_y, hyp)))

Accuracy: 0.7468


In [7]:
# XGBoost (very popular on Kaggle) must be installed first:
#   conda install -c conda-forge xgboost
# Win-64 Anaconda-3 users need this command instead:
#   conda install -c anaconda py-xgboost
from xgboost import XGBClassifier

# Train gradient-boosted trees on the imputed training features.
clf = XGBClassifier()
clf.fit(train_x_imputed, train_y)

# Score the predictions for the imputed test features against the true labels.
hyp = clf.predict(test_x_imputed)
xgb_accuracy = accuracy_score(test_y, hyp)
print('Accuracy: %0.4f' % xgb_accuracy)



Accuracy: 0.8117
